Commit cba742c0 authored by chloebt's avatar chloebt

add module for TP5

parent 56974eab
import time, io
import pandas as pd
import numpy as np
# torch and torch modules to deal with text data
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
# Load the weight matrix: modify the code below to check the coverage of the
# pre-trained embeddings
def load_weights_matrix( train, vectors, emb_dim=300 ):
    matrix_len = len(train.vocab)
    weights_matrix = np.zeros((matrix_len, emb_dim))
    for i in range(matrix_len):
        word = train.vocab.lookup_token(i)
        try:
            # keep the pre-trained vector when the word is covered
            weights_matrix[i] = vectors[word]
        except KeyError:
            # unknown word: initialize its vector randomly
            weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim,))
    weights_matrix = torch.from_numpy(weights_matrix).to(torch.float32)
    return weights_matrix
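
# A minimal sketch of the coverage check asked for above: count how many
# vocabulary words appear in the pre-trained vectors (the dict returned by
# load_vectors below). The helper name `embedding_coverage` is ours, not part
# of the original module.
def embedding_coverage( train, vectors ):
    found = sum(1 for i in range(len(train.vocab))
                if train.vocab.lookup_token(i) in vectors)
    return found / len(train.vocab)

# The weight matrix is typically plugged into an embedding layer, e.g.:
#   weights = load_weights_matrix( train_set, vectors )
#   embedding = nn.Embedding.from_pretrained( weights, freeze=False )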
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    print("Originally we have:", n, "tokens, and vectors of", d, "dimensions") # here in fact only 10000 words
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = [float(t) for t in tokens[1:]]
    return data
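
# Usage sketch: the function expects the fastText .vec text format (a header
# line "n d", then one word and its vector per line). The file name below is
# an example, not fixed by the module:
#   vectors = load_vectors( "wiki.fr.align.vec" )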
# Here we create a custom Dataset class that inherits from the Dataset class in PyTorch
# A custom Dataset class must implement three functions: __init__, __len__, and __getitem__
class Dataset(torch.utils.data.Dataset):
    def __init__(self, tsv_file, vocab=None ):
        """ (REQUIRED) Here we save the location of our input file,
        load the data, i.e. retrieve the list of texts and associated labels,
        build the vocabulary if none is given,
        and define the pipelines used to prepare the data """
        self.tsv_file = tsv_file
        self.data, self.label_list = self.load_data( )
        # split the string on whitespace; we could not make the French tokenizer work
        self.tokenizer = get_tokenizer( None )
        self.vocab = vocab
        if not vocab:
            self.build_vocab()
        # pipelines for text and label
        self.text_pipeline = lambda x: self.vocab(self.tokenizer(x)) # return a list of indices from a text
        self.label_pipeline = lambda x: int(x) # simple cast of the label to an integer
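
    # e.g. (indices are illustrative): text_pipeline("great movie") -> [53, 7]
    #      and label_pipeline("1") -> 1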
    def load_data( self ):
        """ Read a tsv file and return the list of texts and associated labels """
        data = pd.read_csv( self.tsv_file, header=0, delimiter="\t", quoting=3)
        instances = []
        label_list = []
        for i in data.index:
            label_list.append( data["sentiment"][i] )
            instances.append( data["review"][i] )
        return instances, label_list
    def build_vocab(self):
        """ Build the vocabulary, i.e. retrieve the list of unique tokens
        appearing in the corpus (= training set). We also add a specific index
        corresponding to unknown words. """
        self.vocab = build_vocab_from_iterator(self.yield_tokens(), specials=["<unk>"])
        self.vocab.set_default_index(self.vocab["<unk>"])

    def yield_tokens(self):
        """ Iterator on tokens """
        for text in self.data:
            yield self.tokenizer(text)
    def __len__(self):
        """ (REQUIRED) Return the length of the data,
        i.e. the total number of instances """
        return len(self.data)

    def __getitem__(self, index):
        """ (REQUIRED) Return a specific instance in a format that can be
        processed by PyTorch, i.e. torch tensors """
        return (
            torch.tensor( self.text_pipeline( self.data[index] ), dtype=torch.int64),
            torch.tensor( self.label_pipeline( self.label_list[index] ), dtype=torch.int64)
        )
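
# A minimal usage sketch, assuming tsv files with the "sentiment" and "review"
# columns read by load_data above (the file names and batch size are our
# choices, not fixed by the module). Since reviews have different lengths,
# the DataLoader needs a collate function that pads each batch.
if __name__ == "__main__":
    from torch.nn.utils.rnn import pad_sequence

    def collate_batch(batch):
        texts, labels = zip(*batch)
        # pad every text in the batch to the length of the longest one;
        # note that index 0 is "<unk>" here, a dedicated "<pad>" special
        # would be cleaner
        texts = pad_sequence(texts, batch_first=True, padding_value=0)
        return texts, torch.stack(labels)

    train_set = Dataset( "train.tsv" )                     # builds its own vocab
    dev_set = Dataset( "dev.tsv", vocab=train_set.vocab )  # reuses the train vocab
    train_loader = DataLoader(train_set, batch_size=2, shuffle=True,
                              collate_fn=collate_batch)
    for texts, labels in train_loader:
        print(texts.shape, labels.shape)
        break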