diff --git a/data/reader_pytorch_tp5.py b/data/reader_pytorch_tp5.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa541f83a959d9be82ce5cf494707dfea7d5f24e
--- /dev/null
+++ b/data/reader_pytorch_tp5.py
@@ -0,0 +1,94 @@
+import time, io
+import pandas as pd
+import numpy as np
+# torch and torch modules to deal with text data
+import torch
+import torch.nn as nn
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator
+from torch.utils.data import DataLoader
+
+
+
+
+# Load the weight matrix: modify the code below to check the coverage of the
+# pre-trained embeddings
+def load_weights_matrix( train, vectors, emb_dim=300 ):
+    matrix_len = len(train.vocab)
+    weights_matrix = np.zeros((matrix_len, emb_dim))
+
+    for i in range(0, len(train.vocab)):
+        word = train.vocab.lookup_token(i)
+        try:
+            weights_matrix[i] = vectors[word]
+        except KeyError:
+            # word not covered by the pre-trained vectors: random initialization
+            weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
+    weights_matrix = torch.from_numpy(weights_matrix).to( torch.float32 )
+    return weights_matrix
+
+def load_vectors(fname):
+    """ Read pre-trained word vectors from a text file (word followed by its values). """
+    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
+    n, d = map(int, fin.readline().split())
+    print("Originally we have:", n, 'tokens, and vectors of', d, 'dimensions') # here in fact only 10000 words
+    data = {}
+    for line in fin:
+        tokens = line.rstrip().split(' ')
+        data[tokens[0]] = [float(t) for t in tokens[1:]]
+    return data
+
+
+# Here we create a custom Dataset class that inherits from the Dataset class in PyTorch.
+# A custom Dataset class must implement three functions: __init__, __len__, and __getitem__.
+class Dataset(torch.utils.data.Dataset):
+
+    def __init__(self, tsv_file, vocab=None ):
+        """ (REQUIRED) Here we save the location of our input file,
+        load the data, i.e. retrieve the list of texts and associated labels,
+        build the vocabulary if none is given,
+        and define the pipelines used to prepare the data. """
+        self.tsv_file = tsv_file
+        self.data, self.label_list = self.load_data( )
+        # splits the sentence on whitespace; could not get the French tokenizer to work
+        self.tokenizer = get_tokenizer( None )
+        self.vocab = vocab
+        if not vocab:
+            self.build_vocab()
+        # pipelines for text and label
+        self.text_pipeline = lambda x: self.vocab(self.tokenizer(x)) # returns a list of indices from a text
+        self.label_pipeline = lambda x: int(x) # labels are already integers; simple cast
+
+    def load_data( self ):
+        """ Read a TSV file and return the list of texts and associated labels. """
+        data = pd.read_csv( self.tsv_file, header=0, delimiter="\t", quoting=3)
+        instances = []
+        label_list = []
+        for i in data.index:
+            label_list.append( data["sentiment"][i] )
+            instances.append( data["review"][i] )
+        return instances, label_list
+
+    def build_vocab(self):
+        """ Build the vocabulary, i.e. retrieve the list of unique tokens
+        appearing in the corpus (= training set). We also add a specific index
+        corresponding to unknown words. """
+        self.vocab = build_vocab_from_iterator(self.yield_tokens(), specials=["<unk>"])
+        self.vocab.set_default_index(self.vocab["<unk>"])
+
+    def yield_tokens(self):
+        """ Iterator over the tokens of each text. """
+        for text in self.data:
+            yield self.tokenizer(text)
+
+    def __len__(self):
+        """ (REQUIRED) Return the length of the data,
+        i.e. the total number of instances. """
+        return len(self.data)
+
+    def __getitem__(self, index):
+        """ (REQUIRED) Return a specific instance in a format that can be
+        processed by PyTorch, i.e. torch tensors. """
+        return (
+            tuple( [torch.tensor(self.text_pipeline( self.data[index] ), dtype=torch.int64),
+                torch.tensor( self.label_pipeline( self.label_list[index] ), dtype=torch.int64) ] )
+            )
+