Commit cba742c0 authored by chloebt's avatar chloebt

add module for TP5

parent 56974eab
import time, io
import pandas as pd
import numpy as np
# torch and torch modules to deal with text data
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
# Load the weight matrix: modify the code below to check the coverage of the
# pre-trained embeddings
def load_weights_matrix( train, vectors, emb_dim=300 ):
    matrix_len = len(train.vocab)
    weights_matrix = np.zeros((matrix_len, emb_dim))
    for i in range(matrix_len):
        word = train.vocab.lookup_token(i)
        try:
            # keep the pre-trained vector when the word is covered
            weights_matrix[i] = vectors[word]
        except KeyError:
            # unknown word: initialize its vector randomly
            weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim,))
    weights_matrix = torch.from_numpy(weights_matrix).to(torch.float32)
    return weights_matrix
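
# A minimal sketch of the coverage check asked for above: count how many
# vocabulary words appear in the pre-trained vectors (the dict returned by
# load_vectors below). The helper name `embedding_coverage` is ours, not part
# of the original module.
def embedding_coverage( train, vectors ):
    found = sum(1 for i in range(len(train.vocab))
                if train.vocab.lookup_token(i) in vectors)
    return found / len(train.vocab)

# The weight matrix is typically plugged into an embedding layer, e.g.:
#   weights = load_weights_matrix( train_set, vectors )
#   embedding = nn.Embedding.from_pretrained( weights, freeze=False )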
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    print("Originally we have:", n, "tokens, and vectors of", d, "dimensions") # here in fact only 10000 words
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = [float(t) for t in tokens[1:]]
    return data
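
# Usage sketch: the function expects the fastText .vec text format (a header
# line "n d", then one word and its vector per line). The file name below is
# an example, not fixed by the module:
#   vectors = load_vectors( "wiki.fr.align.vec" )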
# Here we create a custom Dataset class that inherits from the Dataset class in PyTorch
# A custom Dataset class must implement three functions: __init__, __len__, and __getitem__
class Dataset(torch.utils.data.Dataset):
    def __init__(self, tsv_file, vocab=None ):
        """ (REQUIRED) Here we save the location of our input file,
        load the data, i.e. retrieve the list of texts and associated labels,
        build the vocabulary if none is given,
        and define the pipelines used to prepare the data """
        self.tsv_file = tsv_file
        self.data, self.label_list = self.load_data( )
        # split the string on whitespace; we could not make the French tokenizer work
        self.tokenizer = get_tokenizer( None )
        self.vocab = vocab
        if not vocab:
            self.build_vocab()
        # pipelines for text and label
        self.text_pipeline = lambda x: self.vocab(self.tokenizer(x)) # return a list of indices from a text
        self.label_pipeline = lambda x: int(x) # simple cast of the label to an integer
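
    # e.g. (indices are illustrative): text_pipeline("great movie") -> [53, 7]
    #      and label_pipeline("1") -> 1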
    def load_data( self ):
        """ Read a tsv file and return the list of texts and associated labels """
        data = pd.read_csv( self.tsv_file, header=0, delimiter="\t", quoting=3)
        instances = []
        label_list = []
        for i in data.index:
            label_list.append( data["sentiment"][i] )
            instances.append( data["review"][i] )
        return instances, label_list
    def build_vocab(self):
        """ Build the vocabulary, i.e. retrieve the list of unique tokens
        appearing in the corpus (= training set). We also add a specific index
        corresponding to unknown words. """
        self.vocab = build_vocab_from_iterator(self.yield_tokens(), specials=["<unk>"])
        self.vocab.set_default_index(self.vocab["<unk>"])

    def yield_tokens(self):
        """ Iterator on tokens """
        for text in self.data:
            yield self.tokenizer(text)
    def __len__(self):
        """ (REQUIRED) Return the length of the data,
        i.e. the total number of instances """
        return len(self.data)

    def __getitem__(self, index):
        """ (REQUIRED) Return a specific instance in a format that can be
        processed by PyTorch, i.e. torch tensors """
        return (
            torch.tensor( self.text_pipeline( self.data[index] ), dtype=torch.int64),
            torch.tensor( self.label_pipeline( self.label_list[index] ), dtype=torch.int64)
        )
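
# A minimal usage sketch, assuming tsv files with the "sentiment" and "review"
# columns read by load_data above (the file names and batch size are our
# choices, not fixed by the module). Since reviews have different lengths,
# the DataLoader needs a collate function that pads each batch.
if __name__ == "__main__":
    from torch.nn.utils.rnn import pad_sequence

    def collate_batch(batch):
        texts, labels = zip(*batch)
        # pad every text in the batch to the length of the longest one;
        # note that index 0 is "<unk>" here, a dedicated "<pad>" special
        # would be cleaner
        texts = pad_sequence(texts, batch_first=True, padding_value=0)
        return texts, torch.stack(labels)

    train_set = Dataset( "train.tsv" )                     # builds its own vocab
    dev_set = Dataset( "dev.tsv", vocab=train_set.vocab )  # reuses the train vocab
    train_loader = DataLoader(train_set, batch_size=2, shuffle=True,
                              collate_fn=collate_batch)
    for texts, labels in train_loader:
        print(texts.shape, labels.shape)
        break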