diff --git a/classifier_features_pytorch.py b/classifier_features_pytorch.py
index e0847d1f4634560480b8ef0e121051dc13cd67b6..54cd8487f507afb45c4bfd894515dfa970baae7e 100644
--- a/classifier_features_pytorch.py
+++ b/classifier_features_pytorch.py
@@ -3,12 +3,7 @@
 import torch
 import numpy as np
-from transformers import (
-    AutoModel,
-    AutoTokenizer,
-    get_linear_schedule_with_warmup,
-    set_seed,
-)
+from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
 from torch import nn
 from torch.optim import AdamW
 from torch.utils.data import DataLoader
@@ -27,38 +22,41 @@ args = parse_args()
 now = datetime.now()
 dt_string = now.strftime("%d.%m.%y-%H:%M:%S")
 layers_to_freeze = args.freeze_layers.split(";")
+substitutions_file = 'mappings/substitutions.txt'
+# mapping_classes = args.mappings_file[:-4].split('-')[-1]
+# specific_results = open_specific_results('mappings/specific_results.txt')['B']
-print("Datasets used: " + args.langs_to_use)
-print("\nDirection: " + args.normalize_direction)
-print("\nMappings file: " + args.mappings_file, flush="True")
+print('\nlangs to use: ' + args.langs_to_use)
+print('\ndirection: ' + args.normalize_direction)
+print('\nmappings file: ' + args.mappings_file, flush=True)
+
+set_seed(42)
+torch.manual_seed(42)
 
 # ===============
 # Dataset class
 # ===============
-
 class Dataset(torch.utils.data.Dataset):
+
     def __init__(self, sentences):
+
         self.labels = [sent[-1] for sent in sentences]
-        self.texts = [
-            tokenizer(
-                sent[-2],
-                is_split_into_words=True,
-                padding="max_length",
-                max_length=512,
-                truncation=True,
-                return_tensors="pt",
-            )
-            for sent in sentences
-        ]
+        self.texts = [tokenizer(sent[-2],
+                                is_split_into_words=True,
+                                padding='max_length',
+                                max_length = 512,
+                                truncation=True,
+                                return_tensors="pt")
+                      for sent in sentences]
 
     def classes(self):
         return self.labels
 
     def __len__(self):
         return len(self.labels)
-
+
     def get_batch_labels(self, idx):
         # Fetch a batch of labels
         return np.array(self.labels[idx])
@@ -68,12 +66,12 @@ class Dataset(torch.utils.data.Dataset):
         return self.texts[idx]
 
     def __getitem__(self, idx):
+
         batch_texts = self.get_batch_texts(idx)
         batch_y = self.get_batch_labels(idx)
 
         return batch_texts, batch_y
 
-
 # ===============
 # Load datasets
 # ===============
@@ -81,90 +79,54 @@ class Dataset(torch.utils.data.Dataset):
 # Open mappings
 mappings, inv_mappings = open_mappings(args.mappings_file)
 batch_size = args.batch_size
-tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
-
-(
-    train_sentences,
-    dev_dict_sentences,
-    test_dict_sentences,
-    framework_labels,
-    disco_features,
-) = open_sentences_with_feats(args.data_path, mappings)
-
-# add disco features as tokens of bert
-tokenizer.add_tokens(
-    [
-        "German",
-        "English",
-        "Basque",
-        "Farsi",
-        "French",
-        "Dutch",
-        "Portuguese",
-        "Russian",
-        "Spanish",
-        "Turkish",
-        "Chinese",
-        "spa.rst.sctb",
-        "rus.rst.rrt",
-        "fra.sdrt.annodis",
-        "por.rst.cstn",
-        "eng.sdrt.stac",
-        "eus.rst.ert",
-        "eng.pdtb.pdtb",
-        "deu.rst.pcc",
-        "eng.rst.rstdt",
-        "zho.rst.sctb",
-        "nld.rst.nldt",
-        "tur.pdtb.tdb",
-        "spa.rst.rststb",
-        "fas.rst.prstc",
-        "zho.pdtb.cdtb",
-        "eng.rst.gum",
-        "rst",
-        "pdtb",
-        "sdrt",
-    ]
-)
-tokenizer.add_tokens(disco_features)
+tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
+
+train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels, disco_features = open_sentences_with_feats(args.data_path, mappings)
+# print('\nCheck encodings:\n')
+# print(train_sentences[0])
+
+# add disco features as tokens
+if args.use_features != 'no':
+    tokenizer.add_tokens(['German', 'English', 'Basque', 'Farsi', 'French', 'Dutch', 'Portuguese', 'Russian', 'Spanish', 'Turkish', 'Chinese', 'spa.rst.sctb', 'rus.rst.rrt', 'fra.sdrt.annodis', 'por.rst.cstn', 'eng.sdrt.stac', 'eus.rst.ert', 'eng.pdtb.pdtb', 'deu.rst.pcc', 'eng.rst.rstdt', 'zho.rst.sctb', 'nld.rst.nldt', 'tur.pdtb.tdb', 'spa.rst.rststb', 'fas.rst.prstc', 'zho.pdtb.cdtb', 'eng.rst.gum', 'rst', 'pdtb', 'sdrt'])
+    tokenizer.add_tokens(disco_features)
+# print(disco_features)
 
 # Determine linear size (= number of classes in the sets + 1)
 num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
 
 # make train/dev datasets
 train_dataset = Dataset(train_sentences)
-dev_dataset = {corpus: Dataset(s) for corpus, s in dev_dict_sentences.items()}
-test_dataset = {corpus: Dataset(s) for corpus, s in test_dict_sentences.items()}
+dev_dataset   = {corpus: Dataset(s) for corpus, s in dev_dict_sentences.items()}
+test_dataset  = {corpus: Dataset(s) for corpus, s in test_dict_sentences.items()}
 
 # Make dasets with batches and dataloader
 train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
-dev_dict_dataloader = {
-    corpus: DataLoader(dev_data, batch_size) for corpus, dev_data in dev_dataset.items()
-}
-test_dict_dataloader = {
-    corpus: DataLoader(test_data, batch_size)
-    for corpus, test_data in test_dataset.items()
-}
+dev_dict_dataloader = {corpus: DataLoader(dev_data, batch_size)
+                       for corpus, dev_data in dev_dataset.items()}
+test_dict_dataloader = {corpus: DataLoader(test_data, batch_size)
+                        for corpus, test_data in test_dataset.items()}
 
 # ===============
 # Model setup
 # ===============
 
-
 class TransformerClassifier(nn.Module):
+
     def __init__(self, dropout=args.dropout):
+
         super(TransformerClassifier, self).__init__()
         self.tr_model = AutoModel.from_pretrained(args.transformer_model)
         self.dropout = nn.Dropout(dropout)
-        self.linear = nn.Linear(768, num_labels)  # bert input x num of classes
+        self.linear = nn.Linear(768, num_labels) # bert input x num of classes
         self.relu = nn.ReLU()
 
     def forward(self, input_id, mask):
-        outputs = self.tr_model(
-            input_ids=input_id, attention_mask=mask, return_dict=True
-        )["last_hidden_state"][:, 0, :]
+
+        outputs = self.tr_model(input_ids = input_id,
+                                attention_mask = mask,
+                                return_dict = True)['last_hidden_state'][:, 0, :]
         dropout_output = self.dropout(outputs)
         linear_output = self.linear(dropout_output)
         final_layer = self.relu(linear_output)
@@ -173,122 +135,182 @@ class TransformerClassifier(nn.Module):
 
 model = TransformerClassifier()
-model.resize_token_embeddings(len(tokenizer))
-
-
-def train(
-    model,
-    train_dataloader,
-    dev_dict_dataloader,
-    test_dict_sentences,
-    test_dict_dataloader,
-    epochs,
-    # specific_results
-):
+# the tokenizer may have gained feature tokens above, so keep the embedding matrix in sync
+model.tr_model.resize_token_embeddings(len(tokenizer))
+
+def train(model,
+          train_dataloader,
+          dev_dict_dataloader,
+          test_dict_sentences,
+          test_dict_dataloader,
+          epochs,
+          #specific_results
+         ):
+
     device = torch.device("cuda" if args.use_cuda else "cpu")
     criterion = nn.CrossEntropyLoss()
-    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)  # Adam  # 1e-6
+    optimizer = AdamW(model.parameters(), #Adam
+                      lr = 2e-5, #1e-6
+                      eps = 1e-8
+                      )
 
     if args.use_cuda:
         model = model.cuda()
         criterion = criterion.cuda()
-
+
     gradient_accumulation_steps = args.gradient_accumulation_steps
     total_steps = len(train_dataloader) * epochs
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=0, num_training_steps=total_steps
-    )
-
-    seed_val = seed_val
-    set_seed(42)
+    
scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps = 0, + num_training_steps = total_steps) + + seed_val = 42 torch.manual_seed(seed_val) torch.cuda.manual_seed_all(seed_val) - + # freeze layers, see argument in configure.py - if args.freeze_layers != "": + if args.freeze_layers != '': for name, param in model.named_parameters(): if any(x in name for x in layers_to_freeze): param.requires_grad = False for epoch_num in range(0, epochs): - print("\n=== Epoch {:} / {:} ===".format(epoch_num + 1, epochs), flush="True") - + print('\n=== Epoch {:} / {:} ==='.format(epoch_num + 1, epochs), flush='True') + model.train() total_acc_train = 0 total_loss_train = 0 batch_counter = 0 - + for train_input, train_label in tqdm(train_dataloader): +# for train_input, train_label in train_dataloader: batch_counter += 1 train_label = train_label.to(device) - mask = train_input["attention_mask"].to(device) - input_id = train_input["input_ids"].squeeze(1).to(device) + mask = train_input['attention_mask'].to(device) + input_id = train_input['input_ids'].squeeze(1).to(device) output = model(input_id, mask) +# batch_loss = criterion(output, train_label.long()) +# total_loss_train += batch_loss.item() + +# acc = (output.argmax(dim=1) == train_label).sum().item() +# total_acc_train += acc + # Compute Loss and Perform Back-propagation loss = criterion(output, train_label.long()) + # Normalize the Gradients loss = loss / gradient_accumulation_steps loss.backward() - if batch_counter % gradient_accumulation_steps == 0: + + if (batch_counter % gradient_accumulation_steps == 0): # Update Optimizer - optimizer.step() # or flip them? + optimizer.step() # or flip them? optimizer.zero_grad() - + model.zero_grad() +# loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) +# optimizer.step() scheduler.step() - + # ------ Validation -------- - - print("\nValidation for epoch:", epoch_num + 1) - + + print('\nValidation for epoch:', epoch_num + 1) + # Dev and test results for each corpus. We don't need to save the results. 
for corpus in dev_dict_dataloader: - dev_results = get_predictions(model, corpus, dev_dict_dataloader[corpus]) + dev_results = get_predictions( + model, + corpus, + dev_dict_dataloader[corpus] + ) better_dev_results = get_better_predictions( - model, - corpus, - dev_dict_dataloader[corpus], - framework_labels[corpus.split(".")[1]], - inv_mappings, - epoch_num + 1, - save_conf_matrix=False, - ) + model, + corpus, + dev_dict_dataloader[corpus], + framework_labels[corpus.split('.')[1]], + inv_mappings, + epoch_num+1, + save_conf_matrix=False + ) + + +# path_results = 'results/dev/language_' + mapping_classes + '_' + str(epoch_num+1) +# if not os.path.exists(path_results): +# os.makedirs(path_results) + +# print_results_to_file(corpus, +# dev_dict_sentences[corpus], +# dev_results, +# inv_mappings, #substitutions_file, +# path_results) # ------ Test -------- - - print("\nTest results for epoch:", epoch_num + 1) - + + print('\nTest results for epoch:', epoch_num + 1) + for corpus in test_dict_dataloader: - test_results = get_predictions(model, corpus, test_dict_dataloader[corpus]) + test_results = get_predictions( + model, + corpus, + test_dict_dataloader[corpus] + ) better_test_results = get_better_predictions( - model, - corpus, - test_dict_dataloader[corpus], - framework_labels[corpus.split(".")[1]], - inv_mappings, - epoch_num + 1, - save_conf_matrix=False, - ) - - -# ------- Start the training ------- - -print("\nModel: ", args.transformer_model) -print("Batch size: ", args.batch_size * args.gradient_accumulation_steps) -print("\nStart training...\n") -train( - model, - train_dataloader, - dev_dict_dataloader, - test_dict_sentences, - test_dict_dataloader, - args.num_epochs, -) -print("\nTraining Done!") + model, + corpus, + test_dict_dataloader[corpus], + framework_labels[corpus.split('.')[1]], + inv_mappings, + epoch_num+1, + save_conf_matrix=False + ) + +# path_results = 'results/test/language_' + mapping_classes + '_' + str(epoch_num+1) +# if not os.path.exists(path_results): +# os.makedirs(path_results) + +# print_results_to_file(corpus, +# test_dict_sentences[corpus], +# test_results, +# inv_mappings, #substitutions_file, +# path_results) + + +# # we want the results of specific epochs for specific corpora. +# # we define the epochs and the corpora and we save only these results. + +# if epoch_num+1 in specific_results: +# for corpus in specific_results[epoch_num+1]: +# test_results = get_predictions(model, +# corpus, +# test_dict_dataloader[corpus], +# print_results=False) + + + # ========= New Code! ============= + # Save for each epoch the dev and test results + + + + +# ------- Start the training ------- + +print('\nModel: ', args.transformer_model) +print('Batch size: ', args.batch_size * args.gradient_accumulation_steps) +print('\nStart training...\n') +train(model, + train_dataloader, + dev_dict_dataloader, + test_dict_sentences, + test_dict_dataloader, + args.num_epochs, +# specific_results + ) +print('\nTraining Done!') + diff --git a/configure.py b/configure.py index e246f4fa996081bb7a793fa6e8581aaf6d150bf6..b7b49e881c873cf2e26f3cf97dec82ef0bad93eb 100644 --- a/configure.py +++ b/configure.py @@ -90,8 +90,17 @@ def parse_args(): "--normalize_direction", default="no", type=str, - help="How will the direction be handled.", + help="How will the direction be handled. Options: discret|disco|no", ) + + # add features + parser.add_argument( + "--add_features", + default="no", + type=str, + help="Which features to add. 
Options: no|lcf|all|common|lcf+all|lcf+common", + ) + # only specific languages/corpora parser.add_argument( diff --git a/utils.py b/utils.py index ac0a68225779ba77bc551f1c79a573404728ce92..3d045234332f5ed02a0a6f572b072aaf29abff04 100644 --- a/utils.py +++ b/utils.py @@ -105,112 +105,29 @@ def encode_label(og_label, mappings_dict): return mappings_dict["unk"] -def open_file(filename, mappings_dict): - """Function to open a .rels file. - Arguments: - - filename: the path to a .rels file - - mappings_dict: a dictionary of mappings of unique labels to integers - Returns a list of lists, where each list is: - the line + [two sentences combined with special BERT token, encoded label] - """ - - max_len = 254 # 512 (max bert len) / 2 (2 sents) -2 (special tokens) - lines = [] - if "bert" in args.transformer_model: - SEP_token = "[SEP]" - CLS_token = "[CLS]" - elif "roberta" in args.transformer_model: - SEP_token = "</s>" - CLS_token = "<s>" - - with open(filename, "r", encoding="utf-8") as f: - next(f) - for line in f: - l = line.strip().split("\t") - - if len(l) > 1: - # chop the sentences to max_len if too long - sent_1 = l[3].split(" ") - sent_2 = l[4].split(" ") - - if len(sent_1) > max_len: - sent_1 = sent_1[:max_len] - if len(sent_2) > max_len: - sent_2 = sent_2[:max_len] - - # flip them if different direction - if args.normalize_direction == "discret": - if l[9] == "1>2": - lines.append( - l - + [ - [CLS_token] + sent_1 + [SEP_token] + sent_2, - encode_label(l[-1], mappings_dict), - ] - ) - else: - lines.append( - l - + [ - [CLS_token] + sent_2 + [SEP_token] + sent_1, - encode_label(l[-1], mappings_dict), - ] - ) - - # implement the stupid Disco thing - elif args.normalize_direction == "disco": - if l[9] == "1>2": - # it should look like: [CLS] } sent_1 ? > [SEP] sent_2 [SEP] - lines.append( - l - + [ - [CLS_token, "}"] - + sent_1 - + [">", SEP_token] - + sent_2 - + [SEP_token], - encode_label(orig_label, mappings_dict), - ] - ) - else: - # it should look like: [CLS] sent_1 [SEP] < sent_2 { [SEP] - lines.append( - l - + [ - [CLS_token] - + sent_1 - + [SEP_token, "<"] - + sent_2 - + ["{", SEP_token], - encode_label(orig_label, mappings_dict), - ] - ) - - # no direction change - else: - lines.append( - l - + [ - [CLS_token] + sent_1 + [SEP_token] + sent_2, - encode_label(l[-1], mappings_dict), - ] - ) - - return lines - - -def open_file_with_feats(filename, mappings_dict): - """Same as above, but first token is language""" - - max_len = int((512 - 32) / 2) - # max_len = 254 +def open_file(filename, mappings_dict): + + ''' Function to open a .rels file. 
+ Arguments: + - filename: the path to a .rels file + - mappings_dict: a dictionary of mappings of unique labels to integers + + Configure presets (careful!): + - args.transformer_model + - args.use_features + - args.normalize_direction + ''' + lines = [] - if "bert" in args.transformer_model: - SEP_token = "[SEP]" - CLS_token = "[CLS]" - elif "roberta" in args.transformer_model: - SEP_token = "</s>" - CLS_token = "<s>" + if args.transformer_model.startswith('bert'): + SEP_token = '[SEP]' + CLS_token = '[CLS]' + elif args.transformer_model.startswith('roberta'): + SEP_token = '</s>' + CLS_token = '<s>' + else: + SEP_token = '' + CLS_token = '' langs = { "deu": "German", @@ -228,105 +145,73 @@ def open_file_with_feats(filename, mappings_dict): "zho": "Chinese", } - with open(filename, "r", encoding="utf-8") as f: + with open(filename, 'r', encoding='utf-8') as f: next(f) - - lang = langs[filename.split("/")[-2].split(".")[0]] - framework = filename.split("/")[-2].split(".")[1] - fullname = filename.split("/")[-2] - + for line in f: - l = line.strip().split("\t") + l = line.strip().split('\t') + og_label = l[-1] + # encode the label + final_label = encode_label(og_label, mappings_dict) - orig_label = l[-1] - additional_features = [] - # genre = l[12] - # additional_features = [lang, fullname, framework] + l[10:34] + l[35:38] # LCF + all disco features - # additional_features = l[10:34] + l[35:38] # only disco features (all) - # additional_features = l[10:17]+[l[19]]+l[22:24]+l[25:29]+[l[30]]+[l[33]]+l[35:38] # LCF + top disco features + lang = langs[filename.split("/")[-2].split(".")[0]] + framework = filename.split("/")[-2].split(".")[1] + fullname = filename.split("/")[-2] + + if len(l) > 1: # empty lines in some files - if len(l) > 1: # chop the sentences to max_len if too long - sent_1 = l[3].split(" ") - sent_2 = l[4].split(" ") + sent_1 = l[3].split(' ') + sent_2 = l[4].split(' ') if len(sent_1) > max_len: sent_1 = sent_1[:max_len] if len(sent_2) > max_len: sent_2 = sent_2[:max_len] + + # add the DiscoDisco features, if we want to + start_of_seq = [CLS_token] + if args.use_features == 'lcf': + start_of_seq = [lang, framework, fullname] + start_of_seq + elif args.use_features == 'all': + start_of_seq = l[10:34] + l[35:38] + start_of_seq + elif args.use_features == 'common': + start_of_seq = l[10:17]+[l[19]]+l[22:24]+l[25:29]+[l[30]]+[l[33]]+l[35:38] + start_of_seq + elif args.use_features == 'lcf+all': + start_of_seq = [lang, framework, fullname] + l[10:34] + l[35:38] + start_of_seq + elif args.use_features == 'lcf+common': + start_of_seq = [lang, framework, fullname] + l[10:17]+[l[19]]+l[22:24]+l[25:29]+[l[30]]+[l[33]]+l[35:38] + start_of_seq # flip them if different direction - if args.normalize_direction == "discret": - if l[9] == "1>2": - # lang, fullname, framework - lines.append( - l - + [ - additional_features - + [CLS_token] - + sent_1 - + [SEP_token] - + sent_2, - encode_label(orig_label, mappings_dict), - ] - ) + if args.normalize_direction == 'discret': + if l[9] == '1>2': + lines.append(l + [ + start_of_seq + sent_1 + [SEP_token] + sent_2, # sentence in list + final_label]) # encoded labels else: - lines.append( - l - + [ - additional_features - + [CLS_token] - + sent_2 - + [SEP_token] - + sent_1, - encode_label(orig_label, mappings_dict), - ] - ) - - # implement the stupid Disco thing - elif args.normalize_direction == "disco": - if l[9] == "1>2": + lines.append(l + [ + start_of_seq + sent_2 + [SEP_token] + sent_1, + final_label]) + + # implement the Disco direction + elif 
args.normalize_direction == 'disco': + if l[9] == '1>2': # it should look like: [CLS] } sent_1 ? > [SEP] sent_2 [SEP] - lines.append( - l - + [ - additional_features - + [CLS_token, "}"] - + sent_1 - + [">", SEP_token] - + sent_2 - + [SEP_token], - encode_label(orig_label, mappings_dict), - ] - ) + lines.append(l + [ + start_of_seq + ['}'] + sent_1 + ['>', SEP_token] + sent_2, + final_label]) else: # it should look like: [CLS] sent_1 [SEP] < sent_2 { [SEP] - lines.append( - l - + [ - additional_features - + [CLS_token] - + sent_1 - + [SEP_token, "<"] - + sent_2 - + ["{", SEP_token], - encode_label(orig_label, mappings_dict), - ] - ) + lines.append(l + [ + start_of_seq + sent_1 + [SEP_token, '<'] + sent_2 + ['{'], + final_label]) + # no direction change else: - lines.append( - l - + [ - additional_features - + [CLS_token] - + sent_1 - + [SEP_token] - + sent_2, - encode_label(l[-1], mappings_dict), - ] - ) - + lines.append(l + [ + start_of_seq + sent_1 + [SEP_token] + sent_2, + final_label]) + return lines @@ -340,91 +225,12 @@ def encode_batch(batch): ) + # =============== # OPENING FILES FUNCTIONS # =============== -def open_sentences(path_to_corpora, mappings_dict): - """Opens all the corpora and the surprise corpora in train/dev/test sets. - Uses the open_file() function from utils. - Returns: - - list of sentences for TRAIN: all the corpora and surprise corpora together - - dict of sentences for DEV: each dev set categorized per corpus - - dict of sentences for TEST: each test set categorized per corpus - - ** NEW ** : dict of labels per framework - """ - langs_to_use = False - - if args.langs_to_use != "@": - langs_to_use = args.langs_to_use.split(";") - - corpora = [ - folder - for folder in os.listdir(path_to_corpora) - if not any(i in folder for i in [".md", "DS_", "utils", "ipynb"]) - ] - - # --------------------- - train_sentences = [] - dev_dict_sentences = {} - test_dict_sentences = {} - - all_labels = {} - - for corpus in corpora: - framework = corpus.split(".")[-2] - if not framework in all_labels: - all_labels[framework] = [] - - # ===== open train ==== - try: - # open normal files - - if langs_to_use: - # if we only train with cetrain corpora, we only load them - train_file = [ - "/".join([path_to_corpora, corpus, x]) - for x in os.listdir(path_to_corpora + "/" + corpus) - if "train" in x and "rels" in x - if any(l in x for l in langs_to_use) - ][0] - else: - train_file = [ - "/".join([path_to_corpora, corpus, x]) - for x in os.listdir(path_to_corpora + "/" + corpus) - if "train" in x and "rels" in x - ][0] - temp = open_file_with_feats(train_file, mappings_dict) # - train_sentences += temp - all_labels[framework] += [l[-1] for l in temp] - - except: # some of them don't have train - pass - - # ======== open dev ======== - dev_dict_sentences[corpus] = [] - dev_file = os.path.join(args.data_path, corpus, corpus + "_dev.rels") - temp = open_file_with_feats(dev_file, mappings_dict) # - dev_dict_sentences[corpus] += temp - all_labels[framework] += [l[-1] for l in temp] - - # ======== open test ======== - test_dict_sentences[corpus] = [] - test_file = os.path.join(args.data_path, corpus, corpus + "_test.rels") - temp = open_file_with_feats(test_file, mappings_dict) # - test_dict_sentences[corpus] += temp - all_labels[framework] += [l[-1] for l in temp] - - corpus_labels = {framework: set(all_labels[framework]) for framework in all_labels} - # delete unk as a sanity check - for framework in corpus_labels: - if "unk" in corpus_labels[framework]: - 
corpus_labels[framework].remove("unk") - - return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels - - def open_sentences_with_feats(path_to_corpora, mappings_dict): """Opens all the corpora and the surprise corpora in train/dev/test sets. Uses the open_file() function from utils. @@ -473,7 +279,7 @@ def open_sentences_with_feats(path_to_corpora, mappings_dict): for x in os.listdir(path_to_corpora + "/" + corpus) if "train" in x and ".rels" in x ][0] - temp = open_file_with_feats(train_file, mappings_dict) + temp = open_file(train_file, mappings_dict) # add sentence train_sentences += temp # add disco features @@ -490,7 +296,7 @@ def open_sentences_with_feats(path_to_corpora, mappings_dict): for x in os.listdir(path_to_corpora + "/" + corpus) if "dev" in x and ".rels" in x ][0] - temp = open_file_with_feats(dev_file, mappings_dict) + temp = open_file(dev_file, mappings_dict) dev_dict_sentences[corpus] += temp # add disco features disco_features += [feature for l in temp for feature in l[10:38]] @@ -504,7 +310,7 @@ def open_sentences_with_feats(path_to_corpora, mappings_dict): for x in os.listdir(path_to_corpora + "/" + corpus) if "test" in x and ".rels" in x ][0] - temp = open_file_with_feats(test_file, mappings_dict) + temp = open_file(test_file, mappings_dict) test_dict_sentences[corpus] += temp # add disco features disco_features += [feature for l in temp for feature in l[10:38]]