diff --git a/code/contextual_embeddings/configs/bert.jsonnet b/code/contextual_embeddings/configs/bert.jsonnet index 4df2d3ef50289765f1074525602074943e7c5769..4a0b9853d49ec75d85eaa9bee5dedb4e0195bff6 100644 --- a/code/contextual_embeddings/configs/bert.jsonnet +++ b/code/contextual_embeddings/configs/bert.jsonnet @@ -67,6 +67,6 @@ "num_serialized_models_to_keep": 3, "num_epochs": 10, "grad_norm": 5.0, - "cuda_device": 0 + "cuda_device": -1 } } diff --git a/code/ssplit/README.md b/code/ssplit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eae79e543ca20308c771bd9b6a057c0ecc706360 --- /dev/null +++ b/code/ssplit/README.md @@ -0,0 +1,23 @@ +# Requirements + +`ersatz` (`pip install ersatz`) + +# Usage + +`python tok2conllu.py <file>` + +File must be a `.tok` file. + +# Output + +`<file>_conllu`: same file, but sentences are separated by a blank line and a comment line with the sentence id. + +# Ersatz library + +Command-line usage: + +`ersatz --input <input.txt> --output <output.txt>` + +Takes as input any text file and outputs the same text file with sentences separated by a line break. 
+ + diff --git a/code/ssplit/parse_stanza.py b/code/ssplit/parse_stanza.py index b5367fb9b2e1592df161b933e1a4fd60b9024590..c4373a75268e18a811c7992ec641f2c44c4b389e 100644 --- a/code/ssplit/parse_stanza.py +++ b/code/ssplit/parse_stanza.py @@ -65,4 +65,4 @@ def ssplit_stanza(lang, fp_toks, out_dir, treebank=None): f_out.write('\t'.join(row)+'\n') #print('\t'.join(row).encode('utf-8'), file=f_out) f_out.write('\n') - #print('', file=f_out) \ No newline at end of file + #print('', file=f_out) diff --git a/code/ssplit/tok2conllu.py b/code/ssplit/tok2conllu.py new file mode 100644 index 0000000000000000000000000000000000000000..a223a0e0dfe8ac02ddb984296edeac4fd9e3d34f --- /dev/null +++ b/code/ssplit/tok2conllu.py @@ -0,0 +1,87 @@ +import sys +import pandas as pd +import os + +tab = "\t" +space = " " + +def parse_file(f): + """Take a .tok file and turn it into a sequence of token ids and tokens (.tok_seq). Token id precedes token.""" + + column_names = ['tok_id', 'tok', '1', '2', '3', '4', '5', '6', '7', 'seg'] + + dataframe = pd.read_csv(f, names=column_names, comment="#", sep="\t",skipinitialspace=True) + tok_ids = dataframe['tok_id'].values + toks = dataframe['tok'].values + + return dataframe, tok_ids, toks + +def write_seq_file(f, tok_ids, toks): + """Write sequence of token ids and tokens to a .tok_ssplit file""" + + with open(f, "w") as wf: + for i, tok in zip(tok_ids, toks): + wf.write(str(i) + tab + tok + tab) + +def parse_ssplit_file(f): + """Take a .tok_ssplit file and return ids of sentence-beginning tokens.""" + + with open(f, "r") as rf: + sentences = rf.readlines() + + sstart_ids = [0] * len(sentences) + for i, sentence in enumerate(sentences): + ids_toks = sentence.strip().split(space) + sstart_ids[i] = ids_toks[0] + + return sstart_ids + +def make_ssplit(rf, wf, sstart_ids): + """Write new file with sentence boundaries""" + with open(rf, "r") as f: + lines = f.readlines() + + doc_id = None + next_sentence = 0 #index of token beginning next sentence in 
sstart_ids + sent_counter = 0 + + with open(wf, "w") as f: + for line in lines: + split = line.strip().split(tab) + tok_id = split[0] + if tok_id.startswith("#"): + doc_id = line + sent_counter = 0 + f.write(line) + elif tok_id == sstart_ids[next_sentence]: + doc_id_nb = doc_id.strip().split("= ")[1] + if sent_counter: newline = "\n" + else: newline = "" + sent_counter += 1 + sent_id = "# sent_id = " + doc_id_nb + "-" + str(sent_counter) + f.write(newline + sent_id + "\n") + f.write(line) + next_sentence += 1 + else: + f.write(line) + +def t2c(f): + dataframe, tok_ids, toks = parse_file(f) + f_seq = f + "_seq" + write_seq_file(f_seq, tok_ids, toks) + f_ssplit = f + "_ssplit" + os.system(f"ersatz --input {f_seq} --output {f_ssplit}") + sstart_ids = parse_ssplit_file(f_ssplit) + f_conllu = f + "_conllu" + make_ssplit(f, f_conllu, sstart_ids) + os.system(f"rm {f_seq} {f_ssplit}") #remove temporary files + +def main(): + if len(sys.argv) < 2: + print("usage: python tok2conllu.py <file>") + sys.exit() + for f in sys.argv[1:]: + t2c(f) + +if __name__ == '__main__': + main() diff --git a/code/utils/get_scores.py b/code/utils/get_scores.py new file mode 100644 index 0000000000000000000000000000000000000000..bf505bc239b4ce1af678313d7dc4f2ebeff9f105 --- /dev/null +++ b/code/utils/get_scores.py @@ -0,0 +1,54 @@ +import sys +import os +import pandas as pd + +#Must be executed from discut dir + +def parse_score(line): + split = line.strip().split(": ") + return str((round(float(split[-1]), 4) * 100))[:5] + +def parse_model(line): + split = line.strip().split("_") + model_ft = split[1].strip().split("-") + if len(model_ft) > 1: model = model_ft + else: model = [model_ft[0], "n/a"] + return model + +def parse_test(line): + split = line.strip().split("_") + return split[0] + +def write_scores(result_dir, output_file): + if not os.path.isdir(result_dir): + print(f"directory {result_dir} doesn't exist.") + + _, dirs, _ = next(iter(os.walk(result_dir))) + table = [] + + for d in 
dirs: + files = os.listdir(os.path.join(result_dir, d)) + for fil in files: + if fil.endswith('test.scores'): + with open(os.path.join(result_dir, d, fil), "r") as r_f: + lines = r_f.readlines() + precision, recall, fscore = [parse_score(l) for l in lines[-3:]] + model = parse_model(d) + entry = [model[0], model[1], parse_test(fil), precision, recall, fscore] + table.append(entry) + + columns = ['Model', 'Fine-tuning', 'Test', 'Precision', 'Recall', 'F1'] + df = pd.DataFrame(table, columns = columns) + df.to_csv((output_file + ".tsv"), sep="\t", index=False) + +def main(): + if len(sys.argv) != 3: + print("usage: python get_scores.py <result_dir> <output_file>"); sys.exit() + else: + result_dir = sys.argv[1] + output_file = sys.argv[2] + write_scores(result_dir, output_file) + +if __name__ == '__main__': + main() + diff --git a/scores_expes/220622_conllu.tsv b/scores_expes/220622_conllu.tsv new file mode 100644 index 0000000000000000000000000000000000000000..8414370029fca110f52a378265dd779bfd51f6e7 --- /dev/null +++ b/scores_expes/220622_conllu.tsv @@ -0,0 +1,39 @@ +Model Fine-tuning Test Precision Recall F1 +fra.sdrt.annodis n/a fra.sdrt.annodis 86.85 88.67 87.75 +fra.sdrt.annodis n/a deu.rst.pcc 72.96 97.28 83.38 +fra.sdrt.annodis n/a eng.sdrt.stac 94.42 89.72 92.01 +fra.sdrt.annodis n/a eus.rst.ert 56.98 87.7 69.08 +fra.sdrt.annodis n/a fas.rst.prstc 79.5 86.27 82.75 +deu.rst.pcc n/a deu.rst.pcc 95.62 96.6 96.11 +deu.rst.pcc n/a rus.rst.rrt 69.38 77.88 73.38 +deu.rst.pcc n/a zho.rst.sctb 54.76 95.83 69.69 +deu.rst.pcc n/a eng.sdrt.stac 98.34 88.17 92.97 +deu.rst.pcc n/a eus.rst.ert 77.85 79.32 78.58 +deu.rst.pcc n/a fas.rst.prstc 91.57 68.06 78.08 +deu.rst.pcc n/a fra.sdrt.annodis 86.00 66.67 75.11 +deu.rst.pcc n/a nld.rst.nldt 92.55 88.17 90.3 +deu.rst.pcc n/a spa.rst.rststb 71.31 91.3 80.08 +deu.rst.pcc n/a por.rst.cstn 82.44 70.59 76.06 +eus.rst.ert n/a deu.rst.pcc 87.06 91.5 89.22 +eus.rst.ert n/a eng.sdrt.stac 99.05 87.2 92.75 +eus.rst.ert n/a eus.rst.ert 
90.06 83.24 86.52 +eus.rst.ert n/a fas.rst.prstc 85.34 68.66 76.1 +fas.rst.prstc n/a deu.rst.pcc 81.55 93.2 86.98 +fas.rst.prstc n/a eng.sdrt.stac 98.76 87.46 92.77 +fas.rst.prstc n/a eus.rst.ert 74.49 79.32 76.83 +fas.rst.prstc n/a fas.rst.prstc 92.38 92.24 92.31 +deu.rst.pcc deu.rst.pcc deu.rst.pcc 95.88 94.89 95.38 +deu.rst.pcc eus.rst.ert deu.rst.pcc 98.58 94.56 96.53 +deu.rst.pcc fas.rst.prstc deu.rst.pcc 94.65 96.26 95.45 +deu.rst.pcc fra.sdrt.annodis deu.rst.pcc 96.86 94.56 95.7 +eng.sdrt.stac deu.rst.pcc eng.sdrt.stac 94.69 95.67 95.17 +eng.sdrt.stac eus.rst.ert eng.sdrt.stac 96.46 93.47 94.94 +eng.sdrt.stac fas.rst.prstc eng.sdrt.stac 96.41 94.05 95.22 +eng.sdrt.stac fra.sdrt.annodis eng.sdrt.stac 96.82 94.51 95.65 +eus.rst.ert deu.rst.pcc eus.rst.ert 90.62 82.3 86.26 +eus.rst.ert eus.rst.ert eus.rst.ert 87.31 86.49 86.9 +eus.rst.ert fas.rst.prstc eus.rst.ert 89.55 81.08 85.11 +eus.rst.ert fra.sdrt.annodis eus.rst.ert 92.53 82.03 86.96 +fas.rst.prstc deu.rst.pcc fas.rst.prstc 88.94 93.73 91.28 +fas.rst.prstc eus.rst.ert fas.rst.prstc 93.05 91.94 92.49 +fas.rst.prstc fas.rst.prstc fas.rst.prstc 93.8 90.3 92.02