From 4cac2450ab66d03e905c7a46bb4f800f2b71fb48 Mon Sep 17 00:00:00 2001 From: Alice Pain <alice.pain@ens.psl.eu> Date: Wed, 22 Jun 2022 11:41:10 +0200 Subject: [PATCH] script to parse results into tsv --- .../configs/bert.jsonnet | 2 +- code/ssplit/README.md | 23 +++++ code/ssplit/parse_stanza.py | 2 +- code/ssplit/tok2conllu.py | 87 +++++++++++++++++++ code/utils/get_scores.py | 54 ++++++++++++ scores_expes/220622_conllu.tsv | 39 +++++++++ 6 files changed, 205 insertions(+), 2 deletions(-) create mode 100644 code/ssplit/README.md create mode 100644 code/ssplit/tok2conllu.py create mode 100644 code/utils/get_scores.py create mode 100644 scores_expes/220622_conllu.tsv diff --git a/code/contextual_embeddings/configs/bert.jsonnet b/code/contextual_embeddings/configs/bert.jsonnet index 4df2d3e..4a0b985 100644 --- a/code/contextual_embeddings/configs/bert.jsonnet +++ b/code/contextual_embeddings/configs/bert.jsonnet @@ -67,6 +67,6 @@ "num_serialized_models_to_keep": 3, "num_epochs": 10, "grad_norm": 5.0, - "cuda_device": 0 + "cuda_device": -1 } } diff --git a/code/ssplit/README.md b/code/ssplit/README.md new file mode 100644 index 0000000..eae79e5 --- /dev/null +++ b/code/ssplit/README.md @@ -0,0 +1,23 @@ +#Requirements + +`ersatz` (`pip install ersatz`) + +#Usage + +`python tok2conllu.py <file>` + +File must be a `.tok` file. + +#Output + +`<file>_conllu`: same file but sentences are separated by a line-jump and a commented line with sentence id. + +#Ersatz library + +Command-line usage: + +`ersatz --input <input.txt> --output <output.txt>` + +Takes as input any text file and outputs the same text file with sentences separated by a line-break. + + diff --git a/code/ssplit/parse_stanza.py b/code/ssplit/parse_stanza.py index b5367fb..c4373a7 100644 --- a/code/ssplit/parse_stanza.py +++ b/code/ssplit/parse_stanza.py @@ -65,4 +65,4 @@ def ssplit_stanza(lang, fp_toks, out_dir, treebank=None): f_out.write('\t'.join(row)+'\n') #print('\t'.join(row).encode('utf-8'), file=f_out) f_out.write('\n') - #print('', file=f_out) \ No newline at end of file + #print('', file=f_out) diff --git a/code/ssplit/tok2conllu.py b/code/ssplit/tok2conllu.py new file mode 100644 index 0000000..a223a0e --- /dev/null +++ b/code/ssplit/tok2conllu.py @@ -0,0 +1,87 @@ +import sys +import pandas as pd +import os + +tab = "\t" +space = " " + +def parse_file(f): + """Take a .tok file and turn it into a sequence of token ids and tokens (.tok_seq). Token id precedes token.""" + + column_names = ['tok_id', 'tok', '1', '2', '3', '4', '5', '6', '7', 'seg'] + + dataframe = pd.read_csv(f, names=column_names, comment="#", sep="\t",skipinitialspace=True) + tok_ids = dataframe['tok_id'].values + toks = dataframe['tok'].values + + return dataframe, tok_ids, toks + +def write_seq_file(f, tok_ids, toks): + """Write sequence of token ids and tokens to a .tok_ssplit file""" + + with open(f, "w") as wf: + for i, tok in zip(tok_ids, toks): + wf.write(str(i) + tab + tok + tab) + +def parse_ssplit_file(f): + """Take a .tok_ssplit file and return ids of sentence-beginning tokens.""" + + with open(f, "r") as rf: + sentences = rf.readlines() + + sstart_ids = [0] * len(sentences) + for i, sentence in enumerate(sentences): + ids_toks = sentence.strip().split(space) + sstart_ids[i] = ids_toks[0] + + return sstart_ids + +def make_ssplit(rf, wf, sstart_ids): + """Write new file with sentence boundaries""" + with open(rf, "r") as f: + lines = f.readlines() + + doc_id = None + next_sentence = 0 #index of token beginning next sentence in sstart_ids + sent_counter = 0 + + with open(wf, "w") as f: + for line in lines: + split = line.strip().split(tab) + tok_id = split[0] + if tok_id.startswith("#"): + doc_id = line + sent_counter = 0 + f.write(line) + elif tok_id == sstart_ids[next_sentence]: + doc_id_nb = doc_id.strip().split("= ")[1] + if sent_counter: newline = "\n" + else: newline = "" + sent_counter += 1 + sent_id = "# sent_id = " + doc_id_nb + "-" + str(sent_counter) + f.write(newline + sent_id + "\n") + f.write(line) + next_sentence += 1 + else: + f.write(line) + +def t2c(f): + dataframe, tok_ids, toks = parse_file(f) + f_seq = f + "_seq" + write_seq_file(f_seq, tok_ids, toks) + f_ssplit = f + "_ssplit" + os.system(f"ersatz --input {f_seq} --output {f_ssplit}") + sstart_ids = parse_ssplit_file(f_ssplit) + f_conllu = f + "_conllu" + make_ssplit(f, f_conllu, sstart_ids) + os.system(f"rm {f_seq} {f_ssplit}") #remove temporary files + +def main(): + if len(sys.argv) < 2: + print("usage: python tok2conllu.py <file>") + sys.exit() + for f in sys.argv[1:]: + t2c(f) + +if __name__ == '__main__': + main() diff --git a/code/utils/get_scores.py b/code/utils/get_scores.py new file mode 100644 index 0000000..bf505bc --- /dev/null +++ b/code/utils/get_scores.py @@ -0,0 +1,54 @@ +import sys +import os +import pandas as pd + +#Must be executed from discut dir + +def parse_score(line): + split = line.strip().split(": ") + return str((round(float(split[-1]), 4) * 100))[:5] + +def parse_model(line): + split = line.strip().split("_") + model_ft = split[1].strip().split("-") + if len(model_ft) > 1: model = model_ft + else: model = [model_ft[0], "n/a"] + return model + +def parse_test(line): + split = line.strip().split("_") + return split[0] + +def write_scores(result_dir, output_file): + if not os.path.isdir(result_dir): + print(f"directory {result_dir} doesn't exist.") + + _, dirs, _ = next(iter(os.walk(result_dir))) + table = [] + + for d in dirs: + files = os.listdir(os.path.join(result_dir, d)) + for fil in files: + if fil.endswith('test.scores'): + with open(os.path.join(result_dir, d, fil), "r") as r_f: + lines = r_f.readlines() + precision, recall, fscore = [parse_score(l) for l in lines[-3:]] + model = parse_model(d) + entry = [model[0], model[1], parse_test(fil), precision, recall, fscore] + table.append(entry) + + columns = ['Model', 'Fine-tuning', 'Test', 'Precision', 'Recall', 'F1'] + df = pd.DataFrame(table, columns = columns) + df.to_csv((output_file + ".tsv"), sep="\t", index=False) + +def main(): + if len(sys.argv) != 3: + print("usage: python get_scores.py <result_dir> <output_file>"); sys.exit() + else: + result_dir = sys.argv[1] + output_file = sys.argv[2] + write_scores(result_dir, output_file) + +if __name__ == '__main__': + main() + diff --git a/scores_expes/220622_conllu.tsv b/scores_expes/220622_conllu.tsv new file mode 100644 index 0000000..8414370 --- /dev/null +++ b/scores_expes/220622_conllu.tsv @@ -0,0 +1,39 @@ +Model Fine-tuning Test Precision Recall F1 +fra.sdrt.annodis n/a fra.sdrt.annodis 86.85 88.67 87.75 +fra.sdrt.annodis n/a deu.rst.pcc 72.96 97.28 83.38 +fra.sdrt.annodis n/a eng.sdrt.stac 94.42 89.72 92.01 +fra.sdrt.annodis n/a eus.rst.ert 56.98 87.7 69.08 +fra.sdrt.annodis n/a fas.rst.prstc 79.5 86.27 82.75 +deu.rst.pcc n/a deu.rst.pcc 95.62 96.6 96.11 +deu.rst.pcc n/a rus.rst.rrt 69.38 77.88 73.38 +deu.rst.pcc n/a zho.rst.sctb 54.76 95.83 69.69 +deu.rst.pcc n/a eng.sdrt.stac 98.34 88.17 92.97 +deu.rst.pcc n/a eus.rst.ert 77.85 79.32 78.58 +deu.rst.pcc n/a fas.rst.prstc 91.57 68.06 78.08 +deu.rst.pcc n/a fra.sdrt.annodis 86.00 66.67 75.11 +deu.rst.pcc n/a nld.rst.nldt 92.55 88.17 90.3 +deu.rst.pcc n/a spa.rst.rststb 71.31 91.3 80.08 +deu.rst.pcc n/a por.rst.cstn 82.44 70.59 76.06 +eus.rst.ert n/a deu.rst.pcc 87.06 91.5 89.22 +eus.rst.ert n/a eng.sdrt.stac 99.05 87.2 92.75 +eus.rst.ert n/a eus.rst.ert 90.06 83.24 86.52 +eus.rst.ert n/a fas.rst.prstc 85.34 68.66 76.1 +fas.rst.prstc n/a deu.rst.pcc 81.55 93.2 86.98 +fas.rst.prstc n/a eng.sdrt.stac 98.76 87.46 92.77 +fas.rst.prstc n/a eus.rst.ert 74.49 79.32 76.83 +fas.rst.prstc n/a fas.rst.prstc 92.38 92.24 92.31 +deu.rst.pcc deu.rst.pcc deu.rst.pcc 95.88 94.89 95.38 +deu.rst.pcc eus.rst.ert deu.rst.pcc 98.58 94.56 96.53 +deu.rst.pcc fas.rst.prstc deu.rst.pcc 94.65 96.26 95.45 +deu.rst.pcc fra.sdrt.annodis deu.rst.pcc 96.86 94.56 95.7 +eng.sdrt.stac deu.rst.pcc eng.sdrt.stac 94.69 95.67 95.17 +eng.sdrt.stac eus.rst.ert eng.sdrt.stac 96.46 93.47 94.94 +eng.sdrt.stac fas.rst.prstc eng.sdrt.stac 96.41 94.05 95.22 +eng.sdrt.stac fra.sdrt.annodis eng.sdrt.stac 96.82 94.51 95.65 +eus.rst.ert deu.rst.pcc eus.rst.ert 90.62 82.3 86.26 +eus.rst.ert eus.rst.ert eus.rst.ert 87.31 86.49 86.9 +eus.rst.ert fas.rst.prstc eus.rst.ert 89.55 81.08 85.11 +eus.rst.ert fra.sdrt.annodis eus.rst.ert 92.53 82.03 86.96 +fas.rst.prstc deu.rst.pcc fas.rst.prstc 88.94 93.73 91.28 +fas.rst.prstc eus.rst.ert fas.rst.prstc 93.05 91.94 92.49 +fas.rst.prstc fas.rst.prstc fas.rst.prstc 93.8 90.3 92.02 -- GitLab