Skip to content
Snippets Groups Projects
Commit 4cac2450 authored by Alice Pain's avatar Alice Pain
Browse files

script to parse results into tsv

parent c4800a86
Branches
No related tags found
No related merge requests found
......@@ -67,6 +67,6 @@
"num_serialized_models_to_keep": 3,
"num_epochs": 10,
"grad_norm": 5.0,
"cuda_device": 0
"cuda_device": -1
}
}
# Requirements
`ersatz` (`pip install ersatz`)
# Usage
`python tok2conllu.py <file>`
File must be a `.tok` file.
# Output
`<file>_conllu`: the same file, with sentences separated by a blank line and each preceded by a comment line giving its sentence id.
# Ersatz library
Command-line usage:
`ersatz --input <input.txt> --output <output.txt>`
Takes as input any text file and outputs the same text file with sentences separated by a line-break.
......@@ -65,4 +65,4 @@ def ssplit_stanza(lang, fp_toks, out_dir, treebank=None):
f_out.write('\t'.join(row)+'\n')
#print('\t'.join(row).encode('utf-8'), file=f_out)
f_out.write('\n')
#print('', file=f_out)
\ No newline at end of file
#print('', file=f_out)
import sys
import pandas as pd
import os
# Separator characters used when splitting and joining fields.
tab = "\t"
space = " "
def parse_file(f):
    """Read a .tok file and extract its token ids and tokens.

    Lines whose first non-blank character is "#" are skipped; every other
    line is parsed as ten tab-separated columns.

    Args:
        f: path of the .tok file to read.

    Returns:
        A ``(dataframe, tok_ids, toks)`` tuple: the parsed dataframe, the
        values of its ``tok_id`` column and the values of its ``tok`` column
        (tokens are always strings).
    """
    import csv
    import io

    column_names = ['tok_id', 'tok', '1', '2', '3', '4', '5', '6', '7', 'seg']
    # Drop comment lines by hand: pandas' ``comment="#"`` would also cut a
    # token line short as soon as any field contains "#".
    with open(f, "r", encoding="utf-8") as rf:
        rows = [line for line in rf if not line.lstrip().startswith("#")]
    dataframe = pd.read_csv(
        io.StringIO("".join(rows)),
        names=column_names,
        sep="\t",
        skipinitialspace=True,
        # Tokens are raw text: a lone '"' must not open a quoted field.
        quoting=csv.QUOTE_NONE,
        # Tokens such as "null" or "NA" are words, not missing values.
        keep_default_na=False,
        dtype={'tok': str},
    )
    tok_ids = dataframe['tok_id'].values
    toks = dataframe['tok'].values
    return dataframe, tok_ids, toks
def write_seq_file(f, tok_ids, toks):
    """Write token ids and tokens to ``f`` as one tab-separated sequence.

    The output has the form ``id<TAB>tok<TAB>id<TAB>tok<TAB>...``: each id
    precedes its token, every item is followed by a tab, and no newline is
    written.

    Args:
        f: path of the sequence file to create (overwritten if present).
        tok_ids: iterable of token ids.
        toks: iterable of tokens, paired with ``tok_ids`` by position.
    """
    with open(f, "w", encoding="utf-8") as wf:
        # Formatting through an f-string accepts non-string tokens (e.g. a
        # numeric value produced by the parser) instead of raising TypeError.
        wf.write("".join(f"{i}\t{tok}\t" for i, tok in zip(tok_ids, toks)))
def parse_ssplit_file(f):
    """Return the ids of sentence-initial tokens found in a split file.

    Each non-blank line of ``f`` is taken to be one sentence whose first
    whitespace-separated field is the id of the token that opens it.

    Args:
        f: path of the sentence-split sequence file.

    Returns:
        A list of token ids (as strings), one per non-blank line, in file
        order.
    """
    sstart_ids = []
    with open(f, "r", encoding="utf-8") as rf:
        for sentence in rf:
            # Split on any whitespace: the sequence file is written with
            # tabs, so splitting on a single space would not isolate the id.
            fields = sentence.split()
            if fields:  # a blank line carries no sentence start
                sstart_ids.append(fields[0])
    return sstart_ids
def make_ssplit(rf, wf, sstart_ids):
    """Copy a token file to a new file, inserting sentence boundaries.

    Every line of ``rf`` is copied to ``wf``. A line whose first field starts
    with "#" is remembered as the current document header and resets the
    per-document sentence counter. When a token line's id equals the next
    pending entry of ``sstart_ids``, a ``# sent_id = <doc>-<n>`` comment is
    written before it, preceded by a blank line unless it is the first
    sentence of its document.

    Args:
        rf: path of the token file to read.
        wf: path of the output file (overwritten if present).
        sstart_ids: ids (strings) of sentence-initial tokens, in file order.

    Raises:
        ValueError: if a sentence starts before any "#" header line was seen.
    """
    with open(rf, "r", encoding="utf-8") as f:
        lines = f.readlines()

    doc_id = None
    next_sentence = 0  # index in sstart_ids of the next sentence-initial id
    sent_counter = 0  # sentences already opened in the current document
    n_sentences = len(sstart_ids)

    with open(wf, "w", encoding="utf-8") as f:
        for line in lines:
            tok_id = line.strip().split("\t")[0]
            if tok_id.startswith("#"):
                doc_id = line
                sent_counter = 0
            # Bounds check first: once every sentence start is consumed, the
            # remaining token lines are simply copied.
            elif next_sentence < n_sentences and tok_id == sstart_ids[next_sentence]:
                if doc_id is None:
                    raise ValueError(
                        f"sentence starts at token {tok_id!r} before any "
                        f"'#' document header in {rf}"
                    )
                doc_id_nb = doc_id.strip().split("= ")[1]
                separator = "\n" if sent_counter else ""
                sent_counter += 1
                f.write(f"{separator}# sent_id = {doc_id_nb}-{sent_counter}\n")
                next_sentence += 1
            f.write(line)
def t2c(f):
    """Convert a .tok file into ``<f>_conllu`` with sentence boundaries.

    The tokens of ``f`` are written to a temporary ``<f>_seq`` file, split
    into sentences by the ``ersatz`` command-line tool (output in
    ``<f>_ssplit``), and the resulting sentence starts are used to write
    ``<f>_conllu``. Both temporary files are removed afterwards, even when a
    step fails.

    Args:
        f: path of the .tok file to convert.

    Raises:
        subprocess.CalledProcessError: if ``ersatz`` exits with a non-zero
            status.
    """
    import subprocess

    _, tok_ids, toks = parse_file(f)
    f_seq = f + "_seq"
    f_ssplit = f + "_ssplit"
    f_conllu = f + "_conllu"
    try:
        write_seq_file(f_seq, tok_ids, toks)
        # Argument list without a shell: paths containing spaces or shell
        # metacharacters are passed through verbatim.
        subprocess.run(
            ["ersatz", "--input", f_seq, "--output", f_ssplit],
            check=True,
        )
        sstart_ids = parse_ssplit_file(f_ssplit)
        make_ssplit(f, f_conllu, sstart_ids)
    finally:
        # Remove temporary files; one may be missing if an earlier step failed.
        for tmp_path in (f_seq, f_ssplit):
            try:
                os.remove(tmp_path)
            except FileNotFoundError:
                pass
def main():
    """Convert every .tok file named on the command line.

    Exits with a usage message on stderr and a non-zero status when no file
    argument is given.
    """
    if len(sys.argv) < 2:
        # sys.exit(str) prints the message to stderr and exits with status 1,
        # so calling scripts can detect the misuse.
        sys.exit("usage: python tok2conllu.py <file>")
    for f in sys.argv[1:]:
        t2c(f)


if __name__ == '__main__':
    main()
import sys
import os
import pandas as pd
# Must be executed from the discut directory.
def parse_score(line):
    """Turn a ``<label>: <value>`` score line into a percentage string.

    The value after the last ": " is read as a float, multiplied by 100 and
    rounded to two decimals, e.g. ``"Precision: 0.8685"`` -> ``"86.85"``.

    Args:
        line: one line of a scores file.

    Returns:
        The percentage as a string.
    """
    value = float(line.strip().split(": ")[-1])
    # Round *after* scaling: truncating the repr of ``round(v, 4) * 100``
    # exposes float noise (0.57 * 100 == 56.99999999999999 -> "56.99").
    return str(round(value * 100, 2))
def parse_model(line):
    """Extract the model and fine-tuning names from a result directory name.

    The second "_"-separated field of ``line`` is split on "-". A single
    piece is returned as ``[piece, "n/a"]``; otherwise all pieces are
    returned as a list.
    """
    name_field = line.strip().split("_")[1]
    pieces = name_field.strip().split("-")
    if len(pieces) == 1:
        return [pieces[0], "n/a"]
    return pieces
def parse_test(line):
    """Return the part of the stripped ``line`` that precedes the first "_"."""
    head, _, _ = line.strip().partition("_")
    return head
def write_scores(result_dir, output_file):
    """Gather test scores under ``result_dir`` into ``<output_file>.tsv``.

    Each immediate sub-directory of ``result_dir`` is scanned for files whose
    name ends in ``test.scores``. The last three lines of such a file are
    read as precision, recall and F-score (in that order); the model and
    fine-tuning names come from the sub-directory name and the test name from
    the file name. One row per scores file is written to a tab-separated
    file.

    Args:
        result_dir: directory holding one sub-directory per run.
        output_file: output path without the ``.tsv`` extension.

    Returns:
        None. If ``result_dir`` is not a directory, a message is printed and
        nothing is written.
    """
    if not os.path.isdir(result_dir):
        print(f"directory {result_dir} doesn't exist.")
        # Stop here: walking a missing directory yields nothing and the
        # unpacking below would fail with StopIteration.
        return
    _, dirs, _ = next(os.walk(result_dir), (result_dir, [], []))
    table = []
    for d in dirs:
        run_dir = os.path.join(result_dir, d)
        for fil in os.listdir(run_dir):
            if not fil.endswith('test.scores'):
                continue
            with open(os.path.join(run_dir, fil), "r") as r_f:
                lines = r_f.readlines()
            precision, recall, fscore = [parse_score(l) for l in lines[-3:]]
            model = parse_model(d)
            entry = [model[0], model[1], parse_test(fil), precision, recall, fscore]
            table.append(entry)
    columns = ['Model', 'Fine-tuning', 'Test', 'Precision', 'Recall', 'F1']
    df = pd.DataFrame(table, columns=columns)
    df.to_csv((output_file + ".tsv"), sep="\t", index=False)
def main():
    """Write the scores found under a result directory to a TSV file.

    Expects exactly two command-line arguments, ``<result_dir>`` and
    ``<output_file>``; otherwise exits with a usage message on stderr and a
    non-zero status.
    """
    if len(sys.argv) != 3:
        # sys.exit(str) prints the message to stderr and exits with status 1,
        # so calling scripts can detect the misuse.
        sys.exit("usage: python get_scores.py <result_dir> <output_file>")
    result_dir, output_file = sys.argv[1:]
    write_scores(result_dir, output_file)


if __name__ == '__main__':
    main()
Model Fine-tuning Test Precision Recall F1
fra.sdrt.annodis n/a fra.sdrt.annodis 86.85 88.67 87.75
fra.sdrt.annodis n/a deu.rst.pcc 72.96 97.28 83.38
fra.sdrt.annodis n/a eng.sdrt.stac 94.42 89.72 92.01
fra.sdrt.annodis n/a eus.rst.ert 56.98 87.7 69.08
fra.sdrt.annodis n/a fas.rst.prstc 79.5 86.27 82.75
deu.rst.pcc n/a deu.rst.pcc 95.62 96.6 96.11
deu.rst.pcc n/a rus.rst.rrt 69.38 77.88 73.38
deu.rst.pcc n/a zho.rst.sctb 54.76 95.83 69.69
deu.rst.pcc n/a eng.sdrt.stac 98.34 88.17 92.97
deu.rst.pcc n/a eus.rst.ert 77.85 79.32 78.58
deu.rst.pcc n/a fas.rst.prstc 91.57 68.06 78.08
deu.rst.pcc n/a fra.sdrt.annodis 86.00 66.67 75.11
deu.rst.pcc n/a nld.rst.nldt 92.55 88.17 90.3
deu.rst.pcc n/a spa.rst.rststb 71.31 91.3 80.08
deu.rst.pcc n/a por.rst.cstn 82.44 70.59 76.06
eus.rst.ert n/a deu.rst.pcc 87.06 91.5 89.22
eus.rst.ert n/a eng.sdrt.stac 99.05 87.2 92.75
eus.rst.ert n/a eus.rst.ert 90.06 83.24 86.52
eus.rst.ert n/a fas.rst.prstc 85.34 68.66 76.1
fas.rst.prstc n/a deu.rst.pcc 81.55 93.2 86.98
fas.rst.prstc n/a eng.sdrt.stac 98.76 87.46 92.77
fas.rst.prstc n/a eus.rst.ert 74.49 79.32 76.83
fas.rst.prstc n/a fas.rst.prstc 92.38 92.24 92.31
deu.rst.pcc deu.rst.pcc deu.rst.pcc 95.88 94.89 95.38
deu.rst.pcc eus.rst.ert deu.rst.pcc 98.58 94.56 96.53
deu.rst.pcc fas.rst.prstc deu.rst.pcc 94.65 96.26 95.45
deu.rst.pcc fra.sdrt.annodis deu.rst.pcc 96.86 94.56 95.7
eng.sdrt.stac deu.rst.pcc eng.sdrt.stac 94.69 95.67 95.17
eng.sdrt.stac eus.rst.ert eng.sdrt.stac 96.46 93.47 94.94
eng.sdrt.stac fas.rst.prstc eng.sdrt.stac 96.41 94.05 95.22
eng.sdrt.stac fra.sdrt.annodis eng.sdrt.stac 96.82 94.51 95.65
eus.rst.ert deu.rst.pcc eus.rst.ert 90.62 82.3 86.26
eus.rst.ert eus.rst.ert eus.rst.ert 87.31 86.49 86.9
eus.rst.ert fas.rst.prstc eus.rst.ert 89.55 81.08 85.11
eus.rst.ert fra.sdrt.annodis eus.rst.ert 92.53 82.03 86.96
fas.rst.prstc deu.rst.pcc fas.rst.prstc 88.94 93.73 91.28
fas.rst.prstc eus.rst.ert fas.rst.prstc 93.05 91.94 92.49
fas.rst.prstc fas.rst.prstc fas.rst.prstc 93.8 90.3 92.02
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment