diff --git a/ssplit/README.md b/ssplit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..eae79e543ca20308c771bd9b6a057c0ecc706360
--- /dev/null
+++ b/ssplit/README.md
@@ -0,0 +1,23 @@
+# Requirements
+
+`ersatz` (`pip install ersatz`)
+
+`pandas` (`pip install pandas`)
+
+# Usage
+
+`python tok2conllu.py <file>`
+
+The input must be a `.tok` file.
+
+# Output
+
+`<file>_conllu`: the same file, with sentences separated by a blank line and each sentence preceded by a comment line giving its sentence id (`# sent_id = <doc_id>-<n>`).
+
+# Ersatz library
+
+Command-line usage:
+
+`ersatz --input <input.txt> --output <output.txt>`
+
+Takes any text file as input and outputs the same text with each sentence on its own line.
diff --git a/ssplit/tok2conllu.py b/ssplit/tok2conllu.py
new file mode 100644
index 0000000000000000000000000000000000000000..a223a0e0dfe8ac02ddb984296edeac4fd9e3d34f
--- /dev/null
+++ b/ssplit/tok2conllu.py
@@ -0,0 +1,96 @@
+import csv
+import os
+import subprocess
+import sys
+
+import pandas as pd
+
+tab = "\t"
+
+def parse_file(f):
+    """Read a .tok file and return its dataframe, token ids and tokens."""
+
+    column_names = ['tok_id', 'tok', '1', '2', '3', '4', '5', '6', '7', 'seg']
+
+    # Read every field as a plain string: disable quoting and NA conversion so
+    # tokens such as '"' or 'null' survive unchanged.
+    dataframe = pd.read_csv(f, names=column_names, comment="#", sep="\t",
+                            skipinitialspace=True, quoting=csv.QUOTE_NONE,
+                            dtype=str, keep_default_na=False)
+    tok_ids = dataframe['tok_id'].values
+    toks = dataframe['tok'].values
+
+    return dataframe, tok_ids, toks
+
+def write_seq_file(f, tok_ids, toks):
+    """Write the token sequence to a temporary _seq file: one line, each token preceded by its id."""
+
+    with open(f, "w") as wf:
+        for i, tok in zip(tok_ids, toks):
+            wf.write(str(i) + tab + tok + tab)
+
+def parse_ssplit_file(f):
+    """Take the _ssplit file produced by ersatz (one sentence per line) and return the ids of sentence-initial tokens."""
+
+    with open(f, "r") as rf:
+        sentences = rf.readlines()
+
+    sstart_ids = [0] * len(sentences)
+    for i, sentence in enumerate(sentences):
+        ids_toks = sentence.split()  # split on any whitespace (tabs or spaces)
+        sstart_ids[i] = ids_toks[0]  # the first field is the id of the sentence-initial token
+
+    return sstart_ids
+
+def make_ssplit(rf, wf, sstart_ids):
+    """Write a copy of the .tok file with sentence boundaries inserted."""
+    with open(rf, "r") as f:
+        lines = f.readlines()
+
+    doc_id = None
+    next_sentence = 0  # index in sstart_ids of the token beginning the next sentence
+    sent_counter = 0
+
+    with open(wf, "w") as f:
+        for line in lines:
+            split = line.strip().split(tab)
+            tok_id = split[0]
+            if tok_id.startswith("#"):
+                # document-level comment: remember it and restart sentence numbering
+                doc_id = line
+                sent_counter = 0
+                f.write(line)
+            elif next_sentence < len(sstart_ids) and tok_id == sstart_ids[next_sentence]:
+                doc_id_nb = doc_id.strip().split("= ")[1]  # document id from the "# ... = <doc_id>" comment
+                newline = "\n" if sent_counter else ""  # blank line before every sentence except the first of a document
+                sent_counter += 1
+                sent_id = "# sent_id = " + doc_id_nb + "-" + str(sent_counter)
+                f.write(newline + sent_id + "\n")
+                f.write(line)
+                next_sentence += 1
+            else:
+                f.write(line)
+
+def t2c(f):
+    dataframe, tok_ids, toks = parse_file(f)
+    f_seq = f + "_seq"
+    write_seq_file(f_seq, tok_ids, toks)
+    f_ssplit = f + "_ssplit"
+    # let ersatz split the token sequence into one sentence per line
+    subprocess.run(["ersatz", "--input", f_seq, "--output", f_ssplit], check=True)
+    sstart_ids = parse_ssplit_file(f_ssplit)
+    f_conllu = f + "_conllu"
+    make_ssplit(f, f_conllu, sstart_ids)
+    # remove temporary files
+    os.remove(f_seq)
+    os.remove(f_ssplit)
+
+def main():
+    if len(sys.argv) < 2:
+        print("usage: python tok2conllu.py <file>")
+        sys.exit(1)
+    for f in sys.argv[1:]:
+        t2c(f)
+
+if __name__ == '__main__':
+    main()
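
For reference, a minimal sketch (not part of the patch) of driving the converter from Python instead of the command line. The input file name is hypothetical, and it assumes `ersatz` and `pandas` are installed and that the interpreter is started from the `ssplit/` directory:

    # Minimal usage sketch; the input file name is hypothetical.
    from tok2conllu import t2c

    t2c("example.tok")
    # Writes example.tok_conllu, in which each sentence detected by ersatz is preceded
    # by a comment line of the form "# sent_id = <doc_id>-<n>", where <doc_id> is read
    # from the "# ... = <doc_id>" comment introducing the document in the .tok file
    # and <n> counts sentences within that document.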