# tok2conllu
# Requirements
`ersatz` (`pip install ersatz`)
# Usage
`python tok2conllu.py <file>`
The input must be a `.tok` file.
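For reference, a `.tok` file is a ten-column, tab-separated CoNLL-style file whose token ids run through the whole document, with no sentence boundaries marked. A hypothetical excerpt (document name and tokens invented for illustration; columns are tab-separated, and the eight columns after the token are shown as `_`):

```
# newdoc id = GUM_academic_art
1	Aesthetic	_	_	_	_	_	_	_	_
2	Appreciation	_	_	_	_	_	_	_	_
3	.	_	_	_	_	_	_	_	_
4	It	_	_	_	_	_	_	_	_
5	matters	_	_	_	_	_	_	_	_
6	.	_	_	_	_	_	_	_	_
```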
# Output
`<file>_conllu`: the same file, with each sentence preceded by a comment line giving its sentence id and consecutive sentences separated by a blank line.
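A hypothetical continuation of the excerpt above, as it would appear in `<file>_conllu` if ersatz decides the document contains two sentences (trailing columns elided):

```
# newdoc id = GUM_academic_art
# sent_id = GUM_academic_art-1
1	Aesthetic	_	...
2	Appreciation	_	...
3	.	_	...

# sent_id = GUM_academic_art-2
4	It	_	...
5	matters	_	...
6	.	_	...
```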
# Ersatz library
Command-line usage:
`ersatz --input <input.txt> --output <output.txt>`
Takes any text file as input and writes the same text with sentences separated by line breaks (one sentence per line).
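To illustrate the expected behavior (file names hypothetical): given an `input.txt` containing `Aesthetic Appreciation . It matters .` on a single line, `ersatz --input input.txt --output output.txt` should produce an `output.txt` with one sentence per line:

```
Aesthetic Appreciation .
It matters .
```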
`tok2conllu.py`:

```python
import os
import sys

import pandas as pd

tab = "\t"
space = " "
def parse_file(f):
    """Read a .tok file and return its dataframe, token ids, and tokens."""
    column_names = ['tok_id', 'tok', '1', '2', '3', '4', '5', '6', '7', 'seg']
    # dtype=str keeps ids and tokens as strings; quoting=3 (csv.QUOTE_NONE)
    # prevents quote characters inside tokens from being parsed as CSV quotes
    dataframe = pd.read_csv(f, names=column_names, comment="#", sep="\t",
                            skipinitialspace=True, dtype=str, quoting=3)
    tok_ids = dataframe['tok_id'].values
    toks = dataframe['tok'].values
    return dataframe, tok_ids, toks
def write_seq_file(f, tok_ids, toks):
    """Write the document as one line of tab-separated id/token pairs (the _seq file)."""
    with open(f, "w") as wf:
        for i, tok in zip(tok_ids, toks):
            wf.write(str(i) + tab + str(tok) + tab)
def parse_ssplit_file(f):
    """Take a .tok_ssplit file (one sentence per line) and return the ids of sentence-initial tokens."""
    with open(f, "r") as rf:
        sentences = rf.readlines()
    sstart_ids = []
    for sentence in sentences:
        # each line is 'id tok id tok ...'; keep only the leading token id
        fields = sentence.strip().split(space)
        if fields[0]:  # skip blank lines
            sstart_ids.append(fields[0])
    return sstart_ids
def make_ssplit(rf, wf, sstart_ids):
    """Rewrite the .tok file with a '# sent_id' comment and a blank line before each sentence."""
    with open(rf, "r") as f:
        lines = f.readlines()
    doc_id = None
    next_sentence = 0  # index in sstart_ids of the token beginning the next sentence
    sent_counter = 0
    with open(wf, "w") as f:
        for line in lines:
            split = line.strip().split(tab)
            tok_id = split[0]
            if tok_id.startswith("#"):
                # document header, e.g. '# newdoc id = ...': copy it and reset the sentence counter
                doc_id = line
                sent_counter = 0
                f.write(line)
            elif next_sentence < len(sstart_ids) and tok_id == sstart_ids[next_sentence]:
                # this token starts a new sentence: emit '# sent_id = <doc id>-<n>' first
                # (the bounds check avoids an IndexError on tokens after the last sentence start)
                doc_id_nb = doc_id.strip().split("= ")[1]
                newline = "\n" if sent_counter else ""  # blank line before every sentence but the first
                sent_counter += 1
                sent_id = "# sent_id = " + doc_id_nb + "-" + str(sent_counter)
                f.write(newline + sent_id + "\n")
                f.write(line)
                next_sentence += 1
            else:
                f.write(line)
def t2c(f):
    """Run the full pipeline on one .tok file."""
    dataframe, tok_ids, toks = parse_file(f)
    f_seq = f + "_seq"
    write_seq_file(f_seq, tok_ids, toks)
    f_ssplit = f + "_ssplit"
    # ersatz reads the one-line _seq file and writes one sentence per line
    os.system(f"ersatz --input {f_seq} --output {f_ssplit}")
    sstart_ids = parse_ssplit_file(f_ssplit)
    f_conllu = f + "_conllu"
    make_ssplit(f, f_conllu, sstart_ids)
    # remove temporary files (os.remove is a portable alternative to shelling out to 'rm')
    os.remove(f_seq)
    os.remove(f_ssplit)
def main():
    if len(sys.argv) < 2:
        print("usage: python tok2conllu.py <file>")
        sys.exit(1)
    for f in sys.argv[1:]:
        t2c(f)

if __name__ == '__main__':
    main()
```
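For debugging, it helps to know the two temporary files the script writes and deletes. `<file>_seq` flattens the whole document to a single line of tab-separated id/token pairs, which is what ersatz segments; `<file>_ssplit` is ersatz's output, one sentence per line, from which only the first field of each line (the id of the sentence-initial token) is read back. A hypothetical trace for the excerpt above (note that `parse_ssplit_file` splits on spaces, which assumes ersatz normalizes the tabs to single spaces in its output):

```
# <file>_seq: one line, fed to ersatz
1	Aesthetic	2	Appreciation	3	.	4	It	5	matters	6	.

# <file>_ssplit: one sentence per line; sstart_ids = ['1', '4']
1 Aesthetic 2 Appreciation 3 .
4 It 5 matters 6 .
```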