# tok2conllu
# Requirements
`ersatz` (`pip install ersatz`)
# Usage
`python tok2conllu.py <file>`
The input must be a `.tok` file.
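For reference, a `.tok` file is a ten-column, tab-separated CoNLL-style file whose token ids run through the whole document, with no sentence boundaries marked. A hypothetical excerpt (document name and tokens invented for illustration; columns are tab-separated, and the eight columns after the token are shown as `_`):

```
# newdoc id = GUM_academic_art
1	Aesthetic	_	_	_	_	_	_	_	_
2	Appreciation	_	_	_	_	_	_	_	_
3	.	_	_	_	_	_	_	_	_
4	It	_	_	_	_	_	_	_	_
5	matters	_	_	_	_	_	_	_	_
6	.	_	_	_	_	_	_	_	_
```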
# Output
`<file>_conllu`: the same file, with each sentence preceded by a comment line giving its sentence id and consecutive sentences separated by a blank line.
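A hypothetical continuation of the excerpt above, as it would appear in `<file>_conllu` if ersatz decides the document contains two sentences (trailing columns elided):

```
# newdoc id = GUM_academic_art
# sent_id = GUM_academic_art-1
1	Aesthetic	_	...
2	Appreciation	_	...
3	.	_	...

# sent_id = GUM_academic_art-2
4	It	_	...
5	matters	_	...
6	.	_	...
```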
# Ersatz library
Command-line usage:
`ersatz --input <input.txt> --output <output.txt>`
Takes any text file as input and writes the same text with sentences separated by line breaks (one sentence per line).
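To illustrate the expected behavior (file names hypothetical): given an `input.txt` containing `Aesthetic Appreciation . It matters .` on a single line, `ersatz --input input.txt --output output.txt` should produce an `output.txt` with one sentence per line:

```
Aesthetic Appreciation .
It matters .
```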
`tok2conllu.py`:

```python
import os
import sys

import pandas as pd

tab = "\t"
space = " "
def parse_file(f):
    """Read a .tok file and return its dataframe, token ids, and tokens."""
    column_names = ['tok_id', 'tok', '1', '2', '3', '4', '5', '6', '7', 'seg']
    # dtype=str keeps ids and tokens as strings; quoting=3 (csv.QUOTE_NONE)
    # prevents quote characters inside tokens from being parsed as CSV quotes
    dataframe = pd.read_csv(f, names=column_names, comment="#", sep="\t",
                            skipinitialspace=True, dtype=str, quoting=3)
    tok_ids = dataframe['tok_id'].values
    toks = dataframe['tok'].values
    return dataframe, tok_ids, toks
def write_seq_file(f, tok_ids, toks):
    """Write the document as one line of tab-separated id/token pairs (the _seq file)."""
    with open(f, "w") as wf:
        for i, tok in zip(tok_ids, toks):
            wf.write(str(i) + tab + str(tok) + tab)
def parse_ssplit_file(f):
    """Take a .tok_ssplit file (one sentence per line) and return the ids of sentence-initial tokens."""
    with open(f, "r") as rf:
        sentences = rf.readlines()
    sstart_ids = []
    for sentence in sentences:
        # each line is 'id tok id tok ...'; keep only the leading token id
        fields = sentence.strip().split(space)
        if fields[0]:  # skip blank lines
            sstart_ids.append(fields[0])
    return sstart_ids
def make_ssplit(rf, wf, sstart_ids):
    """Rewrite the .tok file with a '# sent_id' comment and a blank line before each sentence."""
    with open(rf, "r") as f:
        lines = f.readlines()
    doc_id = None
    next_sentence = 0  # index in sstart_ids of the token beginning the next sentence
    sent_counter = 0
    with open(wf, "w") as f:
        for line in lines:
            split = line.strip().split(tab)
            tok_id = split[0]
            if tok_id.startswith("#"):
                # document header, e.g. '# newdoc id = ...': copy it and reset the sentence counter
                doc_id = line
                sent_counter = 0
                f.write(line)
            elif next_sentence < len(sstart_ids) and tok_id == sstart_ids[next_sentence]:
                # this token starts a new sentence: emit '# sent_id = <doc id>-<n>' first
                # (the bounds check avoids an IndexError on tokens after the last sentence start)
                doc_id_nb = doc_id.strip().split("= ")[1]
                newline = "\n" if sent_counter else ""  # blank line before every sentence but the first
                sent_counter += 1
                sent_id = "# sent_id = " + doc_id_nb + "-" + str(sent_counter)
                f.write(newline + sent_id + "\n")
                f.write(line)
                next_sentence += 1
            else:
                f.write(line)
def t2c(f):
    """Run the full pipeline on one .tok file."""
    dataframe, tok_ids, toks = parse_file(f)
    f_seq = f + "_seq"
    write_seq_file(f_seq, tok_ids, toks)
    f_ssplit = f + "_ssplit"
    # ersatz reads the one-line _seq file and writes one sentence per line
    os.system(f"ersatz --input {f_seq} --output {f_ssplit}")
    sstart_ids = parse_ssplit_file(f_ssplit)
    f_conllu = f + "_conllu"
    make_ssplit(f, f_conllu, sstart_ids)
    # remove temporary files (os.remove is a portable alternative to shelling out to 'rm')
    os.remove(f_seq)
    os.remove(f_ssplit)
def main():
    if len(sys.argv) < 2:
        print("usage: python tok2conllu.py <file>")
        sys.exit(1)
    for f in sys.argv[1:]:
        t2c(f)

if __name__ == '__main__':
    main()
```
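For debugging, it helps to know the two temporary files the script writes and deletes. `<file>_seq` flattens the whole document to a single line of tab-separated id/token pairs, which is what ersatz segments; `<file>_ssplit` is ersatz's output, one sentence per line, from which only the first field of each line (the id of the sentence-initial token) is read back. A hypothetical trace for the excerpt above (note that `parse_ssplit_file` splits on spaces, which assumes ersatz normalizes the tabs to single spaces in its output):

```
# <file>_seq: one line, fed to ersatz
1	Aesthetic	2	Appreciation	3	.	4	It	5	matters	6	.

# <file>_ssplit: one sentence per line; sstart_ids = ['1', '4']
1 Aesthetic 2 Appreciation 3 .
4 It 5 matters 6 .
```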