Commit d6975588 authored by larivier

Merge branch 'dev-allen' into 'main'

updates and add usecase 2

See merge request !1
parents 2fcdded4 31228c51
# Project DisCut22 *still WIP*: Discourse Annotation Tool

A tool for Discourse Annotation. Inheritor of ToNy and DisCut, segmenters for DISRPT 2019 and 2021. The goal of this version is to be easy to use with or without IT knowledge.
__2021__
*[Multi-lingual Discourse Segmentation and Connective Identification: MELODI at Disrpt2021](https://aclanthology.org/2021.disrpt-1.3.pdf)*
@@ -13,17 +13,22 @@ Code: https://gitlab.inria.fr/andiamo/tony
# Usage
## Usecases
- **Discourse Segmentation:** Take raw text as input, use a loaded model to make predictions, and output the same text with EDU segmentation.
- **Segmentation Evaluation:** Take EDU gold-segmented text as input, use a loaded model to make predictions, and output the scores of the model predictions against the gold segmentation, along with the discrepancies.
## Content description
[TBD: explain the directories automatically created during script runs]
- `data/MyProjet/` Contains input data, raw and/or pre-processed format(s).
- `results/` Contains output data, scores and post-processed data (also the allennlp logs).
- `code/` Contains main scripts.
- `discut22_1.py` One Python script to run them all.
- `config_XX.json` A file to be completed (or a dir with a choice between simple use_case configs and a template for a custom config).
- `utils/` Contains useful scripts to be called.
- `model/` Contains the model to be loaded or created.
- `documentation.md` Contains detailed documentation (TBD?).
## Set up environment
- Conda setup for Python 3.7 (TBD?)
@@ -40,10 +45,12 @@ pip install -r <dir?>requirements.txt
(run from the repository root)
Run this command:
```
python code/discut22.py --config code/config_1.json
```
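The same entry point covers the evaluation usecase (usecase 2): point `--config` at a usecase-2 config instead. A minimal sketch using Python's `subprocess`, where the file name `code/config_2.json` is only an assumed name for one of the usecase-2 configs shown further below:

```
# Minimal sketch (assumption: a usecase-2 config saved as code/config_2.json;
# the actual file name may differ). Run from the repository root.
import subprocess

subprocess.run(
    ["python", "code/discut22.py", "--config", "code/config_2.json"],
    check=True,  # fail loudly if the pipeline exits with an error
)
# Scores and discrepancies are then written to <results_path>/Evaluation.txt,
# as configured in the "input" section of the config file.
```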
## Authors and acknowledgment
Morteza Ezzabady
Amir Zeldes
@@ -5,17 +5,21 @@
class Input:
    def __init__(self, infos):
        self.name = infos['name']
        self.lang = infos['language']
        self.path = infos['folder_path']  # misused
        self.file = infos['file']
        self.form = infos['format']  # not used
        self.gold = infos['gold']  # not used
        self.resu = infos['results_path']  # misused: should be created automatically

class Process:
    def __init__(self, infos, data):
        self.main = infos["main"]
        self.toke = infos['pre-processing']['tokenization']  # not used
        self.ssplit = infos['pre-processing']['sentence_split']
        self.ssplitor = infos['pre-processing']['sentence_split_splitor']
        self.data = data
        self.model = infos['discourse_segmenter']['model']  # ezpz for Tony
        self.post_tab = infos['post-processing']['json_to_tab']
        self.post_bracket = infos['post-processing']['tab_to_bracket']
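A minimal sketch (not part of this commit) of how these classes are fed from a config file; `config_2.json` is an assumed file name for the usecase-2 config shown further down, whose keys match the lookups above:

```
# Illustrative only: build the Input and Process objects from a usecase-2 style config.
import json

from classes_def import Input, Process

with open("config_2.json", encoding="utf-8") as f:  # assumed file name
    config = json.load(f)

data = Input(config["input"])            # name, language, folder_path, file, format, gold, results_path
steps = Process(config["steps"], data)   # main, pre-/post-processing flags, discourse_segmenter model

print(steps.model)  # e.g. "tony"
print(data.resu)    # e.g. "../data/fra.sdrt.annodis_dev/results"
```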
@@ -14,9 +14,10 @@
"framework": "sdrt"
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": true,
"sentence_split": true,
"sentence_split": false,
"syntactic_parsing": false,
"NER_format_initialisation": true
},
{
  "usecase_description": "Config file for usecase_2: Take an EDU gold segmented text in .tok format as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. To start, we evaluate tony on the annodis dev set.",
  "input": {
    "name": "fra.sdrt.annodis_dev",
    "file": ".ttok",
    "folder_path": "../data/fra.sdrt.annodis_dev",
    "format": "truc",
    "language": "fr",
    "gold": true,
    "results_path": "../data/fra.sdrt.annodis_dev/results"
  },
  "output": {
    "format": "ner_tok",
    "framework": "sdrt"
  },
  "steps": {
    "main": "test",
    "pre-processing": {
      "tokenization": false,
      "sentence_split": true,
      "sentence_split_splitor": "stanza",
      "syntactic_parsing": false,
      "NER_format_initialisation": true
    },
    "discourse_segmenter": {
      "model": "tony"
    },
    "post-processing": {
      "json_to_tab": true,
      "tab_to_bracket": false
    },
    "evaluation": true
  }
}

{
  "usecase_description": "Config file for usecase_2.2: Take an EDU gold segmented text in CoNLL format as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. To start, we evaluate tony on the annodis dev set.",
  "input": {
    "name": "fra.sdrt.annodis_dev",
    "file": ".conllu",
    "file_options": [".conllu", ".tok"],
    "folder_path": "../data/fra.sdrt.annodis_dev",
    "format": "truc",
    "language": "fr",
    "gold": true,
    "results_path": "../data/fra.sdrt.annodis_dev/results"
  },
  "output": {
    "format": "ner_tok",
    "framework": "sdrt"
  },
  "steps": {
    "main": "test",
    "pre-processing": {
      "tokenization": false,
      "sentence_split": false,
      "sentence_split_splitor": "stanza",
      "syntactic_parsing": false,
      "NER_format_initialisation": true
    },
    "discourse_segmenter": {
      "model": "tony"
    },
    "post-processing": {
      "json_to_tab": true,
      "tab_to_bracket": false
    },
    "evaluation": true
  }
}
@@ -12,10 +12,15 @@ import pandas as pd  # for future clean output in df
import json
from classes_def import Input, Process
import utils
import utils.fr_tokenize as tk
import utils.conv2ner as c2n
import utils.json2conll as j2c
import utils.conll2bracket as c2bracket
import utils.sent_split as ssent
#import utils.ssplit.parse_corpus as ssent
#import utils.ssplit.parse_stanza as ssent


# function to get config stuff
@@ -53,21 +58,33 @@ def main(config):
    #print([x for x in enumerate(steps)])
    # following the ordered list of steps, do the things (for now: simple usecase1):
    # FN: either sentence splitting is needed, or tokenization, or neither
    if steps.ssplit == True:  # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
        #### Split text into sentences: not in usecase1
        data_in = "{}/{}{}".format(steps.data.path, steps.data.name, steps.data.file)
        data_tok = "{}/{}.tok".format(steps.data.path, steps.data.name)
        print("Starting sentence splitting...to {}".format(data_tok))
        ssent.main(data_in, data_tok, "stanza", steps.data.lang)
    elif steps.toke == True:
        #### Tokenization of the text  # python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
        data_in = "{}/{}{}".format(steps.data.path, steps.data.name, steps.data.file)
        data_tok = "{}/{}.tok".format(steps.data.path, steps.data.name)
        # sys.exit("check path")
        print("Starting Tokenization...to {}".format(data_tok))
        tk.main(data_in, data_tok)  # .ss -> .tok
    else:
        data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"

    #### Conversion to NER-style format  # python $RUNTIME/conv2ner.py ${RAW}.tok > ${RAW}.ner.tok
    data_ner = "{}/{}.ner.tok".format(steps.data.path, steps.data.name)
    print("Starting conversion to NER format...to {}".format(data_ner))
    c2n.main(data_tok, data_ner, steps.data.file)

    #### Apply the chosen model, output the JSON with the predictions: score, proba, tags
@@ -77,24 +94,39 @@
    data_json = "{}/{}.json".format(steps.data.resu, steps.data.name)
    cmd = "allennlp predict --use-dataset-reader --output-file {} {} {} &> {}/logs.txt".format(data_json, model_path, data_ner, steps.data.resu)
    if not os.path.isdir(steps.data.resu):
        print(" result directory does not exist")
        os.mkdir(steps.data.resu)
    print("Starting Prediction...")
    os.system(cmd)

    #### ------------------------------- TBD do the same but with python script (or JIANT ??)

    if steps.post_tab == True:
        #### Apply the predictions to the text and output the tokenized text with the predicted-tags column  # python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok
        data_conll = "{}/{}.split.tok".format(steps.data.resu, steps.data.name)
        format = "split.tok"  # to retrieve from the config file !!!
        print("Starting formatting from json to tok format...to {}".format(data_conll))
        j2c.main(data_json, format, data_conll)

        ####### EVALUATION AGAINST GOLD
        # python discut/code/utils/seg_eval.py data_gold data_pred (-s)
        data_gold = data_tok
        data_pred = data_conll
        cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt"
        os.system(cmd)

    if steps.post_bracket == True:
        #### Take the tokenized text + predicted tags and output plain text (the initial format, for now as a sequence of sentences) with brackets  # python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok > ${RESULT_DIR}/${FILE}.split.tok.bracket
        data_bracket = "{}/{}.split.tok.bracket".format(steps.data.resu, steps.data.name)
        print("Starting formatting into bracket text...to {}".format(data_bracket))
        c2bracket.main(data_conll, data_bracket)
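As a reading aid, a sketch (assumptions: the usecase-2 annodis config above, and that the sentence-split or tokenization branch produced a `.tok` file) of the chain of files this `main()` writes; the paths simply instantiate the format strings in the code:

```
# Illustrative only: file chain produced by main() for name = "fra.sdrt.annodis_dev".
name = "fra.sdrt.annodis_dev"
path = "../data/" + name          # folder_path in the config
resu = path + "/results"          # results_path in the config

pipeline_outputs = [
    f"{path}/{name}.tok",                # sentence-split / tokenized input
    f"{path}/{name}.ner.tok",            # NER-style file fed to allennlp
    f"{resu}/{name}.json",               # allennlp predictions (plus logs.txt)
    f"{resu}/{name}.split.tok",          # predictions mapped back to a tok table (json_to_tab)
    f"{resu}/Evaluation.txt",            # seg_eval.py scores against the gold file
    f"{resu}/{name}.split.tok.bracket",  # bracketed text (only if tab_to_bracket is true)
]
```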
@@ -120,8 +120,9 @@ def conversion2ner(input, output, params=None):
def main(f_in, f_out, f):
    input = f_in
    output = f_out
    input_format = f
    #param = get_param()
    conversion2ner(input, output)  # add param
import io, os, sys, argparse
"""
Script to evaluate segmentation f-score and perfect discourse unit segmentation proportion from two files. Two input formats are permitted:
* One token per line, with ten columns, no sentence breaks (default *.tok format) - segmentation indicated in column 10
* The same, but with blank lines between sentences (*.conll format)
Token columns follow the CoNLL-U format, with token IDs in the first column and pipe separated key=value pairs in the last column.
Document boundaries are indicated by a comment: # newdoc id = ...
The evaluation uses micro-averaged F-Scores per corpus (not document macro average).
Example:
```
# newdoc id = GUM_bio_byron
1 Education _ _ _ _ _ _ _ BeginSeg=Yes
2 and _ _ _ _ _ _ _ _
3 early _ _ _ _ _ _ _ _
4 loves _ _ _ _ _ _ _ _
5 Byron _ _ _ _ _ _ _ BeginSeg=Yes
6 received _ _ _ _ _ _ _ _
...
```
Or:
```
# newdoc id = GUM_bio_byron
# sent_id = GUM_bio_byron-1
# text = Education and early loves
1 Education education NOUN NN Number=Sing 0 root _ BeginSeg=Yes
2 and and CCONJ CC _ 4 cc _ _
3 early early ADJ JJ Degree=Pos 4 amod _ _
4 loves love NOUN NNS Number=Plur 1 conj _ _
# sent_id = GUM_bio_byron-2
# text = Byron received his early formal education at Aberdeen Grammar School, and in August 1799 entered the school of Dr. William Glennie, in Dulwich. [17]
1 Byron Byron PROPN NNP Number=Sing 2 nsubj _ BeginSeg=Yes
2 received receive VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _
```
For PDTB-style corpora, we calculate exact span-wise f-scores for BIO encoding, without partial credit. In other words,
predicting an incorrect span with partial overlap is the same as missing a gold span and predicting an incorrect span
somewhere else in the corpus. Note also that spans must begin with B-Conn - predicted spans beginning with I-Conn are ignored.
The file format for PDTB style corpora is similar, but with different labels:
```
1 Fidelity Fidelity PROPN NNP _ 6 nsubj _ _
2 , , PUNCT , _ 6 punct _ _
3 for for ADP IN _ 4 case _ Seg=B-Conn
4 example example NOUN NN _ 6 obl _ Seg=I-Conn
5 , , PUNCT , _ 6 punct _ _
6 prepared prepare VERB VBN _ 0 root _ _
7 ads ad NOUN NNS _ 6 obj _ _
...
```
Arguments:
* goldfile: shared task gold test data
* predfile: same format, with predicted segment positions in column 10 - note **number of tokens must match**
* string_input: if specified, files are replaced by strings with file contents instead of file names
"""
__author__ = "Amir Zeldes"
__license__ = "Apache 2.0"
__version__ = "1.0.1"
def parse_data(infile, string_input=False):
if not string_input:
data = io.open(infile, encoding="utf8").read().strip().replace("\r", "")
else:
data = infile.strip()
tokens = []
labels = []
spans = []
counter = 0
span_start = -1
span_end = -1
for line in data.split("\n"):
if "\t" in line: # Token
fields = line.split("\t")
if "-" in fields[0]:
continue
label = fields[-1]
# Ensure correct labeling even if other pipe-delimited annotations found in column 10
if "BeginSeg=Yes" in label:
label = "BeginSeg=Yes"
elif "Seg=B-Conn" in label:
label ="Seg=B-Conn"
span_start = counter
elif "Seg=I-Conn" in label:
label = "Seg=I-Conn"
span_end = counter
else:
label = "_"
if span_start > -1: # Add span
if span_end == -1:
span_end = span_start
spans.append((span_start,span_end))
span_start = -1
span_end = -1
tokens.append(fields[1])
labels.append(label)
counter +=1
if span_start > -1 and span_end > -1: # Add last span
spans.append((span_start,span_end))
return tokens, labels, spans
def get_scores(gold_file, pred_file, string_input=False):
"""
:param gold_file: Gold shared task file
:param pred_file: File with predictions
:param string_input: If True, files are replaced by strings with file contents (for import inside other scripts)
:return: dictionary of scores for printing
"""
report = ""
gold_tokens, gold_labels, gold_spans = parse_data(gold_file, string_input)
pred_tokens, pred_labels, pred_spans = parse_data(pred_file, string_input)
if os.path.isfile(gold_file):
doc_name = os.path.basename(gold_file)
else:
# Use first few tokens to identify file
doc_name = " ".join(gold_tokens[0:10]) + "..."
# Check same number of tokens in both files
if len(gold_tokens) != len(pred_tokens):
report += "\nFATAL: different number of tokens detected in gold and pred:\n"
report += " o In " + doc_name + ": " + str(len(gold_tokens)) + " gold tokens but " + str(len(pred_tokens)) + " predicted tokens\n\n"
sys.stderr.write(report)
sys.exit(0)
# Check tokens are identical
for i, tok in enumerate(gold_tokens):
if tok != pred_tokens[i]:
report += "\nWARN: token strings do not match in gold and pred:\n"
report += " o First instance in " + doc_name + " token " + str(i) + "\n"
report += "Gold: " + tok + " but Pred: " + pred_tokens[i] + "\n\n"
sys.stderr.write(report)
break
# Check if this is EDU or Conn-style data
if "BeginSeg=Yes" in gold_labels:
mode = "edu"
seg_type = "EDUs"
else:
mode = "conn"
seg_type = "conn spans"
true_positive = 0
false_positive = 0
false_negative = 0
if mode == "edu":
for i, gold_label in enumerate(gold_labels):
pred_label = pred_labels[i]
if gold_label == pred_label:
if gold_label == "_":
continue
else:
true_positive += 1
else:
if pred_label == "_":
false_negative += 1
else:
if gold_label == "_":
false_positive += 1
else: # I-Conn/B-Conn mismatch
false_positive +=1
else:
for span in gold_spans:
if span in pred_spans:
true_positive +=1
else:
false_negative +=1
for span in pred_spans:
if span not in gold_spans:
false_positive += 1
try:
precision = true_positive / (float(true_positive) + false_positive)
except Exception as e:
precision = 0
try:
recall = true_positive / (float(true_positive) + false_negative)
except Exception as e:
recall = 0
try:
f_score = 2 * (precision * recall) / (precision + recall)
except:
f_score = 0
score_dict = {}
score_dict["doc_name"] = doc_name
score_dict["tok_count"] = len(gold_tokens)
score_dict["seg_type"] = seg_type
score_dict["gold_seg_count"] = true_positive+false_negative
score_dict["pred_seg_count"] = true_positive+false_positive
score_dict["prec"] = precision
score_dict["rec"] = recall
score_dict["f_score"] = f_score
return score_dict
if __name__ == "__main__":
p = argparse.ArgumentParser()
p.add_argument("goldfile",help="Shared task gold file in .tok or .conll format")
p.add_argument("predfile",help="Corresponding file with system predictions")
p.add_argument("-s","--string_input",action="store_true",help="Whether inputs are file names or strings")
opts = p.parse_args()
score_dict = get_scores(opts.goldfile,opts.predfile,opts.string_input)
print("File: " + score_dict["doc_name"])
print("o Total tokens: " + str(score_dict["tok_count"]))
print("o Gold " +score_dict["seg_type"]+": " + str(score_dict["gold_seg_count"]))
print("o Predicted "+score_dict["seg_type"]+": " + str(score_dict["pred_seg_count"]))
print("o Precision: " + str(score_dict["prec"]))
print("o Recall: " + str(score_dict["rec"]))
print("o F-Score: " + str(score_dict["f_score"]))
import spacy
import stanza
import numpy as np
from tqdm import tqdm
from stanza.utils.conll import CoNLL
def spliter_stanza(f_in, fi_out, lang, treebank=None):
# get language model
if treebank is not None:
stanza.download(lang, package=treebank)
else:
stanza.download(lang)
processors = 'tokenize'
nlp = stanza.Pipeline(lang, processors=processors, use_gpu=True)
# for each doc, get the list of tokens and labels
tok_tok_lbls = [(doc_id, doc_toks, doc_lbls) for doc_id, doc_toks, doc_lbls in tok_tokens_labels(f_in)]
# for each doc, get the character offset of tokens
with open(f_in, encoding='utf-8') as f_tok:
tok_str = f_tok.read()
tok_tok_begs = [(doc_id, doc_chars, tok_begs) for doc_id, doc_chars, tok_begs, _ in begin_toks_sents(tok_str)]
with open(fi_out, mode='w', encoding='utf-8') as f_out:
# parse each doc in turn
for (doc_id, doc_toks, doc_lbls), (_, doc_chars, tok_begs) in tqdm(zip(tok_tok_lbls, tok_tok_begs), total=min(len(tok_tok_lbls), len(tok_tok_begs))):
doc_text = rebuild_text(doc_toks, lang=lang)
# print(doc_text)
ann = nlp(doc_text)
conll_str = CoNLL.conll_as_string(CoNLL.convert_dict(ann.to_dict()))
conll_tok_begs = list(begin_toks_sents(conll_str, True))
# we parse one doc at a time
assert len(conll_tok_begs) == 1
_, p_doc_chars, p_tok_begs, p_sent_begs = conll_tok_begs[0]
try:
assert p_doc_chars == doc_chars
except AssertionError:
for i, (pdc, dc) in enumerate(zip(p_doc_chars, doc_chars)):
if pdc != dc:
print(f_in, i, p_doc_chars[i - 10:i + 10], doc_chars[i - 10:i + 10])
raise
# for each beginning of sentence (in the parser output), find the corresponding token index in the original .tok
sent_beg_idc = np.searchsorted(tok_begs, p_sent_begs, side='left')
sent_beg_idc = set(sent_beg_idc)
# output CONLL-U file
f_out.write('# newdoc id = ' + doc_id + '\n')
#print('# newdoc id = ' + doc_id, file=f_out)
tok_sent_idx = 1
for tok_doc_idx, (tok, lbl) in enumerate(zip(doc_toks, doc_lbls), start=0):
if tok_doc_idx in sent_beg_idc:
if tok_doc_idx > 0:
# add an empty line after the previous sentence (not for the first token in doc)
f_out.write('\n')
#print('', file=f_out)
tok_sent_idx = 1
else:
tok_sent_idx += 1
row = (str(tok_sent_idx), tok, '_', '_', '_', '_', '_', '_', '_', lbl)
f_out.write('\t'.join(row)+'\n')
#print('\t'.join(row).encode('utf-8'), file=f_out)
f_out.write('\n')
#print('', file=f_out)
def spliter_spacy(f_in, f_out, lg):
lm = f"{lg}_core_web_sm"
nlp = spacy.load(lm)
#nlp = spacy.load("en_core_web_sm")
""" doc = nlp(text)
for sent in doc.sents:
print(sent)
"""
def tok_tokens_labels(tok_filename):
"""Retrieve the list of tokens and (target) labels for each doc in a .ttok file.
Parameters
----------
tok_filename : str
Filename of the .ttok file
Yields
------
doc_toks : List[str]
List of tokens in the document.
doc_lbls : List[str]
List of labels in the document (same length as doc_toks).
"""
with open(tok_filename, encoding='utf-8') as f:
doc_id = None
doc_toks = []
doc_lbls = []
for line in f:
if line.startswith('# newdoc'):
if doc_toks:
yield (doc_id, doc_toks, doc_lbls)
doc_id = line.split('id = ')[1].strip()
doc_toks = []
doc_lbls = []
elif line.strip() == '':
continue
else:
fields = line.strip().split('\t')
tok = fields[1]
lbl = fields[9]
doc_toks.append(tok)
doc_lbls.append(lbl)
else:
# yield last doc
yield (doc_id, doc_toks, doc_lbls)
def begin_toks_sents(conll_str, stanza=False):
"""Get beginning positions of tokens and sentences as offsets on the non-whitespace characters of a document text.
Parameters
----------
conll_str : str
CONLL-U string for the file.
Yields
------
doc_id : str
Document id.
doc_chars : str
Document text excluding whitespaces.
tok_begs : List[int]
Beginning position of each token in the doc.
Correspond to indices in doc_chars.
sent_begs : List[int]
Beginning position of each sentence in the doc.
Correspond to indices in doc_chars.
"""
doc_id = None
doc_chars = ''
tok_begs = []
sent_begs = []
in_sent = False
cur_idx = 0 # current (non-whitespace) character index
for line in conll_str.split('\n'):
if line.startswith('# newdoc id = '):
if sent_begs:
# yield previous doc
yield (doc_id, doc_chars, tok_begs, sent_begs)
# reset for a new doc
doc_id = line.split('# newdoc id = ')[1]
doc_chars = ''
tok_begs = []
sent_begs = []
in_sent = False
cur_idx = 0
elif line.startswith('#'):
continue
elif line == '':
# an empty line marks doc or sentence split
in_sent = False
else:
fields = line.split('\t')
assert len(fields) == 10
if stanza and not fields[9].startswith('start'):
continue
# token line
tok_begs.append(cur_idx)
if not in_sent:
# first token in sentence
sent_begs.append(cur_idx)
in_sent = True
# delete whitespaces internal to the token
tok_chars = fields[1].replace(' ', '').replace('\xa0', '')
cur_idx += len(tok_chars)
doc_chars += tok_chars
else:
# yield last document
if sent_begs:
yield (doc_id, doc_chars, tok_begs, sent_begs)
def rebuild_text(doc_toks, lang=None):
"""Rebuild the underlying text from a list of tokens.
We don't assume any additional information.
In particular, the "SpaceAfter=No" provided in some CONLL-U files is ignored.
Parameters
----------
doc_toks : List[str]
List of tokens in the document.
lang : str
Language ; If None, the language is assumed to be one where tokens are
separated with whitespaces. Currently the only interesting value is "zh"
with no whitespace.
"""
if lang == "zh":
return ''.join(doc_toks)
# default: insert whitespaces between tokens then remove extraneous ones ;
# this heuristic is crude but a reasonable default
doc_text = ' '.join(doc_toks)
doc_text = (doc_text.replace(' : " ', ': "')
.replace(' ,', ',').replace(' .', '.').replace(' !', '!').replace(' ?', '?').replace(' :', ':')
.replace('', '').replace('', '')
.replace(' ;', ';')
.replace('', '')
.replace('( ', '(').replace(' )', ')')
.replace('[ ', '[').replace(' ]', ']')
)
return doc_text
def main(f_in, f_out, tool, lg):
    if tool == "spacy":
        spliter_spacy(f_in, f_out, lg)
    elif tool == "stanza":
        spliter_stanza(f_in, f_out, lg, treebank=None)
    else:
        print("please define the sentence splitter tool: spacy or stanza")
    #return output
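A minimal sketch of how this splitter is called, mirroring the call in discut22.py (`ssent.main(data_in, data_tok, "stanza", steps.data.lang)`); the annodis paths are taken from the usecase-2 config and are only illustrative:

```
# Illustrative only: split the annodis gold file into sentences with stanza.
import utils.sent_split as ssent  # as imported in discut22.py (run from code/)

data_in = "../data/fra.sdrt.annodis_dev/fra.sdrt.annodis_dev.ttok"
data_tok = "../data/fra.sdrt.annodis_dev/fra.sdrt.annodis_dev.tok"

# Downloads the stanza French model on first use, then writes a CoNLL-U style
# .tok file with one token per line and blank lines at sentence boundaries.
ssent.main(data_in, data_tok, "stanza", "fr")
```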