Commit d6975588 authored by larivier

Merge branch 'dev-allen' into 'main'

updates and add usecase 2

See merge request !1
parents 2fcdded4 31228c51
# Project DisCut22 *still WIP*: Discourse Annotation Tool

A tool for Discourse Annotation. Inheritor of ToNy and DisCut, segmenters for DISRPT 2019 and 2021. The goal of this version is to be easy to use with or without IT knowledge.
__2021__
*[Multi-lingual Discourse Segmentation and Connective Identification: MELODI at Disrpt2021](https://aclanthology.org/2021.disrpt-1.3.pdf)*
@@ -13,17 +13,22 @@ Code: https://gitlab.inria.fr/andiamo/tony
# Usage
## Usecases
- **Discourse Segmentation:** Take raw text as input, use a loaded model to make predictions, and output the same text with EDU segmentation.
- **Segmentation Evaluation:** Take EDU gold-segmented text as input, use a loaded model to make predictions, and output the scores of the model predictions against the gold segmentation, along with the discrepancies.
## Content description
[TBD: explain the directories automatically created during script runs]
- `data/MyProjet/` Contains input data, raw and/or pre-processed format(s).
- `results/` Contains output data, scores and post-processed data (also the allennlp logs).
- `code/` Contains main scripts.
- `discut22_1.py` One Python script to run them all.
- `config_XX.json` A file to be completed (or a dir with a choice between simple use_case configs and a template for a custom config).
- `utils/` Contains useful scripts to be called.
- `model/` Contains the model to be loaded or created.
- `documentation.md` Contains detailed documentation (TBD?).
## Set up environment
- Conda setup for Python 3.7 (TBD?)
@@ -40,10 +45,12 @@ pip install -r <dir?>requirements.txt
(run from the repository root)
Run this command:
```
python code/discut22.py --config code/config_1.json
```
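The same entry point covers the evaluation usecase (usecase 2): point `--config` at a usecase-2 config instead. A minimal sketch using Python's `subprocess`, where the file name `code/config_2.json` is only an assumed name for one of the usecase-2 configs shown further below:

```
# Minimal sketch (assumption: a usecase-2 config saved as code/config_2.json;
# the actual file name may differ). Run from the repository root.
import subprocess

subprocess.run(
    ["python", "code/discut22.py", "--config", "code/config_2.json"],
    check=True,  # fail loudly if the pipeline exits with an error
)
# Scores and discrepancies are then written to <results_path>/Evaluation.txt,
# as configured in the "input" section of the config file.
```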
## Authors and acknowledgment
Morteza Ezzabady
Amir Zeldes
@@ -5,17 +5,21 @@
class Input:
    def __init__(self, infos):
        self.name = infos['name']
        self.lang = infos['language']
        self.path = infos['folder_path']  # misused
        self.file = infos['file']
        self.form = infos['format']  # not used
        self.gold = infos['gold']  # not used
        self.resu = infos['results_path']  # misused: should be created automatically

class Process:
    def __init__(self, infos, data):
        self.main = infos["main"]
        self.toke = infos['pre-processing']['tokenization']  # not used
        self.ssplit = infos['pre-processing']['sentence_split']
        self.ssplitor = infos['pre-processing']['sentence_split_splitor']
        self.data = data
        self.model = infos['discourse_segmenter']['model']  # ezpz for Tony
        self.post_tab = infos['post-processing']['json_to_tab']
        self.post_bracket = infos['post-processing']['tab_to_bracket']
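A minimal sketch (not part of this commit) of how these classes are fed from a config file; `config_2.json` is an assumed file name for the usecase-2 config shown further down, whose keys match the lookups above:

```
# Illustrative only: build the Input and Process objects from a usecase-2 style config.
import json

from classes_def import Input, Process

with open("config_2.json", encoding="utf-8") as f:  # assumed file name
    config = json.load(f)

data = Input(config["input"])            # name, language, folder_path, file, format, gold, results_path
steps = Process(config["steps"], data)   # main, pre-/post-processing flags, discourse_segmenter model

print(steps.model)  # e.g. "tony"
print(data.resu)    # e.g. "../data/fra.sdrt.annodis_dev/results"
```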
@@ -14,9 +14,10 @@
"framework": "sdrt"
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": true,
"sentence_split": true,
"sentence_split": false,
"syntactic_parsing": false,
"NER_format_initialisation": true
},
{
  "usecase_description": "Config file for usecase_2: Take an EDU gold segmented text in .tok format as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. To start, we evaluate tony on the annodis dev set.",
  "input": {
    "name": "fra.sdrt.annodis_dev",
    "file": ".ttok",
    "folder_path": "../data/fra.sdrt.annodis_dev",
    "format": "truc",
    "language": "fr",
    "gold": true,
    "results_path": "../data/fra.sdrt.annodis_dev/results"
  },
  "output": {
    "format": "ner_tok",
    "framework": "sdrt"
  },
  "steps": {
    "main": "test",
    "pre-processing": {
      "tokenization": false,
      "sentence_split": true,
      "sentence_split_splitor": "stanza",
      "syntactic_parsing": false,
      "NER_format_initialisation": true
    },
    "discourse_segmenter": {
      "model": "tony"
    },
    "post-processing": {
      "json_to_tab": true,
      "tab_to_bracket": false
    },
    "evaluation": true
  }
}

{
  "usecase_description": "Config file for usecase_2.2: Take an EDU gold segmented text in CoNLL format as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. To start, we evaluate tony on the annodis dev set.",
  "input": {
    "name": "fra.sdrt.annodis_dev",
    "file": ".conllu",
    "file_options": [".conllu", ".tok"],
    "folder_path": "../data/fra.sdrt.annodis_dev",
    "format": "truc",
    "language": "fr",
    "gold": true,
    "results_path": "../data/fra.sdrt.annodis_dev/results"
  },
  "output": {
    "format": "ner_tok",
    "framework": "sdrt"
  },
  "steps": {
    "main": "test",
    "pre-processing": {
      "tokenization": false,
      "sentence_split": false,
      "sentence_split_splitor": "stanza",
      "syntactic_parsing": false,
      "NER_format_initialisation": true
    },
    "discourse_segmenter": {
      "model": "tony"
    },
    "post-processing": {
      "json_to_tab": true,
      "tab_to_bracket": false
    },
    "evaluation": true
  }
}
@@ -12,10 +12,15 @@ import pandas as pd  # for future clean output in df
import json
from classes_def import Input, Process
import utils
import utils.fr_tokenize as tk
import utils.conv2ner as c2n
import utils.json2conll as j2c
import utils.conll2bracket as c2bracket
import utils.sent_split as ssent
#import utils.ssplit.parse_corpus as ssent
#import utils.ssplit.parse_stanza as ssent


# function to get config stuff
@@ -53,21 +58,33 @@ def main(config):
    #print([x for x in enumerate(steps)])
    # following the ordered list of steps, do the things (for now: simple usecase1):
    # FN: either sentence splitting is needed, or tokenization, or neither
    if steps.ssplit == True:  # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
        #### Split text into sentences: not in usecase1
        data_in = "{}/{}{}".format(steps.data.path, steps.data.name, steps.data.file)
        data_tok = "{}/{}.tok".format(steps.data.path, steps.data.name)
        print("Starting sentence splitting...to {}".format(data_tok))
        ssent.main(data_in, data_tok, "stanza", steps.data.lang)
    elif steps.toke == True:
        #### Tokenization of the text  # python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
        data_in = "{}/{}{}".format(steps.data.path, steps.data.name, steps.data.file)
        data_tok = "{}/{}.tok".format(steps.data.path, steps.data.name)
        # sys.exit("check path")
        print("Starting Tokenization...to {}".format(data_tok))
        tk.main(data_in, data_tok)  # .ss -> .tok
    else:
        data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"

    #### Conversion to NER-style format  # python $RUNTIME/conv2ner.py ${RAW}.tok > ${RAW}.ner.tok
    data_ner = "{}/{}.ner.tok".format(steps.data.path, steps.data.name)
    print("Starting conversion to NER format...to {}".format(data_ner))
    c2n.main(data_tok, data_ner, steps.data.file)

    #### Apply the chosen model, output the JSON with the predictions: score, proba, tags
@@ -77,24 +94,39 @@
    data_json = "{}/{}.json".format(steps.data.resu, steps.data.name)
    cmd = "allennlp predict --use-dataset-reader --output-file {} {} {} &> {}/logs.txt".format(data_json, model_path, data_ner, steps.data.resu)
    if not os.path.isdir(steps.data.resu):
        print(" result directory does not exist")
        os.mkdir(steps.data.resu)
    print("Starting Prediction...")
    os.system(cmd)

    #### ------------------------------- TBD do the same but with python script (or JIANT ??)

    if steps.post_tab == True:
        #### Apply the predictions to the text and output the tokenized text with the predicted-tags column  # python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok
        data_conll = "{}/{}.split.tok".format(steps.data.resu, steps.data.name)
        format = "split.tok"  # to retrieve from the config file !!!
        print("Starting formatting from json to tok format...to {}".format(data_conll))
        j2c.main(data_json, format, data_conll)

        ####### EVALUATION AGAINST GOLD
        # python discut/code/utils/seg_eval.py data_gold data_pred (-s)
        data_gold = data_tok
        data_pred = data_conll
        cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt"
        os.system(cmd)

    if steps.post_bracket == True:
        #### Take the tokenized text + predicted tags and output plain text (the initial format, for now as a sequence of sentences) with brackets  # python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok > ${RESULT_DIR}/${FILE}.split.tok.bracket
        data_bracket = "{}/{}.split.tok.bracket".format(steps.data.resu, steps.data.name)
        print("Starting formatting into bracket text...to {}".format(data_bracket))
        c2bracket.main(data_conll, data_bracket)
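As a reading aid, a sketch (assumptions: the usecase-2 annodis config above, and that the sentence-split or tokenization branch produced a `.tok` file) of the chain of files this `main()` writes; the paths simply instantiate the format strings in the code:

```
# Illustrative only: file chain produced by main() for name = "fra.sdrt.annodis_dev".
name = "fra.sdrt.annodis_dev"
path = "../data/" + name          # folder_path in the config
resu = path + "/results"          # results_path in the config

pipeline_outputs = [
    f"{path}/{name}.tok",                # sentence-split / tokenized input
    f"{path}/{name}.ner.tok",            # NER-style file fed to allennlp
    f"{resu}/{name}.json",               # allennlp predictions (plus logs.txt)
    f"{resu}/{name}.split.tok",          # predictions mapped back to a tok table (json_to_tab)
    f"{resu}/Evaluation.txt",            # seg_eval.py scores against the gold file
    f"{resu}/{name}.split.tok.bracket",  # bracketed text (only if tab_to_bracket is true)
]
```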
@@ -120,8 +120,9 @@ def conversion2ner(input, output, params=None):
def main(f_in, f_out, f):
    input = f_in
    output = f_out
    input_format = f
    #param = get_param()
    conversion2ner(input, output)  # add param
import io, os, sys, argparse
"""
Script to evaluate segmentation f-score and perfect discourse unit segmentation proportion from two files. Two input formats are permitted:
* One token per line, with ten columns, no sentence breaks (default *.tok format) - segmentation indicated in column 10
* The same, but with blank lines between sentences (*.conll format)
Token columns follow the CoNLL-U format, with token IDs in the first column and pipe separated key=value pairs in the last column.
Document boundaries are indicated by a comment: # newdoc id = ...
The evaluation uses micro-averaged F-Scores per corpus (not document macro average).
Example:
```
# newdoc id = GUM_bio_byron
1 Education _ _ _ _ _ _ _ BeginSeg=Yes
2 and _ _ _ _ _ _ _ _
3 early _ _ _ _ _ _ _ _
4 loves _ _ _ _ _ _ _ _
5 Byron _ _ _ _ _ _ _ BeginSeg=Yes
6 received _ _ _ _ _ _ _ _
...
```
Or:
```
# newdoc id = GUM_bio_byron
# sent_id = GUM_bio_byron-1
# text = Education and early loves
1 Education education NOUN NN Number=Sing 0 root _ BeginSeg=Yes
2 and and CCONJ CC _ 4 cc _ _
3 early early ADJ JJ Degree=Pos 4 amod _ _
4 loves love NOUN NNS Number=Plur 1 conj _ _
# sent_id = GUM_bio_byron-2
# text = Byron received his early formal education at Aberdeen Grammar School, and in August 1799 entered the school of Dr. William Glennie, in Dulwich. [17]
1 Byron Byron PROPN NNP Number=Sing 2 nsubj _ BeginSeg=Yes
2 received receive VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _
```
For PDTB-style corpora, we calculate exact span-wise f-scores for BIO encoding, without partial credit. In other words,
predicting an incorrect span with partial overlap is the same as missing a gold span and predicting an incorrect span
somewhere else in the corpus. Note also that spans must begin with B-Conn - predicted spans beginning with I-Conn are ignored.
The file format for PDTB style corpora is similar, but with different labels:
```
1 Fidelity Fidelity PROPN NNP _ 6 nsubj _ _
2 , , PUNCT , _ 6 punct _ _
3 for for ADP IN _ 4 case _ Seg=B-Conn
4 example example NOUN NN _ 6 obl _ Seg=I-Conn
5 , , PUNCT , _ 6 punct _ _
6 prepared prepare VERB VBN _ 0 root _ _
7 ads ad NOUN NNS _ 6 obj _ _
...
```
Arguments:
* goldfile: shared task gold test data
* predfile: same format, with predicted segment positions in column 10 - note **number of tokens must match**
* string_input: if specified, files are replaced by strings with file contents instead of file names
"""
__author__ = "Amir Zeldes"
__license__ = "Apache 2.0"
__version__ = "1.0.1"
def parse_data(infile, string_input=False):
if not string_input:
data = io.open(infile, encoding="utf8").read().strip().replace("\r", "")
else:
data = infile.strip()
tokens = []
labels = []
spans = []
counter = 0
span_start = -1
span_end = -1
for line in data.split("\n"):
if "\t" in line: # Token
fields = line.split("\t")
if "-" in fields[0]:
continue
label = fields[-1]
# Ensure correct labeling even if other pipe-delimited annotations found in column 10
if "BeginSeg=Yes" in label:
label = "BeginSeg=Yes"
elif "Seg=B-Conn" in label:
label ="Seg=B-Conn"
span_start = counter
elif "Seg=I-Conn" in label:
label = "Seg=I-Conn"
span_end = counter
else:
label = "_"
if span_start > -1: # Add span
if span_end == -1:
span_end = span_start
spans.append((span_start,span_end))
span_start = -1
span_end = -1
tokens.append(fields[1])
labels.append(label)
counter +=1
if span_start > -1 and span_end > -1: # Add last span
spans.append((span_start,span_end))
return tokens, labels, spans
def get_scores(gold_file, pred_file, string_input=False):
"""
:param gold_file: Gold shared task file
:param pred_file: File with predictions
:param string_input: If True, files are replaced by strings with file contents (for import inside other scripts)
:return: dictionary of scores for printing
"""
report = ""
gold_tokens, gold_labels, gold_spans = parse_data(gold_file, string_input)
pred_tokens, pred_labels, pred_spans = parse_data(pred_file, string_input)
if os.path.isfile(gold_file):
doc_name = os.path.basename(gold_file)
else:
# Use first few tokens to identify file
doc_name = " ".join(gold_tokens[0:10]) + "..."
# Check same number of tokens in both files
if len(gold_tokens) != len(pred_tokens):
report += "\nFATAL: different number of tokens detected in gold and pred:\n"
report += " o In " + doc_name + ": " + str(len(gold_tokens)) + " gold tokens but " + str(len(pred_tokens)) + " predicted tokens\n\n"
sys.stderr.write(report)
sys.exit(0)
# Check tokens are identical
for i, tok in enumerate(gold_tokens):
if tok != pred_tokens[i]:
report += "\nWARN: token strings do not match in gold and pred:\n"
report += " o First instance in " + doc_name + " token " + str(i) + "\n"
report += "Gold: " + tok + " but Pred: " + pred_tokens[i] + "\n\n"
sys.stderr.write(report)
break
# Check if this is EDU or Conn-style data
if "BeginSeg=Yes" in gold_labels:
mode = "edu"
seg_type = "EDUs"
else:
mode = "conn"
seg_type = "conn spans"
true_positive = 0
false_positive = 0
false_negative = 0
if mode == "edu":
for i, gold_label in enumerate(gold_labels):
pred_label = pred_labels[i]
if gold_label == pred_label:
if gold_label == "_":
continue
else:
true_positive += 1
else:
if pred_label == "_":
false_negative += 1
else:
if gold_label == "_":
false_positive += 1
else: # I-Conn/B-Conn mismatch
false_positive +=1
else:
for span in gold_spans:
if span in pred_spans:
true_positive +=1
else:
false_negative +=1
for span in pred_spans:
if span not in gold_spans:
false_positive += 1
try:
precision = true_positive / (float(true_positive) + false_positive)
except Exception as e:
precision = 0
try:
recall = true_positive / (float(true_positive) + false_negative)
except Exception as e:
recall = 0
try:
f_score = 2 * (precision * recall) / (precision + recall)
except:
f_score = 0
score_dict = {}
score_dict["doc_name"] = doc_name
score_dict["tok_count"] = len(gold_tokens)
score_dict["seg_type"] = seg_type
score_dict["gold_seg_count"] = true_positive+false_negative
score_dict["pred_seg_count"] = true_positive+false_positive
score_dict["prec"] = precision
score_dict["rec"] = recall
score_dict["f_score"] = f_score
return score_dict
if __name__ == "__main__":
p = argparse.ArgumentParser()
p.add_argument("goldfile",help="Shared task gold file in .tok or .conll format")
p.add_argument("predfile",help="Corresponding file with system predictions")
p.add_argument("-s","--string_input",action="store_true",help="Whether inputs are file names or strings")
opts = p.parse_args()
score_dict = get_scores(opts.goldfile,opts.predfile,opts.string_input)
print("File: " + score_dict["doc_name"])
print("o Total tokens: " + str(score_dict["tok_count"]))
print("o Gold " +score_dict["seg_type"]+": " + str(score_dict["gold_seg_count"]))
print("o Predicted "+score_dict["seg_type"]+": " + str(score_dict["pred_seg_count"]))
print("o Precision: " + str(score_dict["prec"]))
print("o Recall: " + str(score_dict["rec"]))
print("o F-Score: " + str(score_dict["f_score"]))
import spacy
import stanza
import numpy as np
from tqdm import tqdm
from stanza.utils.conll import CoNLL
def spliter_stanza(f_in, fi_out, lang, treebank=None):
# get language model
if treebank is not None:
stanza.download(lang, package=treebank)
else:
stanza.download(lang)
processors = 'tokenize'
nlp = stanza.Pipeline(lang, processors=processors, use_gpu=True)
# for each doc, get the list of tokens and labels
tok_tok_lbls = [(doc_id, doc_toks, doc_lbls) for doc_id, doc_toks, doc_lbls in tok_tokens_labels(f_in)]
# for each doc, get the character offset of tokens
with open(f_in, encoding='utf-8') as f_tok:
tok_str = f_tok.read()
tok_tok_begs = [(doc_id, doc_chars, tok_begs) for doc_id, doc_chars, tok_begs, _ in begin_toks_sents(tok_str)]
with open(fi_out, mode='w', encoding='utf-8') as f_out:
# parse each doc in turn
for (doc_id, doc_toks, doc_lbls), (_, doc_chars, tok_begs) in tqdm(zip(tok_tok_lbls, tok_tok_begs), total=min(len(tok_tok_lbls), len(tok_tok_begs))):
doc_text = rebuild_text(doc_toks, lang=lang)
# print(doc_text)
ann = nlp(doc_text)
conll_str = CoNLL.conll_as_string(CoNLL.convert_dict(ann.to_dict()))
conll_tok_begs = list(begin_toks_sents(conll_str, True))
# we parse one doc at a time
assert len(conll_tok_begs) == 1
_, p_doc_chars, p_tok_begs, p_sent_begs = conll_tok_begs[0]
try:
assert p_doc_chars == doc_chars
except AssertionError:
for i, (pdc, dc) in enumerate(zip(p_doc_chars, doc_chars)):
if pdc != dc:
print(f_in, i, p_doc_chars[i - 10:i + 10], doc_chars[i - 10:i + 10])
raise
# for each beginning of sentence (in the parser output), find the corresponding token index in the original .tok
sent_beg_idc = np.searchsorted(tok_begs, p_sent_begs, side='left')
sent_beg_idc = set(sent_beg_idc)
# output CONLL-U file
f_out.write('# newdoc id = ' + doc_id + '\n')
#print('# newdoc id = ' + doc_id, file=f_out)
tok_sent_idx = 1
for tok_doc_idx, (tok, lbl) in enumerate(zip(doc_toks, doc_lbls), start=0):
if tok_doc_idx in sent_beg_idc:
if tok_doc_idx > 0:
# add an empty line after the previous sentence (not for the first token in doc)
f_out.write('\n')
#print('', file=f_out)
tok_sent_idx = 1
else:
tok_sent_idx += 1
row = (str(tok_sent_idx), tok, '_', '_', '_', '_', '_', '_', '_', lbl)
f_out.write('\t'.join(row)+'\n')
#print('\t'.join(row).encode('utf-8'), file=f_out)
f_out.write('\n')
#print('', file=f_out)
def spliter_spacy(f_in, f_out, lg):
lm = f"{lg}_core_web_sm"
nlp = spacy.load(lm)
#nlp = spacy.load("en_core_web_sm")
""" doc = nlp(text)
for sent in doc.sents:
print(sent)
"""
def tok_tokens_labels(tok_filename):
"""Retrieve the list of tokens and (target) labels for each doc in a .ttok file.
Parameters
----------
tok_filename : str
Filename of the .ttok file
Yields
------
doc_toks : List[str]
List of tokens in the document.
doc_lbls : List[str]
List of labels in the document (same length as doc_toks).
"""
with open(tok_filename, encoding='utf-8') as f:
doc_id = None
doc_toks = []
doc_lbls = []
for line in f:
if line.startswith('# newdoc'):
if doc_toks:
yield (doc_id, doc_toks, doc_lbls)
doc_id = line.split('id = ')[1].strip()
doc_toks = []
doc_lbls = []
elif line.strip() == '':
continue
else:
fields = line.strip().split('\t')
tok = fields[1]
lbl = fields[9]
doc_toks.append(tok)
doc_lbls.append(lbl)
else:
# yield last doc
yield (doc_id, doc_toks, doc_lbls)
def begin_toks_sents(conll_str, stanza=False):
"""Get beginning positions of tokens and sentences as offsets on the non-whitespace characters of a document text.
Parameters
----------
conll_str : str
CONLL-U string for the file.
Yields
------
doc_id : str
Document id.
doc_chars : str
Document text excluding whitespaces.
tok_begs : List[int]
Beginning position of each token in the doc.
Correspond to indices in doc_chars.
sent_begs : List[int]
Beginning position of each sentence in the doc.
Correspond to indices in doc_chars.
"""
doc_id = None
doc_chars = ''
tok_begs = []
sent_begs = []
in_sent = False
cur_idx = 0 # current (non-whitespace) character index
for line in conll_str.split('\n'):
if line.startswith('# newdoc id = '):
if sent_begs:
# yield previous doc
yield (doc_id, doc_chars, tok_begs, sent_begs)
# reset for a new doc
doc_id = line.split('# newdoc id = ')[1]
doc_chars = ''
tok_begs = []
sent_begs = []
in_sent = False
cur_idx = 0
elif line.startswith('#'):
continue
elif line == '':
# an empty line marks doc or sentence split
in_sent = False
else:
fields = line.split('\t')
assert len(fields) == 10
if stanza and not fields[9].startswith('start'):
continue
# token line
tok_begs.append(cur_idx)
if not in_sent:
# first token in sentence
sent_begs.append(cur_idx)
in_sent = True
# delete whitespaces internal to the token
tok_chars = fields[1].replace(' ', '').replace('\xa0', '')
cur_idx += len(tok_chars)
doc_chars += tok_chars
else:
# yield last document
if sent_begs:
yield (doc_id, doc_chars, tok_begs, sent_begs)
def rebuild_text(doc_toks, lang=None):
"""Rebuild the underlying text from a list of tokens.
We don't assume any additional information.
In particular, the "SpaceAfter=No" provided in some CONLL-U files is ignored.
Parameters
----------
doc_toks : List[str]
List of tokens in the document.
lang : str
Language ; If None, the language is assumed to be one where tokens are
separated with whitespaces. Currently the only interesting value is "zh"
with no whitespace.
"""
if lang == "zh":
return ''.join(doc_toks)
# default: insert whitespaces between tokens then remove extraneous ones ;
# this heuristic is crude but a reasonable default
doc_text = ' '.join(doc_toks)
doc_text = (doc_text.replace(' : " ', ': "')
.replace(' ,', ',').replace(' .', '.').replace(' !', '!').replace(' ?', '?').replace(' :', ':')
.replace('', '').replace('', '')
.replace(' ;', ';')
.replace('', '')
.replace('( ', '(').replace(' )', ')')
.replace('[ ', '[').replace(' ]', ']')
)
return doc_text
def main(f_in, f_out, tool, lg):
    if tool == "spacy":
        spliter_spacy(f_in, f_out, lg)
    elif tool == "stanza":
        spliter_stanza(f_in, f_out, lg, treebank=None)
    else:
        print("please define the sentence splitter tool: spacy or stanza")
    #return output
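A minimal sketch of how this splitter is called, mirroring the call in discut22.py (`ssent.main(data_in, data_tok, "stanza", steps.data.lang)`); the annodis paths are taken from the usecase-2 config and are only illustrative:

```
# Illustrative only: split the annodis gold file into sentences with stanza.
import utils.sent_split as ssent  # as imported in discut22.py (run from code/)

data_in = "../data/fra.sdrt.annodis_dev/fra.sdrt.annodis_dev.ttok"
data_tok = "../data/fra.sdrt.annodis_dev/fra.sdrt.annodis_dev.tok"

# Downloads the stanza French model on first use, then writes a CoNLL-U style
# .tok file with one token per line and blank lines at sentence boundaries.
ssent.main(data_in, data_tok, "stanza", "fr")
```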