Skip to content
Snippets Groups Projects
Commit cfc00e33 authored by laura.riviere's avatar laura.riviere
Browse files

refact config1

parent b43945e7
No related branches found
No related tags found
1 merge request!3Refacto 1205
...@@ -27,7 +27,6 @@ class Process: ...@@ -27,7 +27,6 @@ class Process:
self.meta_line = infos['pre-processing']['create_metadata']['line'] self.meta_line = infos['pre-processing']['create_metadata']['line']
self.meta_sent = infos['pre-processing']['create_metadata']['sent'] self.meta_sent = infos['pre-processing']['create_metadata']['sent']
#self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
#if self.main == "train": #if self.main == "train":
#if self.ner_init == True : # à faire en relatif !! split truc #if self.ner_init == True : # à faire en relatif !! split truc
...@@ -51,7 +50,6 @@ class Process: ...@@ -51,7 +50,6 @@ class Process:
class Output: class Output:
def __init__(self, infos): def __init__(self, infos):
self.prod_tab = infos['file']['json_to_tab']
self.prod_bracket = infos['file']['tab_to_bracket'] self.prod_bracket = infos['file']['tab_to_bracket']
self.prod_conll = infos['file']['conllu'] self.prod_conll = infos['file']['conllu']
self.metadata = infos['file']['metadata'] self.metadata = infos['file']['metadata']
\ No newline at end of file
...@@ -18,8 +18,8 @@ import re ...@@ -18,8 +18,8 @@ import re
import json import json
from classes_def_2 import Data, Process, Output from classes_def_2 import Data, Process, Output
import utils_2.syntactic_parsing as synt_pars import utils_2.syntactic_parsing as synt_pars
import utils.conv2ner as conv_to_ner import utils.conv2ner as conv_to_ner # TODO clean it
import utils.json2conll as json_to_connl # TODO clean it
...@@ -38,7 +38,7 @@ def get_config_infos(config, stamp): ...@@ -38,7 +38,7 @@ def get_config_infos(config, stamp):
my_logs["config"] = infos my_logs["config"] = infos
return data, steps, prod return data, steps, prod
def create_folders(li): def create_folders(li): # -> can be rtansfor into method of class
for it in li: for it in li:
if not os.path.isdir(it): if not os.path.isdir(it):
os.mkdir(it) os.mkdir(it)
...@@ -50,7 +50,7 @@ def print_logs(): ...@@ -50,7 +50,7 @@ def print_logs():
def pre_processing(data, steps): def pre_processing(data, steps):
data_in = f"{data.path}/{data.name}{data.exte}" data_in = f"{data.path}/{data.name}{data.exte}"
if steps.pre_process_to_do == True: if steps.pre_process_to_do == True:
data_out = f"{data.path}/{data.name}.conll" data_out = f"{data.conv}/{data.name}.conll"
if steps.synt_tool == "stanza": if steps.synt_tool == "stanza":
processors = [] processors = []
metadata = {} metadata = {}
...@@ -81,13 +81,34 @@ def data_to_ner_format(data_in): ...@@ -81,13 +81,34 @@ def data_to_ner_format(data_in):
OUTPUT: Tokenized text with just 4 columns. OUTPUT: Tokenized text with just 4 columns.
""" """
data_ner = f"{data_in}.ner" data_ner = f"{data_in}.ner"
conv_to_ner.main(data_in, data_ner, "conll") conv_to_ner.main(data_in, data_ner, "conll") # <-- TODO faire en relatif
#TODO add same for train/dev/test for config train #TODO add same for train/dev/test for config train
my_logs['data_ner'] = data_ner my_logs['data_ner'] = data_ner
return data_ner return data_ner
def make_predictions(data_in, model_path):
    """Run the allennlp predictor on *data_in* and return the output path.

    :param data_in: path to the NER-formatted input file
    :param model_path: path to the trained allennlp model archive
    :return: path of the JSON-lines prediction file that was produced

    NOTE(review): relies on the module-level ``data`` object (set in
    ``__main__``) for the results directory and file stem — confirm.
    TODO: port ``get_model`` from v1 so the model can be fetched/validated.
    """
    data_out = f"{data.resu}/{data.name}_pred.json"
    # Shell out to allennlp; prediction logs go to stdout/stderr.
    # (Redirect with `&> {data.resu}/logs.txt` if they become noisy.)
    cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in}"
    os.system(cmd)
    return data_out
def pred_json_to_toke(data_in):
    """Convert a JSON-lines prediction file into a tokenized CoNLL file.

    Writes into the results directory of the module-level ``data`` object
    and returns the path of the file produced.
    """
    out_path = f"{data.resu}/{data.name}_pred.conll"
    json_to_connl.js2conll(data_in, out_path, "conll")  # TODO: make paths relative
    return out_path
def pred_json_to_conll_with_metadata(data_pred_json, data_meta):
    """Merge predictions with metadata into one CoNLL file.

    Combines the JSON-lines predictions with the metadata-bearing CoNLL
    file via ``json_to_connl.js2conllNmeta`` and returns the output path
    (inside the results directory of the module-level ``data`` object).
    """
    out_path = f"{data.resu}/{data.name}_pred_n_meta.conll"
    json_to_connl.js2conllNmeta(data_pred_json, out_path, "conll", data_meta)  # TODO: make paths relative
    return out_path
if __name__ == '__main__': if __name__ == '__main__':
...@@ -105,6 +126,17 @@ if __name__ == '__main__': ...@@ -105,6 +126,17 @@ if __name__ == '__main__':
create_folders([data.conv, data.resu]) create_folders([data.conv, data.resu])
data_preprocessed = pre_processing(data, steps) data_preprocessed = pre_processing(data, steps)
#data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
data_ner = data_to_ner_format(data_preprocessed) data_ner = data_to_ner_format(data_preprocessed)
if steps.main == "annotation":
#data_pred_json = make_predictions(data_ner, steps.model)
data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
if prod.metadata == True:
data_pred_and_meta_conll = pred_json_to_conll_with_metadata(data_pred_json, data_preprocessed)
else:
data_pred_toke = pred_json_to_toke(data_pred_json)
#print_logs() #print_logs()
\ No newline at end of file
...@@ -26,7 +26,6 @@ def js2conll(filepath, fileoutpath, config): ...@@ -26,7 +26,6 @@ def js2conll(filepath, fileoutpath, config):
data = [] data = []
for line in open(filepath, 'r'): for line in open(filepath, 'r'):
data.append(json.loads(line)) data.append(json.loads(line))
with open(fileoutpath, 'w') as f_out: with open(fileoutpath, 'w') as f_out:
for doc in data: for doc in data:
tokens = zip(doc["words"],doc["tags"]) tokens = zip(doc["words"],doc["tags"])
...@@ -37,8 +36,37 @@ def js2conll(filepath, fileoutpath, config): ...@@ -37,8 +36,37 @@ def js2conll(filepath, fileoutpath, config):
f_out.write("\n") f_out.write("\n")
#print() #print()
def js2conllNmeta(data_pred_json, data_out, config, data_meta):
    """Merge model predictions with metadata into a CoNLL file.

    Reads one JSON object per line from *data_pred_json* (each object
    carries parallel ``"words"`` and ``"tags"`` lists for one sentence),
    walks the metadata CoNLL file *data_meta* line by line, and writes
    *data_out* where every token line gains the predicted tag as an
    extra tab-separated column. Comment lines (``#...``) and blank
    sentence separators are copied through unchanged.

    :param data_pred_json: path to the JSON-lines prediction file
    :param data_out: path of the merged CoNLL file to write
    :param config: unused; kept for interface symmetry with js2conll
    :param data_meta: path to the CoNLL file carrying the metadata
    :raises SystemExit: if a metadata token does not line up with the
        corresponding predicted token (index or surface-form mismatch)
    """
    # Close the prediction file deterministically (the old bare open() leaked it).
    with open(data_pred_json, 'r') as f_pred:
        data = [json.loads(line) for line in f_pred]
    sent_pred_count = 0  # index of the current predicted sentence
    tok = 0              # 1-based token position within that sentence
    with open(data_out, 'w', encoding='utf-8') as fo, open(data_meta, 'r') as fm:
        for line in fm:
            line = line.strip()
            if line.startswith("#"):
                fo.write(f"{line}\n")
            elif line == "":
                # Blank line = sentence boundary: advance to the next prediction.
                sent_pred_count += 1
                tok = 0
                fo.write(f"{line}\n")
            else:
                word = data[sent_pred_count]['words'][tok]
                tag = data[sent_pred_count]['tags'][tok]
                tok += 1
                # Sanity check: the metadata token must align with the prediction.
                if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
                    fo.write(f"{line}\t{tag}\n")
                else:
                    raise SystemExit(
                        f"js2conllNmeta: prediction/metadata mismatch at "
                        f"sentence {sent_pred_count}, token {tok}: {line!r}"
                    )
\ No newline at end of file
File added
File added
import stanza
from stanza.utils.conll import CoNLL
def with_stanza(lang, f_in, f_out, process, meta):
    """Annotate *f_in* with Stanza and write CoNLL-formatted lines to *f_out*.

    Each token is rendered through ``stanza.utils.conll.CoNLL``; the
    field layout is:
        ID TEXT LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
    where MISC carries 'start_char|end_char'.

    Metadata markers are driven by the *meta* dict:
      meta['meta'] == True -> copy through '#' comment lines from the input
      meta['line'] truthy  -> emit one '#<label>-<line no>' marker per input line
      meta['sent'] truthy  -> emit '#<label>-<sent no>' and '#text="..."' per sentence
    """
    stanza.download(lang)
    pipeline = stanza.Pipeline(lang, processors=process, use_gpu=True)
    # Invariant across the whole file: hoist the comment-copy decision.
    copy_comments = "meta" in meta.keys() and meta['meta'] == True
    with open(f_in, 'r', encoding='utf-8') as src, open(f_out, 'w', encoding='utf-8') as dst:
        for line_no, raw in enumerate(src, start=1):
            text = raw.strip()
            if text.startswith("#"):
                if copy_comments:
                    dst.write(f"{text}\n")
                continue
            if text == "":
                dst.write("\n")
                continue
            if meta['line']:
                dst.write(f"#{meta['line']}-{line_no}\n")
            parsed = pipeline(text)
            for sent_no, sent in enumerate(parsed.sentences, start=1):
                if meta['sent']:
                    dst.write(f"#{meta['sent']}-{sent_no}\n#text=\"{sent.text}\"\n")
                for token in sent.tokens:
                    fields = CoNLL.convert_token_dict(token.to_dict()[0])
                    dst.write("\t".join(fields))
                    dst.write("\n")
                # Blank line terminates each sentence block.
                dst.write("\n")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment