diff --git a/README.md b/README.md
index 80b201e463471f779c23a58a28ceda2f94da61cc..50cab136553af7fd6c2884f3a5ae7212ba797ebb 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,8 @@ Requirements:
 - pytorch: `pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html`
 
 Usage:
-- train: `bash expes.sh eng.rst.rstdt conllu bert train`
+- train: `bash expes.sh eng.rst.rstdt conllu bert train [-s 200]`
+  (optional `-s N`: split overly long sentences; `N` is passed on to `conv2ner.py --split-too-long`)
 - test: `bash expes.sh eng.rst.rstdt conllu bert test`
 - fine-tune with other model: `bash expes.sh eng.rst.rstdt conllu bert train eng`
 - test on other model: `bash expes.sh eng.rst.rstdt conllu bert test eng`
diff --git a/code/contextual_embeddings/expes.sh b/code/contextual_embeddings/expes.sh
index 7f25ba35a109143b69b7deb2512163d24d69a6ce..1db5286968642c6422a8f274eea2a994b7dac8db 100644
--- a/code/contextual_embeddings/expes.sh
+++ b/code/contextual_embeddings/expes.sh
@@ -18,11 +18,26 @@ export ACTION=${4}
 if [ -z "$5" ]
 then
     export HAS_PAR=false
+    export TOOLONG=false
+elif [ "${5}" = "-s" ]
+then
+    # "-s N" right after the action: no parent model, so HAS_PAR must be set explicitly
+    export HAS_PAR=false
+    export TOOLONG=true
+    export SPLIT=${6}
 else
     export HAS_PAR=true
+    export TOOLONG=false
     export PARENT=${5}
 fi
 
+# "-s N" following a parent model (7 arguments in total) also enables splitting
+if [ $# -eq 7 ] && [ "${6}" = "-s" ]
+then
+    export TOOLONG=true
+    export SPLIT=${7}
+fi
+
 if [ "$MODEL" = "xlm" ] ;
 then
     export BERT_VOCAB="xlm-roberta-base"
@@ -49,12 +64,24 @@ export GOLD=${GOLD_BASE}${DATASET}"/"${DATASET}"_"${EVAL}"."${CONFIG}
 # conversion of datasets to NER / BIO format by first testing the existence of files so as not to redo it each time
 if [ ! -f ${CONV}${DATASET}"_train.ner."${CONFIG} ]; then
     echo "converting to ner format -> in data_converted ..."
-    python conv2ner.py "../../data/"${DATASET}"/"${DATASET}"_train."${CONFIG} ${CONV}/${DATASET}"_train.ner."${CONFIG}
+    # -s given: have conv2ner.py split long sentences, forwarding the -s value
+    if [ "$TOOLONG" = true ]
+    then
+        python conv2ner.py "../../data/"${DATASET}"/"${DATASET}"_train."${CONFIG} ${CONV}/${DATASET}"_train.ner."${CONFIG} --split-too-long True ${SPLIT}
+    else
+        python conv2ner.py "../../data/"${DATASET}"/"${DATASET}"_train."${CONFIG} ${CONV}/${DATASET}"_train.ner."${CONFIG}
+    fi
 fi
 
 if [ ! -f ${CONV}/${DATASET}"_"${EVAL}".ner."${CONFIG} ]; then
     echo "converting to ner format -> in data_converted ..."
-    python conv2ner.py "../../data/"${DATASET}"/"${DATASET}"_"${EVAL}"."${CONFIG} ${CONV}/${DATASET}"_"${EVAL}".ner."${CONFIG}
+    # -s given: have conv2ner.py split long sentences, forwarding the -s value
+    if [ "$TOOLONG" = true ]
+    then
+        python conv2ner.py "../../data/"${DATASET}"/"${DATASET}"_"${EVAL}"."${CONFIG} ${CONV}/${DATASET}"_"${EVAL}".ner."${CONFIG} --split-too-long True ${SPLIT}
+    else
+        python conv2ner.py "../../data/"${DATASET}"/"${DATASET}"_"${EVAL}"."${CONFIG} ${CONV}/${DATASET}"_"${EVAL}".ner."${CONFIG}
+    fi
 fi
 
 if [ "$ACTION" = "train" ]