From 58b29c789f24f51cf2fff429f1d08c1c2f736f99 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Wed, 2 Nov 2022 10:01:06 +0100
Subject: [PATCH] add training possibility

---
 README.md                                     |   5 +-
 .../custom_bert_token_embedder.cpython-37.pyc | Bin 0 -> 9434 bytes
 .../custom_conll_reader.cpython-37.pyc        | Bin 0 -> 7306 bytes
 .../custom_disrpt_reader.cpython-37.pyc       | Bin 0 -> 7627 bytes
 .../custom_simple_tagger.cpython-37.pyc       | Bin 0 -> 8475 bytes
 .../custom_bert_token_embedder.py             | 287 ++++++++++++++++++
 code/allen_custom/custom_conll_reader.py      | 184 +++++++++++
 code/allen_custom/custom_disrpt_reader.py     | 187 ++++++++++++
 code/allen_custom/custom_simple_tagger.py     | 196 ++++++++++++
 code/classes_def.py                           |  37 ++-
 code/config_4.json                            |  44 +++
 code/discut22_1.py                            | 134 +++++---
 code/utils/seg_eval.py                        |   3 +
 code/utils/training_allennlp.py               |  47 +++
 14 files changed, 1079 insertions(+), 45 deletions(-)
 create mode 100644 code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc
 create mode 100644 code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc
 create mode 100644 code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc
 create mode 100644 code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc
 create mode 100644 code/allen_custom/custom_bert_token_embedder.py
 create mode 100644 code/allen_custom/custom_conll_reader.py
 create mode 100644 code/allen_custom/custom_disrpt_reader.py
 create mode 100644 code/allen_custom/custom_simple_tagger.py
 create mode 100644 code/config_4.json
 create mode 100644 code/utils/training_allennlp.py

diff --git a/README.md b/README.md
index f45e4ba..618ec43 100644
--- a/README.md
+++ b/README.md
@@ -28,13 +28,14 @@ Code: https://gitlab.inria.fr/andiamo/tony
 - `config_XX.json` A file to be completed (or a dir with a choice between simple use_case configs and a template for a custom config).
 - `utils/` Contains useful scripts to be called.
 - `model/` Contains the model to be loaded or created.
+  - `config_training.jsonnet` A file to be completed. (TBD: automatically saved with the model when done.)
 - `documentation.md` Contains detailed documentation (TBD?)
 
 ## Set up environment
 - Conda stuff for Python 3.7 (TBD?)
 - Install all required libraries with the following command:
 ```
-pip install -r <dir?>requirements.txt
+pip install -r requirements.txt
 ```
 
 ## Configuration file: to choose or to complete
@@ -45,7 +46,7 @@ pip install -r <dir?>requirements.txt
 (go to the `code` directory)
 Run this command:
 ```
-python code/discut22.py --config code/config_1.json
+python discut22.py --config config_1.json
 ```
 
 ## Authors and acknowledgment
diff --git a/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc b/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ba9802b113f8fe5be38ab83deb4daf2695ec9ff
GIT binary patch
(binary data omitted)
diff --git a/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61f75bd353d9d94355e7f854fbb639bdca3a0c6f
GIT binary patch
(binary data omitted)

diff --git a/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5fed7c4fd030664682f83ecb79a6980d3d2cf6b
GIT binary patch
(binary data omitted)

diff --git a/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc b/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1647c89fbf6170d62b81ac688416a44a176c2c6
GIT binary patch
(binary data omitted)
diff --git a/code/allen_custom/custom_bert_token_embedder.py b/code/allen_custom/custom_bert_token_embedder.py
new file mode 100644
index 0000000..9e36f12
--- /dev/null
+++ b/code/allen_custom/custom_bert_token_embedder.py
@@ -0,0 +1,287 @@
+"""
+A ``TokenEmbedder`` which uses one of the BERT models
+(https://github.com/google-research/bert)
+to produce embeddings.
+
+At its core it uses Hugging Face's PyTorch implementation
+(https://github.com/huggingface/pytorch-pretrained-BERT),
+so thanks to them!
+"""
+from typing import Dict, List
+import logging
+
+import torch
+import torch.nn.functional as F
+
+from pytorch_pretrained_bert.modeling import BertModel
+
+from allennlp.modules.scalar_mix import ScalarMix
+from allennlp.modules.token_embedders.token_embedder import TokenEmbedder
+from allennlp.nn import util
+
+logger = logging.getLogger(__name__)
+
+
+class PretrainedBertModel:
+    """
+    In some instances you may want to load the same BERT model twice
+    (e.g. to use as a token embedder and also as a pooling layer).
+    This factory provides a cache so that you don't actually have to load the model twice.
+    """
+    _cache: Dict[str, BertModel] = {}
+
+    @classmethod
+    def load(cls, model_name: str, cache_model: bool = True) -> BertModel:
+        if model_name in cls._cache:
+            return PretrainedBertModel._cache[model_name]
+
+        model = BertModel.from_pretrained(model_name)
+        if cache_model:
+            cls._cache[model_name] = model
+
+        return model
+
+
+class CustomBertEmbedder(TokenEmbedder):
+    """
+    A ``TokenEmbedder`` that produces BERT embeddings for your tokens.
+    Should be paired with a ``BertIndexer``, which produces wordpiece ids.
+
+    Most likely you want to use ``PretrainedBertEmbedder``
+    for one of the named pretrained models, not this base class.
+
+    Parameters
+    ----------
+    bert_model: ``BertModel``
+        The BERT model being wrapped.
+    top_layer_only: ``bool``, optional (default = ``False``)
+        If ``True``, then only return the top layer instead of applying the scalar mix.
+    max_pieces : int, optional (default: 512)
+        The BERT embedder uses positional embeddings and so has a corresponding
+        maximum length for its input ids. Assuming the inputs are windowed
+        and padded appropriately by this length, the embedder will split them into a
+        large batch, feed them into BERT, and recombine the output as if it was a
+        longer sequence.
+ num_start_tokens : int, optional (default: 1) + The number of starting special tokens input to BERT (usually 1, i.e., [CLS]) + num_end_tokens : int, optional (default: 1) + The number of ending tokens input to BERT (usually 1, i.e., [SEP]) + scalar_mix_parameters: ``List[float]``, optional, (default = None) + If not ``None``, use these scalar mix parameters to weight the representations + produced by different layers. These mixing weights are not updated during + training. + """ + def __init__(self, + bert_model: BertModel, + aligning_files = None, + top_layer_only: bool = False, + max_pieces: int = 512, + num_start_tokens: int = 1, + num_end_tokens: int = 1, + scalar_mix_parameters: List[float] = None) -> None: + super().__init__() + self.bert_model = bert_model + #self.aligning_fr = "saved_mappings/fra.sdrt.annodis_eng.rst.gum.pth" + self.aligning_fr = "../MUSE/results/stac-annodis_all/best_mapping.pth" + #self.aligning_fr = 'saved_mappings/eng.rst.gum_fra.sdrt.annodis.pth' + print("ALIGN", self.aligning_fr) + self.aligning_fr = torch.from_numpy(torch.load(self.aligning_fr)) + self.aligning_fr_t = torch.transpose(self.aligning_fr, 0, 1) #.to(torch.device('cuda:0')) + print(self.aligning_fr.shape) + self.output_dim = bert_model.config.hidden_size + self.max_pieces = max_pieces + self.num_start_tokens = num_start_tokens + self.num_end_tokens = num_end_tokens + + if not top_layer_only: + self._scalar_mix = ScalarMix(bert_model.config.num_hidden_layers, + do_layer_norm=False, + initial_scalar_parameters=scalar_mix_parameters, + trainable=scalar_mix_parameters is None) + else: + self._scalar_mix = None + + def get_output_dim(self) -> int: + return self.output_dim + + def forward(self, + input_ids: torch.LongTensor, + offsets: torch.LongTensor = None, + token_type_ids: torch.LongTensor = None) -> torch.Tensor: + """ + Parameters + ---------- + input_ids : ``torch.LongTensor`` The (batch_size, ..., max_sequence_length) tensor of wordpiece ids. + offsets : ``torch.LongTensor``, optional + The BERT embeddings are one per wordpiece. However it's possible/likely + you might want one per original token. In that case, ``offsets`` + represents the indices of the desired wordpiece for each original token. + Depending on how your token indexer is configured, this could be the + position of the last wordpiece for each token, or it could be the position + of the first wordpiece for each token. + + For example, if you had the sentence "Definitely not", and if the corresponding + wordpieces were ["Def", "##in", "##ite", "##ly", "not"], then the input_ids + would be 5 wordpiece ids, and the "last wordpiece" offsets would be [3, 4]. + If offsets are provided, the returned tensor will contain only the wordpiece + embeddings at those positions, and (in particular) will contain one embedding + per token. If offsets are not provided, the entire tensor of wordpiece embeddings + will be returned. + token_type_ids : ``torch.LongTensor``, optional + If an input consists of two sentences (as in the BERT paper), + tokens from the first sentence should have type 0 and tokens from + the second sentence should have type 1. If you don't provide this + (the default BertIndexer doesn't) then it's assumed to be all 0s. + """ + # pylint: disable=arguments-differ + batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1) + initial_dims = list(input_ids.shape[:-1]) + + # The embedder may receive an input tensor that has a sequence length longer than can + # be fit. 
In that case, we should expect the wordpiece indexer to create padded windows + # of length `self.max_pieces` for us, and have them concatenated into one long sequence. + # E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..." + # We can then split the sequence into sub-sequences of that length, and concatenate them + # along the batch dimension so we effectively have one huge batch of partial sentences. + # This can then be fed into BERT without any sentence length issues. Keep in mind + # that the memory consumption can dramatically increase for large batches with extremely + # long sentences. + needs_split = full_seq_len > self.max_pieces + last_window_size = 0 + if needs_split: + # Split the flattened list by the window size, `max_pieces` + split_input_ids = list(input_ids.split(self.max_pieces, dim=-1)) + + # We want all sequences to be the same length, so pad the last sequence + last_window_size = split_input_ids[-1].size(-1) + padding_amount = self.max_pieces - last_window_size + split_input_ids[-1] = F.pad(split_input_ids[-1], pad=[0, padding_amount], value=0) + + # Now combine the sequences along the batch dimension + input_ids = torch.cat(split_input_ids, dim=0) + + if token_type_ids is not None: + # Same for token_type_ids + split_token_type_ids = list(token_type_ids.split(self.max_pieces, dim=-1)) + + last_window_size = split_token_type_ids[-1].size(-1) + padding_amount = self.max_pieces - last_window_size + split_token_type_ids[-1] = F.pad(split_token_type_ids[-1], pad=[0, padding_amount], value=0) + + token_type_ids = torch.cat(split_token_type_ids, dim=0) + + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + input_mask = (input_ids != 0).long() + + # input_ids may have extra dimensions, so we reshape down to 2-d + # before calling the BERT model and then reshape back at the end. + all_encoder_layers, _ = self.bert_model(input_ids=util.combine_initial_dims(input_ids), + token_type_ids=util.combine_initial_dims(token_type_ids), + attention_mask=util.combine_initial_dims(input_mask)) + all_encoder_layers = torch.stack(all_encoder_layers) + # ======ROTATION===== # + #all_encoder_layers = torch.matmul(all_encoder_layers, self.aligning_fr_t) + + if needs_split: + # First, unpack the output embeddings into one long sequence again + unpacked_embeddings = torch.split(all_encoder_layers, batch_size, dim=1) + unpacked_embeddings = torch.cat(unpacked_embeddings, dim=2) + + # Next, select indices of the sequence such that it will result in embeddings representing the original + # sentence. To capture maximal context, the indices will be the middle part of each embedded window + # sub-sequence (plus any leftover start and final edge windows), e.g., + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + # "[CLS] I went to the very fine [SEP] [CLS] the very fine store to eat [SEP]" + # with max_pieces = 8 should produce max context indices [2, 3, 4, 10, 11, 12] with additional start + # and final windows with indices [0, 1] and [14, 15] respectively. 
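+            # Sanity check of the arithmetic below against the example above
+            # (illustrative only, not executed): with max_pieces = 8 and one
+            # start/end token each, stride = (8 - 1 - 1) // 2 = 3 and
+            # stride_offset = 3 // 2 + 1 = 2, so each window keeps the indices i
+            # with 1 < i % 8 < 5, i.e. exactly [2, 3, 4] and [10, 11, 12].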
+
+            # Find the stride as half the max pieces, ignoring the special start and end tokens
+            # Calculate an offset to extract the centermost embeddings of each window
+            stride = (self.max_pieces - self.num_start_tokens - self.num_end_tokens) // 2
+            stride_offset = stride // 2 + self.num_start_tokens
+
+            first_window = list(range(stride_offset))
+
+            max_context_windows = [i for i in range(full_seq_len)
+                                   if stride_offset - 1 < i % self.max_pieces < stride_offset + stride]
+
+            # Look back at what's left, unless it's the whole self.max_pieces window
+            if full_seq_len % self.max_pieces == 0:
+                lookback = self.max_pieces
+            else:
+                lookback = full_seq_len % self.max_pieces
+
+            final_window_start = full_seq_len - lookback + stride_offset + stride
+            final_window = list(range(final_window_start, full_seq_len))
+
+            select_indices = first_window + max_context_windows + final_window
+
+            initial_dims.append(len(select_indices))
+
+            recombined_embeddings = unpacked_embeddings[:, :, select_indices]
+        else:
+            recombined_embeddings = all_encoder_layers
+
+        # Recombine the outputs of all layers
+        # (layers, batch_size * d1 * ... * dn, sequence_length, embedding_dim)
+        # recombined = torch.cat(combined, dim=2)
+        input_mask = (recombined_embeddings != 0).long()
+
+        if self._scalar_mix is not None:
+            mix = self._scalar_mix(recombined_embeddings, input_mask)
+        else:
+            mix = recombined_embeddings[-1]
+
+        # At this point, mix is (batch_size * d1 * ... * dn, sequence_length, embedding_dim)
+
+        if offsets is None:
+            # Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim)
+            dims = initial_dims if needs_split else input_ids.size()
+            return util.uncombine_initial_dims(mix, dims)
+        else:
+            # offsets is (batch_size, d1, ..., dn, orig_sequence_length)
+            offsets2d = util.combine_initial_dims(offsets)
+            # now offsets is (batch_size * d1 * ... * dn, orig_sequence_length)
+            range_vector = util.get_range_vector(offsets2d.size(0),
+                                                 device=util.get_device_of(mix)).unsqueeze(1)
+            # selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length)
+            selected_embeddings = mix[range_vector, offsets2d]
+
+            return util.uncombine_initial_dims(selected_embeddings, offsets.size())
+
+
+@TokenEmbedder.register("custom-bert-pretrained")
+class CustomPretrainedBertEmbedder(CustomBertEmbedder):
+    # pylint: disable=line-too-long
+    """
+    Parameters
+    ----------
+    pretrained_model: ``str``
+        Either the name of the pretrained model to use (e.g. 'bert-base-uncased'),
+        or the path to the .tar.gz file with the model weights.
+
+        If the name is a key in the list of pretrained models at
+        https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L41
+        the corresponding path will be used; otherwise it will be interpreted as a path or URL.
+    requires_grad : ``bool``, optional (default = False)
+        If True, compute gradients of the BERT parameters for fine-tuning.
+    top_layer_only: ``bool``, optional (default = ``False``)
+        If ``True``, then only return the top layer instead of applying the scalar mix.
+    scalar_mix_parameters: ``List[float]``, optional, (default = None)
+        If not ``None``, use these scalar mix parameters to weight the representations
+        produced by different layers. These mixing weights are not updated during
+        training.
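+
+    A minimal construction sketch (the model name and mapping path below are
+    hypothetical placeholders, not values taken from this repository)::
+
+        embedder = CustomPretrainedBertEmbedder(
+            pretrained_model="bert-base-multilingual-cased",
+            aligning_files={"fr": "path/to/best_mapping.pth"})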
+ """ + def __init__(self, pretrained_model: str, aligning_files, requires_grad: bool = False, top_layer_only: bool = False, + scalar_mix_parameters: List[float] = None) -> None: + model = PretrainedBertModel.load(pretrained_model) + + print("ALIGN", aligning_files['fr']) + + for param in model.parameters(): + param.requires_grad = requires_grad + + print("CHECKPOINT") + super().__init__(bert_model=model, top_layer_only=top_layer_only, scalar_mix_parameters=scalar_mix_parameters) diff --git a/code/allen_custom/custom_conll_reader.py b/code/allen_custom/custom_conll_reader.py new file mode 100644 index 0000000..9d7a37b --- /dev/null +++ b/code/allen_custom/custom_conll_reader.py @@ -0,0 +1,184 @@ +from typing import Dict, List, Sequence, Iterable +import itertools +import logging +import os + +from overrides import overrides + +from allennlp.common.checks import ConfigurationError +from allennlp.common.file_utils import cached_path +from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.dataset_readers.dataset_utils import to_bioul +from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField +from allennlp.data.instance import Instance +from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer +from allennlp.data.tokenizers import Token + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +def _is_divider(line: str) -> bool: + empty_line = line.strip() == '' + if empty_line: + return True + else: + first_token = line.split()[0] + if first_token == "-DOCSTART-": # pylint: disable=simplifiable-if-statement + return True + else: + return False + + +@DatasetReader.register("custom_conll_reader") +class CustomConllDatasetReader(DatasetReader): + """ + Reads instances from a pretokenised file where each line is in the following format: + + WORD POS-TAG CHUNK-TAG NER-TAG + + with a blank line indicating the end of each sentence + and '-DOCSTART- -X- -X- O' indicating the end of each article, + and converts it into a ``Dataset`` suitable for sequence tagging. + + Each ``Instance`` contains the words in the ``"tokens"`` ``TextField``. + The values corresponding to the ``tag_label`` + values will get loaded into the ``"tags"`` ``SequenceLabelField``. + And if you specify any ``feature_labels`` (you probably shouldn't), + the corresponding values will get loaded into their own ``SequenceLabelField`` s. + + This dataset reader ignores the "article" divisions and simply treats + each sentence as an independent ``Instance``. (Technically the reader splits sentences + on any combination of blank lines and "DOCSTART" tags; in particular, it does the right + thing on well formed inputs.) + + Parameters + ---------- + token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``) + We use this to define the input representation for the text. See :class:`TokenIndexer`. + tag_label: ``str``, optional (default=``ner``) + Specify `ner`, `pos`, or `chunk` to have that tag loaded into the instance field `tag`. + feature_labels: ``Sequence[str]``, optional (default=``()``) + These labels will be loaded as features into the corresponding instance fields: + ``pos`` -> ``pos_tags``, ``chunk`` -> ``chunk_tags``, ``ner`` -> ``ner_tags`` + Each will have its own namespace: ``pos_tags``, ``chunk_tags``, ``ner_tags``. + If you want to use one of the tags as a `feature` in your model, it should be + specified here. 
+ coding_scheme: ``str``, optional (default=``IOB1``) + Specifies the coding scheme for ``ner_labels`` and ``chunk_labels``. + Valid options are ``IOB1`` and ``BIOUL``. The ``IOB1`` default maintains + the original IOB1 scheme in the CoNLL 2003 NER data. + In the IOB1 scheme, I is a token inside a span, O is a token outside + a span and B is the beginning of span immediately following another + span of the same type. + label_namespace: ``str``, optional (default=``labels``) + Specifies the namespace for the chosen ``tag_label``. + """ + _VALID_LABELS = {'ner', 'pos', 'chunk'} + + def __init__(self, + token_indexers: Dict[str, TokenIndexer] = None, + tag_label: str = "ner", + feature_labels: Sequence[str] = (), + lazy: bool = False, + coding_scheme: str = "IOB1", + label_namespace: str = "labels") -> None: + super().__init__(lazy) + self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} + if tag_label is not None and tag_label not in self._VALID_LABELS: + raise ConfigurationError("unknown tag label type: {}".format(tag_label)) + for label in feature_labels: + if label not in self._VALID_LABELS: + raise ConfigurationError("unknown feature label type: {}".format(label)) + if coding_scheme not in ("IOB1", "BIOUL"): + raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme)) + + self.tag_label = tag_label + self.feature_labels = set(feature_labels) + self.coding_scheme = coding_scheme + self.label_namespace = label_namespace + self._original_coding_scheme = "IOB1" + + @overrides + def _read(self, file_path: str) -> Iterable[Instance]: + # if `file_path` is a URL, redirect to the cache + file_path = cached_path(file_path) + + with open(file_path, "r") as data_file: + logger.info("Reading instances from lines in file at: %s", file_path) + + # Group into alternative divider / sentence chunks. + for is_divider, lines in itertools.groupby(data_file, _is_divider): + # Ignore the divider chunks, so that `lines` corresponds to the words + # of a single sentence. + if not is_divider: + fields = [line.strip().split() for line in lines] + # unzipping trick returns tuples, but our Fields need lists + fields = [list(field) for field in zip(*fields)] + tokens_, pos_tags, chunk_tags, ner_tags = fields + # TextField requires ``Token`` objects + tokens = [Token(token) for token in tokens_] + + yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags, file_path) + + def get_lang(self, file_path): + _, file_name = os.path.split(file_path) + lang = file_name[:2] + if lang == 'po': + lang = 'pt' + if lang not in ['en','de','it','fr','pt','sv']: + raise ConfigurationError(f"Language {lang} not supported by ELMo") + return lang + + def text_to_instance(self, # type: ignore + tokens: List[Token], + pos_tags: List[str] = None, + chunk_tags: List[str] = None, + ner_tags: List[str] = None, + file_path: str = None) -> Instance: + """ + We take `pre-tokenized` input here, because we don't have a tokenizer in this class. + """ + # pylint: disable=arguments-differ + sequence = TextField(tokens, self._token_indexers) + instance_fields: Dict[str, Field] = {'tokens': sequence} + instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "lang": self.get_lang(file_path)}) + + # Recode the labels if necessary. 
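+        # Illustrative example (IOB1 -> BIOUL): the IOB1 tags [I-S, I-S, O] are
+        # recoded to [B-S, L-S, O], and a lone single-token span [I-S] becomes [U-S].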
+ if self.coding_scheme == "BIOUL": + coded_chunks = to_bioul(chunk_tags, + encoding=self._original_coding_scheme) if chunk_tags is not None else None + coded_ner = to_bioul(ner_tags, + encoding=self._original_coding_scheme) if ner_tags is not None else None + else: + # the default IOB1 + coded_chunks = chunk_tags + coded_ner = ner_tags + + # Add "feature labels" to instance + if 'pos' in self.feature_labels: + if pos_tags is None: + raise ConfigurationError("Dataset reader was specified to use pos_tags as " + "features. Pass them to text_to_instance.") + instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags") + if 'chunk' in self.feature_labels: + if coded_chunks is None: + raise ConfigurationError("Dataset reader was specified to use chunk tags as " + "features. Pass them to text_to_instance.") + instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags") + if 'ner' in self.feature_labels: + if coded_ner is None: + raise ConfigurationError("Dataset reader was specified to use NER tags as " + " features. Pass them to text_to_instance.") + instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags") + + # Add "tag label" to instance + if self.tag_label == 'ner' and coded_ner is not None: + instance_fields['tags'] = SequenceLabelField(coded_ner, sequence, + self.label_namespace) + elif self.tag_label == 'pos' and pos_tags is not None: + instance_fields['tags'] = SequenceLabelField(pos_tags, sequence, + self.label_namespace) + elif self.tag_label == 'chunk' and coded_chunks is not None: + instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence, + self.label_namespace) + + return Instance(instance_fields) diff --git a/code/allen_custom/custom_disrpt_reader.py b/code/allen_custom/custom_disrpt_reader.py new file mode 100644 index 0000000..68189e9 --- /dev/null +++ b/code/allen_custom/custom_disrpt_reader.py @@ -0,0 +1,187 @@ +from typing import Dict, List, Sequence, Iterable +import itertools +import logging +import os + +from overrides import overrides + +from allennlp.common.checks import ConfigurationError +from allennlp.common.file_utils import cached_path +from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.dataset_readers.dataset_utils import to_bioul +from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField +from allennlp.data.instance import Instance +from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer +from allennlp.data.tokenizers import Token + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +def _is_divider(line: str) -> bool: + empty_line = line.strip() == '' + if empty_line: + return True + else: + first_token = line.split()[0] + if first_token == "#": + return True + else: + return False + + +@DatasetReader.register("custom_disrpt_reader") +class CustomDisrptDatasetReader(DatasetReader): + """ + Reads instances from a pretokenised file where each line is in the following format: + + WORD POS-TAG CHUNK-TAG NER-TAG + + with a blank line indicating the end of each sentence + and '-DOCSTART- -X- -X- O' indicating the end of each article, + and converts it into a ``Dataset`` suitable for sequence tagging. + + Each ``Instance`` contains the words in the ``"tokens"`` ``TextField``. + The values corresponding to the ``tag_label`` + values will get loaded into the ``"tags"`` ``SequenceLabelField``. 
+ And if you specify any ``feature_labels`` (you probably shouldn't), + the corresponding values will get loaded into their own ``SequenceLabelField`` s. + + This dataset reader ignores the "article" divisions and simply treats + each sentence as an independent ``Instance``. (Technically the reader splits sentences + on any combination of blank lines and "DOCSTART" tags; in particular, it does the right + thing on well formed inputs.) + + Parameters + ---------- + token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``) + We use this to define the input representation for the text. See :class:`TokenIndexer`. + tag_label: ``str``, optional (default=``ner``) + Specify `ner`, `pos`, or `chunk` to have that tag loaded into the instance field `tag`. + feature_labels: ``Sequence[str]``, optional (default=``()``) + These labels will be loaded as features into the corresponding instance fields: + ``pos`` -> ``pos_tags``, ``chunk`` -> ``chunk_tags``, ``ner`` -> ``ner_tags`` + Each will have its own namespace: ``pos_tags``, ``chunk_tags``, ``ner_tags``. + If you want to use one of the tags as a `feature` in your model, it should be + specified here. + coding_scheme: ``str``, optional (default=``IOB1``) + Specifies the coding scheme for ``ner_labels`` and ``chunk_labels``. + Valid options are ``IOB1`` and ``BIOUL``. The ``IOB1`` default maintains + the original IOB1 scheme in the CoNLL 2003 NER data. + In the IOB1 scheme, I is a token inside a span, O is a token outside + a span and B is the beginning of span immediately following another + span of the same type. + label_namespace: ``str``, optional (default=``labels``) + Specifies the namespace for the chosen ``tag_label``. + """ + _VALID_LABELS = {'ner', 'pos', 'chunk'} + + def __init__(self, + token_indexers: Dict[str, TokenIndexer] = None, + tag_label: str = "ner", + feature_labels: Sequence[str] = (), + lazy: bool = False, + coding_scheme: str = "IOB1", + label_namespace: str = "labels") -> None: + super().__init__(lazy) + self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} + if tag_label is not None and tag_label not in self._VALID_LABELS: + raise ConfigurationError("unknown tag label type: {}".format(tag_label)) + for label in feature_labels: + if label not in self._VALID_LABELS: + raise ConfigurationError("unknown feature label type: {}".format(label)) + if coding_scheme not in ("IOB1", "BIOUL"): + raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme)) + + self.tag_label = tag_label + self.feature_labels = set(feature_labels) + self.coding_scheme = coding_scheme + self.label_namespace = label_namespace + self._original_coding_scheme = "IOB1" + + @overrides + def _read(self, file_path: str) -> Iterable[Instance]: + # if `file_path` is a URL, redirect to the cache + file_path = cached_path(file_path) + + with open(file_path, "r") as data_file: + logger.info("Reading instances from lines in file at: %s", file_path) + + # Group into alternative divider / sentence chunks. + for is_divider, lines in itertools.groupby(data_file, _is_divider): + # Ignore the divider chunks, so that `lines` corresponds to the words + # of a single sentence. 
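+            # Illustrative (hypothetical) ten-column DISRPT-style row, matching the
+            # unpacking below; only the token, POS and tag columns carry information
+            # that is kept:
+            #   1  Le  _  DET  _  _  _  _  _  BeginSeg=Yes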
+ if not is_divider: + fields = [line.strip().split() for line in lines] + # unzipping trick returns tuples, but our Fields need lists + fields = [list(field) for field in zip(*fields)] + #TOKID TOK _ POS _ _ _ _ _ TAG + chunk_tags, tokens_, _, pos_tags, _, _, _, _, _, ner_tags = fields + chunk_tags = list(map(lambda _: "O", chunk_tags)) + ner_tags = list(map(lambda x: "B-S" if x.startswith("BeginSeg=Yes") else "O", ner_tags)) + # TextField requires ``Token`` objects + tokens = [Token(token) for token in tokens_] + + yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags, file_path) + + def get_lang(self, file_path): + _, file_name = os.path.split(file_path) + lang = file_name[:2] + if lang == 'po': + lang = 'pt' + if lang not in ['en','de','it','fr','pt','sv']: + raise ConfigurationError(f"Language {lang} not supported by ELMo") + return lang + + def text_to_instance(self, # type: ignore + tokens: List[Token], + pos_tags: List[str] = None, + chunk_tags: List[str] = None, + ner_tags: List[str] = None, + file_path: str = None) -> Instance: + """ + We take `pre-tokenized` input here, because we don't have a tokenizer in this class. + """ + # pylint: disable=arguments-differ + sequence = TextField(tokens, self._token_indexers) + instance_fields: Dict[str, Field] = {'tokens': sequence} + instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "lang": self.get_lang(file_path)}) + + # Recode the labels if necessary. + if self.coding_scheme == "BIOUL": + coded_chunks = to_bioul(chunk_tags, + encoding=self._original_coding_scheme) if chunk_tags is not None else None + coded_ner = to_bioul(ner_tags, + encoding=self._original_coding_scheme) if ner_tags is not None else None + else: + # the default IOB1 + coded_chunks = chunk_tags + coded_ner = ner_tags + + # Add "feature labels" to instance + if 'pos' in self.feature_labels: + if pos_tags is None: + raise ConfigurationError("Dataset reader was specified to use pos_tags as " + "features. Pass them to text_to_instance.") + instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags") + if 'chunk' in self.feature_labels: + if coded_chunks is None: + raise ConfigurationError("Dataset reader was specified to use chunk tags as " + "features. Pass them to text_to_instance.") + instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags") + if 'ner' in self.feature_labels: + if coded_ner is None: + raise ConfigurationError("Dataset reader was specified to use NER tags as " + " features. 
+            instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")
+
+        # Add "tag label" to instance
+        if self.tag_label == 'ner' and coded_ner is not None:
+            instance_fields['tags'] = SequenceLabelField(coded_ner, sequence,
+                                                         self.label_namespace)
+        elif self.tag_label == 'pos' and pos_tags is not None:
+            instance_fields['tags'] = SequenceLabelField(pos_tags, sequence,
+                                                         self.label_namespace)
+        elif self.tag_label == 'chunk' and coded_chunks is not None:
+            instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence,
+                                                         self.label_namespace)
+
+        return Instance(instance_fields)
diff --git a/code/allen_custom/custom_simple_tagger.py b/code/allen_custom/custom_simple_tagger.py
new file mode 100644
index 0000000..f4cc3da
--- /dev/null
+++ b/code/allen_custom/custom_simple_tagger.py
@@ -0,0 +1,196 @@
+from typing import Dict, Optional, List, Any
+
+import random
+
+import numpy
+from overrides import overrides
+import torch
+from torch.nn.modules.linear import Linear
+import torch.nn.functional as F
+
+from allennlp.common.checks import check_dimensions_match, ConfigurationError
+from allennlp.data import Vocabulary
+from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder
+from allennlp.models.model import Model
+from allennlp.nn import InitializerApplicator, RegularizerApplicator
+from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
+from allennlp.training.metrics import CategoricalAccuracy, SpanBasedF1Measure
+
+
+@Model.register("custom_simple_tagger")
+class CustomSimpleTagger(Model):
+    """
+    This ``SimpleTagger`` simply encodes a sequence of text with a stacked ``Seq2SeqEncoder``, then
+    predicts a tag for each token in the sequence.
+
+    Parameters
+    ----------
+    vocab : ``Vocabulary``, required
+        A Vocabulary, required in order to compute sizes for input/output projections.
+    text_field_embedder : ``TextFieldEmbedder``, required
+        Used to embed the ``tokens`` ``TextField`` we get as input to the model.
+    encoder : ``Seq2SeqEncoder``
+        The encoder (with its own internal stacking) that we will use in between embedding tokens
+        and predicting output tags.
+    calculate_span_f1 : ``bool``, optional (default=``None``)
+        Calculate span-level F1 metrics during training. If this is ``True``, then
+        ``label_encoding`` is required. If ``None`` and ``label_encoding`` is specified,
+        this is set to ``True``. If ``None`` and ``label_encoding`` is not specified,
+        it defaults to ``False``.
+    label_encoding : ``str``, optional (default=``None``)
+        Label encoding to use when calculating span F1.
+        Valid options are "BIO", "BIOUL", "IOB1", "BMES".
+        Required if ``calculate_span_f1`` is ``True``.
+    label_namespace : ``str``, optional (default=``labels``)
+        This is needed to compute the SpanBasedF1Measure metric, if desired.
+        Unless you did something unusual, the default value should be what you want.
+    verbose_metrics : ``bool``, optional (default=``False``)
+        If true, metrics will be returned per label class in addition
+        to the overall statistics.
+    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
+        Used to initialize the model parameters.
+    regularizer : ``RegularizerApplicator``, optional (default=``None``)
+        If provided, will be used to calculate the regularization penalty during training.
+    """
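
For context, here is a minimal sketch of how a tagger like this can be wired up in code. It is illustrative only: it assumes the pre-1.0 AllenNLP API that this patch appears to target, and the vocabulary, embedding size, and hidden size are placeholders.

```
import torch
from allennlp.data import Vocabulary
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper

vocab = Vocabulary()  # in practice, built from the training instances
# Embed each token id, contextualise with an LSTM, and let the tagger project
# every encoded token onto the tag vocabulary (the "labels" namespace).
embedder = BasicTextFieldEmbedder(
    {"tokens": Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                         embedding_dim=50)})
encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(input_size=50, hidden_size=100, batch_first=True))
tagger = CustomSimpleTagger(vocab=vocab,          # class defined below
                            text_field_embedder=embedder,
                            encoder=encoder,
                            calculate_span_f1=True,
                            label_encoding="BIO")  # span F1 needs an encoding
```

In the pipeline itself this wiring is not done by hand: `allennlp train` builds the model from the jsonnet training config.
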
+ """ + + def __init__(self, vocab: Vocabulary, + text_field_embedder: TextFieldEmbedder, + encoder: Seq2SeqEncoder, + calculate_span_f1: bool = None, + label_encoding: Optional[str] = None, + label_namespace: str = "labels", + verbose_metrics: bool = False, + initializer: InitializerApplicator = InitializerApplicator(), + regularizer: Optional[RegularizerApplicator] = None) -> None: + super(CustomSimpleTagger, self).__init__(vocab, regularizer) + + self.label_namespace = label_namespace + self.text_field_embedder = text_field_embedder + self.num_classes = self.vocab.get_vocab_size(label_namespace) + self.encoder = encoder + self._verbose_metrics = verbose_metrics + self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(), + self.num_classes)) + + check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(), + "text field embedding dim", "encoder input dim") + + # We keep calculate_span_f1 as a constructor argument for API consistency with + # the CrfTagger, even it is redundant in this class + # (label_encoding serves the same purpose). + if calculate_span_f1 and not label_encoding: + raise ConfigurationError("calculate_span_f1 is True, but " + "no label_encoding was specified.") + self.metrics = { + "accuracy": CategoricalAccuracy(), + "accuracy3": CategoricalAccuracy(top_k=3) + } + + if calculate_span_f1 or label_encoding: + self._f1_metric = SpanBasedF1Measure(vocab, + tag_namespace=label_namespace, + label_encoding=label_encoding) + else: + self._f1_metric = None + + initializer(self) + + @overrides + def forward(self, # type: ignore + tokens: Dict[str, torch.LongTensor], + tags: torch.LongTensor = None, + metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]: + # pylint: disable=arguments-differ + """ + Parameters + ---------- + tokens : Dict[str, torch.LongTensor], required + The output of ``TextField.as_array()``, which should typically be passed directly to a + ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer`` + tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens": + Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used + for the ``TokenIndexers`` when you created the ``TextField`` representing your + sequence. The dictionary is designed to be passed directly to a ``TextFieldEmbedder``, + which knows how to combine different word representations into a single vector per + token in your input. + tags : torch.LongTensor, optional (default = None) + A torch tensor representing the sequence of integer gold class labels of shape + ``(batch_size, num_tokens)``. + metadata : ``List[Dict[str, Any]]``, optional, (default = None) + metadata containing the original words in the sentence to be tagged under a 'words' key. + + Returns + ------- + An output dictionary consisting of: + logits : torch.FloatTensor + A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing + unnormalised log probabilities of the tag classes. + class_probabilities : torch.FloatTensor + A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing + a distribution of the tag classes per word. + loss : torch.FloatTensor, optional + A scalar loss to be optimised. 
+        embedded_text_input = self.text_field_embedder(tokens, lang=metadata[0]['lang'])  # the custom embedder also needs the language
+        batch_size, sequence_length, _ = embedded_text_input.size()
+        mask = get_text_field_mask(tokens)
+        encoded_text = self.encoder(embedded_text_input, mask)
+
+        logits = self.tag_projection_layer(encoded_text)
+        reshaped_log_probs = logits.view(-1, self.num_classes)
+        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
+                                                                          sequence_length,
+                                                                          self.num_classes])
+
+        output_dict = {"logits": logits, "class_probabilities": class_probabilities}
+
+        if tags is not None:
+            loss = sequence_cross_entropy_with_logits(logits, tags, mask)
+            for metric in self.metrics.values():
+                metric(logits, tags, mask.float())
+            if self._f1_metric is not None:
+                self._f1_metric(logits, tags, mask.float())
+            output_dict["loss"] = loss
+
+        if metadata is not None:
+            output_dict["words"] = [x["words"] for x in metadata]
+        return output_dict
+
+    @overrides
+    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        """
+        Does a simple position-wise argmax over each token, converts indices to string labels, and
+        adds a ``"tags"`` key to the dictionary with the result.
+        """
+        all_predictions = output_dict['class_probabilities']
+        all_predictions = all_predictions.cpu().data.numpy()
+        if all_predictions.ndim == 3:
+            predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
+        else:
+            predictions_list = [all_predictions]
+        all_tags = []
+        for predictions in predictions_list:
+            argmax_indices = numpy.argmax(predictions, axis=-1)
+            tags = [self.vocab.get_token_from_index(x, namespace="labels")
+                    for x in argmax_indices]
+            all_tags.append(tags)
+        output_dict['tags'] = all_tags
+        return output_dict
+
+    @overrides
+    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
+        metrics_to_return = {metric_name: metric.get_metric(reset) for
+                             metric_name, metric in self.metrics.items()}
+
+        if self._f1_metric is not None:
+            f1_dict = self._f1_metric.get_metric(reset=reset)
+            if self._verbose_metrics:
+                metrics_to_return.update(f1_dict)
+            else:
+                metrics_to_return.update({
+                    x: y for x, y in f1_dict.items() if
+                    "overall" in x})
+        return metrics_to_return
diff --git a/code/classes_def.py b/code/classes_def.py
index 7e9857c..bcc9b77 100644
--- a/code/classes_def.py
+++ b/code/classes_def.py
@@ -10,16 +10,43 @@ class Input:
         self.file = infos['file']
         self.form = infos['format'] # not used
         self.gold = infos['gold'] # not used
-        self.resu = infos['results_path'] # misused: create it automatically instead
+        self.resu = f"{self.path}/results"
 
 
 class Process:
     def __init__(self, infos, data):
-        self.main = infos["main"]
-        self.toke = infos['pre-processing']['tokenization'] # not used
         self.data = data
-        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
+        self.main = infos["main"] # train | test | annotation
+
+        self.toke = infos['pre-processing']['tokenization'] # not used
         self.ssplit = infos['pre-processing']['sentence_split']
         self.ssplitor = infos['pre-processing']['sentence_split_splitor']
+        self.ner_init = infos['pre-processing']['NER_format_initialisation']
+
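+        # Note (added for clarity): with e.g. name="eng.rst.rstdt" and file=".conllu",
+        # the training branch below derives data paths such as
+        #   ../data/eng.rst.rstdt/eng.rst.rstdt_train.ner.conllu
+        # whenever NER_format_initialisation is true.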
+        if self.main == "train":
+            if self.ner_init == True : # TODO: build these paths in a relative way; split this out
+                self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
+                self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
+            else :
+                self.train_data = infos['discourse_segmenter']['training']['train_data_path']
+                self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
+            self.toolkit = infos['discourse_segmenter']['training']['toolkit']
+            self.tr_config = infos['discourse_segmenter']['training']['config_file']
+            self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
+
+        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
+
         self.post_tab = infos['post-processing']['json_to_tab']
-        self.post_bracket = infos['post-processing']['tab_to_bracket']
\ No newline at end of file
+
+        self.eval = infos['evaluation']
+        self.test_data = infos['gold_test_data_path']
+
+        #if self.eval == True :
+        #    if self.ner_init == True :
+        #        self.test_data = f"{self.data.path}/{self.data.name}_test.ner{self.data.file}"
+        #        #self.test_data = infos['gold_test_data_path']
+        #    else :
+        #        self.test_data = infos['gold_test_data_path']
+
+        self.post_bracket = infos['post-processing']['tab_to_bracket']
+        
\ No newline at end of file
diff --git a/code/config_4.json b/code/config_4.json
new file mode 100644
index 0000000..dc1d29f
--- /dev/null
+++ b/code/config_4.json
@@ -0,0 +1,44 @@
+{
+    "usecase_description": "Config file for usecase_3: take an EDU gold-segmented train/dev/test set of texts in CoNLL format as input, train a model, and output scores.",
+    "input": {
+        "name": "eng.rst.rstdt",
+        "file": ".conllu",
+        "file_options": [".conllu", ".tok"],
+        "folder_path": "../data/eng.rst.rstdt",
+        "format": "truc",
+        "language": "en",
+        "gold": true
+    },
+    "output": {
+        "format": "ner_tok",
+        "framework": "rst"
+    },
+    "steps":{
+        "main": "train",
+        "pre-processing": {
+            "tokenization": false,
+            "sentence_split": false,
+            "sentence_split_splitor": "stanza",
+            "syntactic_parsing": false,
+            "NER_format_initialisation": true
+        },
+        "discourse_segmenter": {
+            "model": null,
+            "training": {
+                "toolkit": "allennlp",
+                "pre_trained_lm": "bert",
+                "config_file": "../model/config_training.jsonnet",
+                "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
+                "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
+            }
+        },
+        "post-processing": {
+            "json_to_tab": false,
+            "tab_to_bracket":false
+        },
+        "evaluation": true,
+        "gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
+    }
+}
+
+
diff --git a/code/discut22_1.py b/code/discut22_1.py
index 5a24a5c..fd531d6 100644
--- a/code/discut22_1.py
+++ b/code/discut22_1.py
@@ -18,9 +18,7 @@ import utils.conv2ner as c2n
 import utils.json2conll as j2c
 import utils.conll2bracket as c2bracket
 import utils.sent_split as ssent
-#import utils.ssplit.parse_corpus as ssent
-#import utils.ssplit.parse_corpus as ssent
-#import utils.ssplit.parse_stanza as ssent
+import utils.training_allennlp as tr_allen
 
 
 # function to get config info
@@ -30,7 +28,7 @@ def get_config_infos(config_file):
     infos = json.load(f)
     data_in = Input(infos['input'])
     actions = Process(infos['steps'], data_in)
-    print("data to be process : {}".format(data_in.name))
+    print(f"data to be processed: {data_in.name}")
     return actions
 
 
@@ -40,20 +38,20 @@ def get_model(model_name):
 
     if name == "tony":
         arch = "french_tokens.tar.gz"
-        if not os.path.isfile("../model/{}".format(arch)):
+        if not os.path.isfile(f"../model/{arch}"):
             dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
             os.system(dl)
         else:
             print("Tony already in place !")
-        return "../model/{}".format(arch)
+        return f"../model/{arch}"
 
 
 # main call
-def main(config):
+def main(steps):
 
-    steps = get_config_infos(config) # get the list of steps
+    #steps = get_config_infos(config) # get the list of steps
     # to perform, provided by the Process class
     #print([x for x in enumerate(steps)])
    # following the ordered list, perform the steps (for now: simple usecase1):
@@ -62,69 +60,128 @@
 
    # FN: either we need sentence splitting, or tokenization, or neither
     if steps.ssplit == True :            # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
    #### Split text into sentences: not in usecase1
-        data_in = "{}/{}{}".format(steps.data.path, steps.data.name, steps.data.file)
-        data_tok = "{}/{}.tok".format(steps.data.path, steps.data.name)
-        print("Starting sentence spliting...to {}".format(steps.data.path, steps.data.name))
+        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
+        data_tok = f"{steps.data.path}/{steps.data.name}.tok"
+        print(f"Starting sentence splitting... to {steps.data.path}/{steps.data.name}")
 #       ssent.main(data_in, data_tok, "stanza", steps.data.lang)
         ssent.main(data_in, data_tok, "stanza", steps.data.lang)
     elif steps.toke == True :
    #### Tokenization of the text
    # #python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
-        data_in = "{}/{}{}".format(steps.data.path, steps.data.name, steps.data.file)
-        data_tok = "{}/{}.tok".format(steps.data.path, steps.data.name)
+        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
+        data_tok = f"{steps.data.path}/{steps.data.name}.tok"
 # sys.exit("check path")
-        print("Starting Tokenization...to {}".format(data_tok))
+        print(f"Starting Tokenization... to {data_tok}")
         tk.main(data_in, data_tok) # .ss -> .tok
     else:
         data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
 
+    if steps.ner_init == True:
+        if steps.main == "test" or steps.main == "annotation":
    #### Conversion to a NER-style problem
    # #python $RUNTIME/conv2ner.py ${RAW}.tok > ${RAW}.ner.tok
-    data_ner = "{}/{}.ner.tok".format(steps.data.path, steps.data.name)
-    print("Starting conversion to NER format...to {}".format(data_ner))
-    c2n.main(data_tok, data_ner, steps.data.file)
+            data_ner = f"{steps.data.path}/{steps.data.name}.ner.tok"
+            print(f"Starting conversion to NER format... to {data_ner}")
+            c2n.main(data_tok, data_ner, steps.data.file)
+        elif steps.main == "train":
+            for part in ["train", "dev", "test"]:
+                data_tok = f"{steps.data.path}/{steps.data.name}_{part}{steps.data.file}"
+                data_ner = f"{steps.data.path}/{steps.data.name}_{part}.ner{steps.data.file}"
+                print(f"Starting conversion to NER format... to {data_ner}")
+                c2n.main(data_tok, data_ner, steps.data.file)
+
+
+    # Create the results directory
+    if not os.path.isdir(steps.data.resu):
+        print("Result directory does not exist yet")
+        os.mkdir(steps.data.resu)
 
+    if steps.main == "train":
+        #model_config = steps.model_config
+        #cmd = "bash utils/expes.sh eng.rst.rstdt model/config_training.jsonnet bert train"
+        #os.system(cmd)
+        if steps.toolkit == "allennlp":
+            print("toolkit allennlp for training")
+            # tr_allen.main(steps)
+            # set the value of model from null to what was just created by training
+            steps.model = f"{steps.data.resu}/model.tar.gz"
+        elif steps.toolkit == "jiant":
+            print("Jiant toolkit not ready")
+        else :
+            print("toolkit unknown")
+
+        # TODO: check the training config file
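+    # Note (added for clarity): after training, steps.model points at
+    # {steps.data.resu}/model.tar.gz, which the evaluation branch further
+    # down passes to `allennlp predict` on the gold test set.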
steps.main == "test" or steps.main =="annotation": #### Appliquer le model choisi, sortir le JSON avec les predictions :score, proba, tags # #allennlp predict --use-dataset-reader --output-file ${RESULT_DIR}/${FILE}.json ${MODEL} ${RAW}.ner.tok - print("Checking for model...{}".format(steps.model)) - model_path = get_model(steps.model) - data_json = "{}/{}.json".format(steps.data.resu, steps.data.name) - cmd = "allennlp predict --use-dataset-reader --output-file {} {} {} &> {}/logs.txt".format(data_json, model_path, data_ner, steps.data.resu) - if not os.path.isdir(steps.data.resu): - print(" result directory does not exist") - os.mkdir(steps.data.resu) - print("Starting Prediction...") - os.system(cmd) + print(f"Checking for model...{steps.model}") + model_path = get_model(steps.model) + data_json = f"{steps.data.resu}/{steps.data.name}.json" + cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_ner} &> {steps.data.resu}/logs.txt" + print("Starting Prediction...") + os.system(cmd) #### ------------------------------- TBD do the same but with python script (or JIANT ??) - + else: + print(" pb define model") if steps.post_tab == True : - #### Appliquer les predictions au texte et sortir le texte tokenisé avec la colone des tags-prédis # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok - data_conll = "{}/{}.split.tok".format(steps.data.resu, steps.data.name) + #### Appliquer les predictions au texte et sortir le texte tokenisé avec la colone des tags-prédis + # # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok + data_conll = f"{steps.data.resu}/{steps.data.name}.split.tok" format = "split.tok" # to retrive from config file !!! - print("Starting Formating from json to tok format...to {}".format(data_conll)) + print(f"Starting Formating from json to tok format...to {data_conll}") j2c.main(data_json, format, data_conll) ####### EVALUATION AGAINST GOLD # python discut/code/utils/seg_eval.py data_gold data_pred (-s) - data_gold = data_tok - data_pred = data_conll - cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt" - os.system(cmd) + if steps.eval == True : + if steps.main == "train": + data_gold = steps.test_data # (())== data NER because of ner_init == true((deleted)) + if steps.ner_init == True : + data_gold_ner = f"{steps.data.path}/{steps.data.name}_test.ner.conllu" + + # make predictions on test_data + model_path = steps.model # model just been created + # data_json about to be created by predict cmd + data_json = f"{steps.data.resu}/{steps.data.name}_test.predictions.json" ## à faire en relatif !! [opt : --silent ??] 
+ cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold_ner} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder &> {steps.data.resu}/logs.txt" + #cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold} &> {steps.data.resu} /logs.txt" + print("Starting Prediction...") + print(f"cmd prediction: {cmd}") + os.system(cmd) + + data_conll = f"{steps.data.resu}/{steps.data.name}_test.predictions.conll" ## à faire en relatif + print(f"Starting Formating from json to tok format...to {data_conll}") + j2c.main(data_json, "split.tok", data_conll) + #data_pred_ner = f"{steps.data.resu}/eng.rst.rstdt_test.predictions.conll.ner" + #c2n.main(data_conll, data_pred_ner, steps.data.file) + print(f"starting eval, gold={data_gold}, predictions={data_conll}, model={model_path}") + data_g = "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu" + data_p = "../data/eng.rst.rstdt/results/eng.rst.rstdt_test.predictions.conll" # == data_conll + cmd = f"python utils/seg_eval.py {data_gold} {data_conll} &> {steps.data.resu}/Evaluation.txt" + os.system(cmd) + + + else : + data_gold = data_tok # changer les noms des var, c'est pas clair ! + data_pred = data_conll # + cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt" + os.system(cmd) if steps.post_bracket == True : - ####prendre le texte tokénisé+tags-prédits et sortir le texte en plain (format du d'ebut, for now en suite de phrases) avec les brackets # #python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok > ${RESULT_DIR}/${FILE}.split.tok.bracket - data_bracket = "{}/{}.split.tok.bracket".format(steps.data.resu, steps.data.name) - print("Starting formating into bracket text...to {}".format(data_bracket)) + ####prendre le texte tokénisé+tags-prédits et sortir le texte en plain (format du d'ebut, for now en suite de phrases) avec les brackets + # # #python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok > ${RESULT_DIR}/${FILE}.split.tok.bracket + data_bracket = f"{steps.data.resu}/{steps.data.name}.split.tok.bracket" + print(f"Starting formating into bracket text...to {data_bracket}") c2bracket.main(data_conll, data_bracket) @@ -136,6 +193,7 @@ if __name__ == '__main__': parser.add_argument('--config', help='Config file in JSON') args = parser.parse_args() config = args.config + steps = get_config_infos(config) - main(config) + main(steps) print("Done.") \ No newline at end of file diff --git a/code/utils/seg_eval.py b/code/utils/seg_eval.py index 3083f4b..1808782 100644 --- a/code/utils/seg_eval.py +++ b/code/utils/seg_eval.py @@ -158,6 +158,9 @@ def get_scores(gold_file, pred_file, string_input=False): if "BeginSeg=Yes" in gold_labels: mode = "edu" seg_type = "EDUs" + #elif "B-S" in gold_labels: + # mode = "edu" + # seg_type = "EDUs" else: mode = "conn" seg_type = "conn spans" diff --git a/code/utils/training_allennlp.py b/code/utils/training_allennlp.py new file mode 100644 index 0000000..65d4dfd --- /dev/null +++ b/code/utils/training_allennlp.py @@ -0,0 +1,47 @@ +####### Python version of expes.sh + +import os + + +def main(steps): + dataset = steps.data.name + config = steps.data.file # .tok .conllu + lmodel = steps.pretr_lm #options: bert xlm elmo elmo_aligned + action = "train" # inutile ! 
+    evalset = steps.dev_data
+    print(f"dev set: {evalset} \t train set: {dataset}")
+    has_parent = False # TODO: get this variable some other way
+
+    tr_config = steps.tr_config
+
+    # case 1: no "parent", no "toolong"
+    # case 2: toolong == true, so the data needs splitting
+    # case 3: parent == true, no toolong
+
+
+    if lmodel == "xlm":
+        bert_vocab = "xlm-roberta-base"
+        bert_weights = "xlm-roberta-base"
+    else :
+        bert_vocab = "bert-base-multilingual-cased"
+        bert_weights = "bert-base-multilingual-cased"
+
+    if lmodel == "bert_custom" and steps.ner_init == True :
+        # TODO raise error
+        print("You chose 'bert_custom', so 'NER_format_initialisation' must be set to false.")
+
+    #### train, has_parent == False
+    # allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder
+    # allennlp train -s Results_conllu/results_eng.rst.rstdt_bert ../code/utils/configs/bert.jsonnet ....
+    cmd = f"allennlp train -s {steps.data.resu} {tr_config} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder"
+    print(cmd)
+    os.system(cmd)
+    # then...
+
+    # TODO:
+    #### train, has_parent == true: in that case we actually fine-tune...
+    #allennlp fine-tune -m Results_${CONFIG}/results_${PARENT}_${MODEL}/model.tar.gz -c ${CODE}configs/${MODEL}.jsonnet -s Results_${CONFIG}/results_${DATASET}-${PARENT}_${MODEL} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder
+
+    # TODO
+    ### then predict on the dev set, or on the "parent test" / "finetune test"...?
+    #allennlp predict --use-dataset-reader --output-file Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.json Results_${CONFIG}/results_${OUTPUT}/model.tar.gz ${TEST_A_PATH} --silent --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder
--
GitLab