diff --git a/Datasets/Utils/PostpreprocesTXT.py b/Datasets/Utils/PostpreprocesTXT.py index b10564ec2b0437e60ec5cdd7fa483e28f43beffa..ded916ca8a637e8329c3c358314c83d19a97816d 100644 --- a/Datasets/Utils/PostpreprocesTXT.py +++ b/Datasets/Utils/PostpreprocesTXT.py @@ -145,4 +145,12 @@ print(t.size) dict = { i : t[i] for i in range(0, len(t) ) } -save_obj(dict,"../index_to_super") \ No newline at end of file +save_obj(dict,"../index_to_super") + +t = np.unique(np.array(list(itertools.chain(*Y1)))) + +print(t.size) + +dict = { i : t[i] for i in range(0, len(t) ) } + +save_obj(dict,"../index_to_pos1") \ No newline at end of file diff --git a/SuperTagger/SuperTagger.py b/SuperTagger/SuperTagger.py index de6e60b06249b8b564e754164904028e294be5ff..70c15924fe51a8d75b35490393e984175dfe18c1 100644 --- a/SuperTagger/SuperTagger.py +++ b/SuperTagger/SuperTagger.py @@ -123,9 +123,9 @@ class SuperTagger: self.model = self.model.cpu() - pred = self.model.predict((sents_tokenized_t, sents_mask_t)) + preds, hidden = self.model.predict((sents_tokenized_t, sents_mask_t)) - return pred, self.tags_tokenizer.convert_ids_to_tags(pred.detach()) + return preds, self.tags_tokenizer.convert_ids_to_tags(preds.detach()), hidden def train(self, sentences, tags, validation_rate=0.1, epochs=20, batch_size=32, tensorboard=False, checkpoint=False): @@ -213,8 +213,7 @@ class SuperTagger: targets = batch[2].to(self.device) self.optimizer.zero_grad() - _, logit = self.model((b_sents_tokenized, b_sents_mask, targets)) - loss = self.loss(torch.transpose(logit, 1, 2), targets) + loss, logit = self.model((b_sents_tokenized, b_sents_mask, targets)) predictions = torch.argmax(logit, dim=2).detach().cpu().numpy() label_ids = targets.cpu().numpy() @@ -239,6 +238,10 @@ class SuperTagger: return epoch_acc, epoch_loss, training_time + def foward(self,b_sents_tokenized, b_sents_mask): + _, logit, hidden = self.model((b_sents_tokenized, b_sents_mask)) + return logit, hidden + def __eval_epoch(self, validation_dataloader): self.model.eval() eval_loss = 0 @@ -251,7 +254,7 @@ class SuperTagger: b_sents_mask = batch[1].to(self.device) b_symbols_tokenized = batch[2].to(self.device) - loss, logits = self.model((b_sents_tokenized, b_sents_mask, b_symbols_tokenized)) + loss, logits, _ = self.model((b_sents_tokenized, b_sents_mask, b_symbols_tokenized)) predictions = torch.argmax(logits, dim=2).detach().cpu().numpy() label_ids = b_symbols_tokenized.cpu().numpy() @@ -276,3 +279,5 @@ class SuperTagger: 'optimizer': self.optimizer, }, path) self.model.to(self.device) + + diff --git a/SuperTagger/Utils/Tagging_bert_model.py b/SuperTagger/Utils/Tagging_bert_model.py index b11dd984b396e0d45b7380ce880c415b92a0089f..3afad6caaf7e9f99894410f15e7dbb0f65a0d105 100644 --- a/SuperTagger/Utils/Tagging_bert_model.py +++ b/SuperTagger/Utils/Tagging_bert_model.py @@ -17,7 +17,8 @@ class Tagging_bert_model(Module): super(Tagging_bert_model, self).__init__() self.bert_name = bert_name self.num_labels = num_labels - self.bert = transformers.AutoModelForTokenClassification.from_pretrained(bert_name, num_labels=num_labels) + config = transformers.AutoConfig.from_pretrained(bert_name, output_hidden_states=True, num_labels=num_labels) + self.bert = transformers.AutoModelForTokenClassification.from_pretrained(bert_name, config=config) def forward(self, batch): b_input_ids = batch[0] @@ -26,9 +27,9 @@ class Tagging_bert_model(Module): output = self.bert( input_ids=b_input_ids, attention_mask=b_input_mask, labels=labels) - loss, logits = output[:2] + loss, logits, hidden = output[:3] - return loss, logits + return loss, logits, hidden def predict(self, batch): b_input_ids = batch[0] @@ -37,4 +38,4 @@ class Tagging_bert_model(Module): output = self.bert( input_ids=b_input_ids, attention_mask=b_input_mask) - return torch.argmax(output[0], dim=2) + return torch.argmax(output[0], dim=2), output[1] diff --git a/good_models/camenbert_classique_80%.pt b/good_models/camenbert_classique_80%.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7e569fba3fcfd837dfd4bd1a0ad0eec52a1ed4a Binary files /dev/null and b/good_models/camenbert_classique_80%.pt differ diff --git a/train.py b/train.py index 786ed05bfa86b3f1c45e200e52b530eebf73d6a0..d5e66aaba2d1810492e8ec908eeed05c214fc7e3 100644 --- a/train.py +++ b/train.py @@ -15,7 +15,7 @@ df = read_csv_pgbar(file_path,1000) texts = df['X'].tolist() -tags = df['Z'].tolist() +tags = df['Y1'].tolist() test_s = texts[:4] tags_s = tags[:4] @@ -24,7 +24,7 @@ texts = texts[4:] tags = tags[4:] -index_to_super = load_obj('Datasets/index_to_super') +index_to_super = load_obj('Datasets/index_to_pos1') super_to_index = {v: int(k) for k, v in index_to_super.items()} tagger = SuperTagger()