Commit b291997d authored by Guilherme Henrique

improve code readability

parent 625022cf
@@ -2,44 +2,36 @@ import nltk
 nltk.download('averaged_perceptron_tagger')
 
-def get_core_concept(e1):
-    t1 = nltk.pos_tag(e1)
-    v1 = []
-    sn = False
-    for t in t1:
-        if 'V' in t[1] and len(t[0]) > 4:
-            v1.append(t[0])
+def get_core_concept(entity):
+    """
+    Get the core concept of an entity. The core concept is the first verb with length > 4 or the first noun with its
+    adjectives.
+    :param entity: RDFLib entity
+    :return: list of words
+    """
+    tags = nltk.pos_tag(entity)
+    core_concept = []
+    no_name = False
+    for (word, tag) in tags:
+        if 'V' in tag and len(word) > 4:
+            core_concept.append(word)
             break
-        if 'N' in t[1] or 'J' in t[1] and not sn:
-            if 'IN' in t[1]:
-                sn = True
+        if 'N' in tag or 'J' in tag and not no_name:
+            if 'IN' in tag:
+                no_name = True
             else:
-                v1.append(t[0])
-    return v1
+                core_concept.append(word)
+    return core_concept
 
-def get_core_tagged(e1):
-    t1 = nltk.pos_tag(e1)
-    v1 = []
-    sn = False
-    for t in t1:
-        if 'V' in t[1] and len(t[0]) > 4:
-            v1.append(t)
-            break
-        if 'N' in t[1] or 'J' in t[1] and not sn:
-            if 'IN' in t[1]:
-                sn = True
-            else:
-                v1.append(t)
-    return v1
-
-def filter_jj(words):
+def filter_adjectives(words):
+    """
+    Filter adjectives from a list of words.
+    :param words: list of words
+    :return: list of words without adjectives
+    """
     tags = nltk.pos_tag(words)
-    return list(map(lambda x: x[0], filter(lambda x: x[1][0] == 'N', tags)))
\ No newline at end of file
+    return list(map(lambda word: word[0], filter(lambda word: word[1][0] == 'N', tags)))
\ No newline at end of file
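
As a quick sanity check on the renamed helpers, here is a minimal usage sketch. It is not part of the commit: the import path, the token list, and the POS tags in the comments are illustrative assumptions, and the exact output depends on the tagger model.

import nltk
nltk.download('averaged_perceptron_tagger')

# Hypothetical import path; use wherever get_core_concept and
# filter_adjectives live in this repository.
from concepts import get_core_concept, filter_adjectives

# get_core_concept expects a pre-tokenized label. It collects nouns and
# adjectives, stops early at the first verb longer than four characters,
# and flags prepositions ('IN') instead of keeping them.
tokens = ['famous', 'painting', 'of', 'Claude', 'Monet']
print(get_core_concept(tokens))
# e.g. ['famous', 'painting', 'Claude', 'Monet'] if tagged JJ/NN/IN/NNP/NNP

# filter_adjectives keeps only the tokens whose tag starts with 'N' (nouns).
print(filter_adjectives(['famous', 'painting']))
# e.g. ['painting']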
The diff for one file in this commit is collapsed and not shown here.
@@ -10,42 +10,81 @@ def metrics(correct, tries, total):
     return precision, recall, fm
 
-def gn(e, g):
-    if type(e) is str:
-        e = Literal(e)
-    ns = get_n(e, g)
-    if ns.startswith('//'):
-        ns = e.split('http://yago-knowledge.org/resource/')[-1]
-    return ns
+def get_name(entity, graph):
+    if type(entity) is str:
+        entity = Literal(entity)
+    name = get_n(entity, graph)
+    if name.startswith('//'):
+        name = entity.split('http://yago-knowledge.org/resource/')[-1]
+    return name
 
-def pad_encode(s, wm):
-    l1 = []
+def pad_encode(sentences, word_map):
+    """
+    Encodes a list of sentences into a padded tensor of integer values using a word mapping.
+
+    Example:
+    >>> word_map = {
+    ...     'I': 1,
+    ...     'love': 2,
+    ...     'coding': 3,
+    ...     'Python': 4,
+    ...     'great': 5,
+    ...     'fun': 6,
+    ...     'is': 7,
+    ... }
+    >>> sentences = ["I love coding Python", "Python is great", "coding is fun"]
+    >>> encoded_sentences = pad_encode(sentences, word_map)
+    >>> print(encoded_sentences)
+    tensor([[1, 2, 3, 4],
+            [4, 7, 5, 0],
+            [3, 7, 6, 0]])
+
+    :param sentences: A list of input sentences to be encoded into tensors.
+    :param word_map: A dictionary mapping words to their corresponding integer representations.
+    :return: A tensor containing the padded and encoded sentences, where each sentence is represented
+        as a list of integers. The tensor has dimensions (num_sentences, max_sentence_length), where
+        num_sentences is the number of input sentences and max_sentence_length is the number of words
+        in the longest sentence.
+    """
+    sentence_list = []
     max_len = -1
-    for q in s:
-        w = list(map(lambda q: wm[q], q.split()))
-        if len(w) > max_len:
-            max_len = len(w)
-        l1.append(w)
-
-    nl1 = []
-    for w in l1:
-        nl1.append(w + [0] * (max_len - len(w)))
-    return torch.LongTensor(nl1)
+    for sentence in sentences:
+        sentence = list(map(lambda word: word_map[word], sentence.split()))
+        if len(sentence) > max_len:
+            max_len = len(sentence)
+        sentence_list.append(sentence)
+    padded_sentences = []
+    for sentence in sentence_list:
+        padded_sentences.append(sentence + [0] * (max_len - len(sentence)))
+    return torch.LongTensor(padded_sentences)
 
-def emb_average(ids, emb):
-    xe = torch.cat(list(map(lambda q: q.unsqueeze(0), ids)))
-    xem = emb(xe).sum(dim=1)
-    cf = torch.sum((xe != 0).float(), dim=1).unsqueeze(1)
-    cf[cf == 0] = 1
-    return xem / cf
+def emb_average(sentence_ids, model):
+    """
+    Calculates the average word embedding for a list of sentences using a given model.
+
+    :param sentence_ids: (list of torch.Tensor): A list of tensors representing sentences with word embeddings.
+    :param model: (torch.nn.Module): A neural network model that can compute embeddings for input sentences.
+    :return: A tensor representing the average word embedding for each input sentence.
+    """
+    unsqueezed_sentence = torch.cat(list(map(lambda embedding: embedding.unsqueeze(0), sentence_ids)))
+    embedding_sum = model(unsqueezed_sentence).sum(dim=1)
+    non_zero_embeddings = torch.sum((unsqueezed_sentence != 0).float(), dim=1).unsqueeze(1)
+    non_zero_embeddings[non_zero_embeddings == 0] = 1
+    return embedding_sum / non_zero_embeddings
 
-def calc_acc(pred, cty):
-    acc = (torch.LongTensor(pred) == cty).float().sum() / cty.shape[0]
-    return acc.item()
\ No newline at end of file
+def calc_acc(predicted, correct):
+    """
+    Calculates the accuracy of a model's predictions.
+
+    :param predicted: A list of predicted labels.
+    :param correct: A list of correct labels.
+    :return: The accuracy of the model's predictions.
+    """
+    acc = (torch.LongTensor(predicted) == correct).float().sum() / correct.shape[0]
+    return acc.item()
\ No newline at end of file
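
A minimal end-to-end sketch of the three tensor helpers above, again not part of the commit: the import path, vocabulary, embedding size, and labels are made-up stand-ins.

import torch
import torch.nn as nn

# Hypothetical import path for the functions above.
from encoding import pad_encode, emb_average, calc_acc

word_map = {'I': 1, 'love': 2, 'coding': 3, 'Python': 4, 'great': 5, 'fun': 6, 'is': 7}
sentences = ["I love coding Python", "Python is great", "coding is fun"]

# LongTensor of shape (3, 4); shorter sentences are right-padded with 0.
encoded = pad_encode(sentences, word_map)

# padding_idx=0 keeps the pad embedding at the zero vector, so the sum inside
# emb_average accumulates only real tokens before dividing by the per-sentence
# count of non-zero ids.
embedding = nn.Embedding(num_embeddings=len(word_map) + 1, embedding_dim=8, padding_idx=0)

# One averaged embedding per sentence: shape (3, 8).
averaged = emb_average(list(encoded), embedding)

# Two of three predictions match the reference labels.
print(calc_acc([1, 0, 1], torch.LongTensor([1, 1, 1])))  # ~0.6667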