MELODI / Ontology Matching / PropMatch · Commit b291997d

improve code readability

Authored 1 year ago by Guilherme Henrique
Parent: 625022cf
No related branches, tags, or merge requests found.

Showing 3 changed files with 507 additions and 340 deletions:

- nlp.py: 25 additions, 33 deletions
- property_matching.py: 416 additions, 280 deletions
- utils.py: 66 additions, 27 deletions
nlp.py · 25 additions, 33 deletions

```diff
@@ -2,44 +2,36 @@ import nltk
 nltk.download('averaged_perceptron_tagger')
 
 
-def get_core_concept(e1):
-    t1 = nltk.pos_tag(e1)
-    v1 = []
-    sn = False
-    for t in t1:
-        if 'V' in t[1] and len(t[0]) > 4:
-            v1.append(t[0])
+def get_core_concept(entity):
+    """
+    Get the core concept of an entity. The core concept is the first verb with length > 4 or the first noun with its
+    adjectives.
+    :param entity: RDFLib entity
+    :return: list of words
+    """
+    tags = nltk.pos_tag(entity)
+    core_concept = []
+    no_name = False
+    for (word, tag) in tags:
+        if 'V' in tag and len(word) > 4:
+            core_concept.append(word)
             break
-        if 'N' in t[1] or 'J' in t[1] and not sn:
-            if 'IN' in t[1]:
-                sn = True
+        if 'N' in tag or 'J' in tag and not no_name:
+            if 'IN' in tag:
+                no_name = True
             else:
-                v1.append(t[0])
-    return v1
-
-
-def get_core_tagged(e1):
-    t1 = nltk.pos_tag(e1)
-    v1 = []
-    sn = False
-    for t in t1:
-        if 'V' in t[1] and len(t[0]) > 4:
-            v1.append(t)
-            break
-        if 'N' in t[1] or 'J' in t[1] and not sn:
-            if 'IN' in t[1]:
-                sn = True
-            else:
-                v1.append(t)
-    return v1
+                core_concept.append(word)
+    return core_concept
 
 
-def filter_jj(words):
+def filter_adjectives(words):
+    """
+    Filter adjectives from a list of words.
+    :param words: list of words
+    :return: list of words without adjectives
+    """
     tags = nltk.pos_tag(words)
-    return list(map(lambda x: x[0], filter(lambda x: x[1][0] == 'N', tags)))
+    return list(map(lambda word: word[0], filter(lambda word: word[1][0] == 'N', tags)))
\ No newline at end of file
```
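A minimal usage sketch of the renamed helpers, not part of the commit: it assumes nlp.py is importable, the token lists are hypothetical, and the printed outputs depend on how NLTK's tagger happens to label these short inputs.

```python
# Hypothetical driver script; assumes nlp.py is on the import path.
from nlp import get_core_concept, filter_adjectives

# Both helpers expect a pre-tokenized list of words, which they pass
# straight to nltk.pos_tag.
label = ['has', 'birth', 'place']

# 'has' is typically tagged as a verb but is only 3 characters long, so the
# verb branch (which requires length > 4) is skipped and the noun sequence
# becomes the core concept.
print(get_core_concept(label))            # ['birth', 'place']

# filter_adjectives keeps only words whose POS tag starts with 'N',
# dropping 'red' assuming the tagger labels it as an adjective.
print(filter_adjectives(['red', 'car']))  # ['car']
```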
property_matching.py · 416 additions, 280 deletions

[Diff collapsed on the page; contents not shown.]
utils.py · 66 additions, 27 deletions

```diff
@@ -10,42 +10,81 @@ def metrics(correct, tries, total):
     return precision, recall, fm
 
 
-def gn(e, g):
-    if type(e) is str:
-        e = Literal(e)
-    ns = get_n(e, g)
-    if ns.startswith('//'):
-        ns = e.split('http://yago-knowledge.org/resource/')[-1]
-    return ns
+def get_name(entity, graph):
+    if type(entity) is str:
+        entity = Literal(entity)
+    name = get_n(entity, graph)
+    if name.startswith('//'):
+        name = entity.split('http://yago-knowledge.org/resource/')[-1]
+    return name
 
 
-def pad_encode(s, wm):
-    l1 = []
-    max_len = -1
-    for q in s:
-        w = list(map(lambda q: wm[q], q.split()))
-        if len(w) > max_len:
-            max_len = len(w)
-        l1.append(w)
+def pad_encode(sentences, word_map):
+    """
+    Encodes a list of sentences into a padded tensor of integer values using a word mapping.
+
+    Example:
+        >>> word_map = {
+        ...     'I': 1,
+        ...     'love': 2,
+        ...     'coding': 3,
+        ...     'Python': 4,
+        ...     'great': 5,
+        ...     'fun': 6,
+        ...     'is': 7,
+        ... }
+        >>> sentences = ["I love coding Python", "Python is great", "Coding is fun"]
+        >>> encoded_sentences = pad_encode(sentences, word_map)
+        >>> print(encoded_sentences)
+        tensor([[1, 2, 3, 4],
+                [4, 7, 5, 0],
+                [3, 7, 6, 0]])
+
+    :param sentences: A list of input sentences to be encoded into tensors.
+    :param word_map: A dictionary mapping words to their corresponding integer representations.
+    :return: A tensor containing the padded and encoded sentences, where each sentence is represented
+        as a list of integers. The tensor has dimensions (num_sentences, max_sentence_length), where
+        num_sentences is the number of input sentences, and max_sentence_length is the length of the longest
+        sentence in terms of the number of words.
+    """
+    sentence_list = []
+    max_len = -1
+    for sentence in sentences:
+        sentence = list(map(lambda word: word_map[word], sentence.split()))
+        if len(sentence) > max_len:
+            max_len = len(sentence)
+        sentence_list.append(sentence)
 
-    nl1 = []
-    for w in l1:
-        nl1.append(w + [0] * (max_len - len(w)))
+    padded_sentences = []
+    for sentence in sentence_list:
+        padded_sentences.append(sentence + [0] * (max_len - len(sentence)))
 
-    return torch.LongTensor(nl1)
+    return torch.LongTensor(padded_sentences)
 
 
-def emb_average(ids, emb):
-    xe = torch.cat(list(map(lambda q: q.unsqueeze(0), ids)))
-    xem = emb(xe).sum(dim=1)
-    cf = torch.sum((xe != 0).float(), dim=1).unsqueeze(1)
-    cf[cf == 0] = 1
-    return xem / cf
+def emb_average(sentence_ids, model):
+    """
+    Calculates the average word embedding for a list of sentences using a given model.
+
+    :param sentence_ids: (list of torch.Tensor): A list of tensors representing sentences with word embeddings.
+    :param model: (torch.nn.Module): A neural network model that can compute embeddings for input sentences.
+    :return: A tensor representing the average word embedding for each input sentence.
+    """
+    unsqueezed_sentence = torch.cat(list(map(lambda embedding: embedding.unsqueeze(0), sentence_ids)))
+    embedding_sum = model(unsqueezed_sentence).sum(dim=1)
+    non_zero_embeddings = torch.sum((unsqueezed_sentence != 0).float(), dim=1).unsqueeze(1)
+    non_zero_embeddings[non_zero_embeddings == 0] = 1
+    return embedding_sum / non_zero_embeddings
 
 
-def calc_acc(pred, cty):
-    acc = (torch.LongTensor(pred) == cty).float().sum() / cty.shape[0]
-    return acc.item()
\ No newline at end of file
+def calc_acc(predicted, correct):
+    """
+    Calculates the accuracy of a model's predictions.
+
+    :param predicted: A list of predicted labels.
+    :param correct: A list of correct labels.
+    :return: The accuracy of the model's predictions.
+    """
+    acc = (torch.LongTensor(predicted) == correct).float().sum() / correct.shape[0]
+    return acc.item()
```
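A short end-to-end sketch of the three refactored utilities, not part of the commit: it assumes utils.py is importable and torch is installed. The word_map and sentences are illustrative (pad_encode looks words up verbatim, so every word must appear in word_map exactly as written), and the padding_idx choice is an assumption that makes emb_average a true average, since the function divides the embedding sum by the count of non-zero ids.

```python
import torch
import torch.nn as nn

# Hypothetical driver script; assumes utils.py is on the import path.
from utils import pad_encode, emb_average, calc_acc

word_map = {'I': 1, 'love': 2, 'coding': 3, 'Python': 4, 'great': 5, 'fun': 6, 'is': 7}
sentences = ['I love coding Python', 'Python is great', 'coding is fun']

# Shape (3, 4): shorter sentences are right-padded with 0.
encoded = pad_encode(sentences, word_map)

# padding_idx=0 pins the pad embedding to the zero vector, so summing over
# the sequence dimension and dividing by the number of non-zero ids (what
# emb_average does internally) averages over the real words only.
embedding = nn.Embedding(num_embeddings=len(word_map) + 1, embedding_dim=8, padding_idx=0)

# emb_average expects a list of equal-length 1-D id tensors.
averages = emb_average(list(encoded), embedding)
print(averages.shape)  # torch.Size([3, 8])

# calc_acc takes a plain list of predictions and a LongTensor of gold labels.
print(calc_acc([1, 0, 1], torch.LongTensor([1, 1, 1])))  # ~0.667
```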