MELODI / Ontology Matching / PropMatch · Commit b291997d

improve code readability

Authored 1 year ago by Guilherme Henrique
Parent: 625022cf
No related branches, tags, or merge requests found.

Showing 3 changed files with 507 additions and 340 deletions:

- nlp.py: 25 additions, 33 deletions
- property_matching.py: 416 additions, 280 deletions
- utils.py: 66 additions, 27 deletions
nlp.py · 25 additions, 33 deletions

```diff
@@ -2,44 +2,36 @@ import nltk
 nltk.download('averaged_perceptron_tagger')
 
 
-def get_core_concept(e1):
-    t1 = nltk.pos_tag(e1)
-    v1 = []
-    sn = False
-    for t in t1:
-        if 'V' in t[1] and len(t[0]) > 4:
-            v1.append(t[0])
+def get_core_concept(entity):
+    """
+    Get the core concept of an entity. The core concept is the first verb with length > 4 or the first noun with its
+    adjectives.
+    :param entity: RDFLib entity
+    :return: list of words
+    """
+    tags = nltk.pos_tag(entity)
+    core_concept = []
+    no_name = False
+    for (word, tag) in tags:
+        if 'V' in tag and len(word) > 4:
+            core_concept.append(word)
             break
-        if 'N' in t[1] or 'J' in t[1] and not sn:
-            if 'IN' in t[1]:
-                sn = True
+        if 'N' in tag or 'J' in tag and not no_name:
+            if 'IN' in tag:
+                no_name = True
             else:
-                v1.append(t[0])
-    return v1
-
-
-def get_core_tagged(e1):
-    t1 = nltk.pos_tag(e1)
-    v1 = []
-    sn = False
-    for t in t1:
-        if 'V' in t[1] and len(t[0]) > 4:
-            v1.append(t)
-            break
-        if 'N' in t[1] or 'J' in t[1] and not sn:
-            if 'IN' in t[1]:
-                sn = True
-            else:
-                v1.append(t)
-    return v1
+                core_concept.append(word)
+    return core_concept
 
 
-def filter_jj(words):
+def filter_adjectives(words):
+    """
+    Filter adjectives from a list of words.
+    :param words: list of words
+    :return: list of words without adjectives
+    """
     tags = nltk.pos_tag(words)
-    return list(map(lambda x: x[0], filter(lambda x: x[1][0] == 'N', tags)))
+    return list(map(lambda word: word[0], filter(lambda word: word[1][0] == 'N', tags)))
\ No newline at end of file
```
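A minimal usage sketch of the renamed helpers, not part of the commit: it assumes nlp.py is importable, the token lists are hypothetical, and the printed outputs depend on how NLTK's tagger happens to label these short inputs.

```python
# Hypothetical driver script; assumes nlp.py is on the import path.
from nlp import get_core_concept, filter_adjectives

# Both helpers expect a pre-tokenized list of words, which they pass
# straight to nltk.pos_tag.
label = ['has', 'birth', 'place']

# 'has' is typically tagged as a verb but is only 3 characters long, so the
# verb branch (which requires length > 4) is skipped and the noun sequence
# becomes the core concept.
print(get_core_concept(label))            # ['birth', 'place']

# filter_adjectives keeps only words whose POS tag starts with 'N',
# dropping 'red' assuming the tagger labels it as an adjective.
print(filter_adjectives(['red', 'car']))  # ['car']
```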
property_matching.py · 416 additions, 280 deletions

[Diff collapsed on the page; contents not shown.]
utils.py · 66 additions, 27 deletions

```diff
@@ -10,42 +10,81 @@ def metrics(correct, tries, total):
     return precision, recall, fm
 
 
-def gn(e, g):
-    if type(e) is str:
-        e = Literal(e)
-    ns = get_n(e, g)
-    if ns.startswith('//'):
-        ns = e.split('http://yago-knowledge.org/resource/')[-1]
-    return ns
+def get_name(entity, graph):
+    if type(entity) is str:
+        entity = Literal(entity)
+    name = get_n(entity, graph)
+    if name.startswith('//'):
+        name = entity.split('http://yago-knowledge.org/resource/')[-1]
+    return name
 
 
-def pad_encode(s, wm):
-    l1 = []
-    max_len = -1
-    for q in s:
-        w = list(map(lambda q: wm[q], q.split()))
-        if len(w) > max_len:
-            max_len = len(w)
-        l1.append(w)
+def pad_encode(sentences, word_map):
+    """
+    Encodes a list of sentences into a padded tensor of integer values using a word mapping.
+
+    Example:
+        >>> word_map = {
+        ...     'I': 1,
+        ...     'love': 2,
+        ...     'coding': 3,
+        ...     'Python': 4,
+        ...     'great': 5,
+        ...     'fun': 6,
+        ...     'is': 7,
+        ... }
+        >>> sentences = ["I love coding Python", "Python is great", "Coding is fun"]
+        >>> encoded_sentences = pad_encode(sentences, word_map)
+        >>> print(encoded_sentences)
+        tensor([[1, 2, 3, 4],
+                [4, 7, 5, 0],
+                [3, 7, 6, 0]])
+
+    :param sentences: A list of input sentences to be encoded into tensors.
+    :param word_map: A dictionary mapping words to their corresponding integer representations.
+    :return: A tensor containing the padded and encoded sentences, where each sentence is represented
+        as a list of integers. The tensor has dimensions (num_sentences, max_sentence_length), where
+        num_sentences is the number of input sentences, and max_sentence_length is the length of the longest
+        sentence in terms of the number of words.
+    """
+    sentence_list = []
+    max_len = -1
+    for sentence in sentences:
+        sentence = list(map(lambda word: word_map[word], sentence.split()))
+        if len(sentence) > max_len:
+            max_len = len(sentence)
+        sentence_list.append(sentence)
 
-    nl1 = []
-    for w in l1:
-        nl1.append(w + [0] * (max_len - len(w)))
+    padded_sentences = []
+    for sentence in sentence_list:
+        padded_sentences.append(sentence + [0] * (max_len - len(sentence)))
 
-    return torch.LongTensor(nl1)
+    return torch.LongTensor(padded_sentences)
 
 
-def emb_average(ids, emb):
-    xe = torch.cat(list(map(lambda q: q.unsqueeze(0), ids)))
-    xem = emb(xe).sum(dim=1)
-    cf = torch.sum((xe != 0).float(), dim=1).unsqueeze(1)
-    cf[cf == 0] = 1
-    return xem / cf
+def emb_average(sentence_ids, model):
+    """
+    Calculates the average word embedding for a list of sentences using a given model.
+
+    :param sentence_ids: (list of torch.Tensor): A list of tensors representing sentences with word embeddings.
+    :param model: (torch.nn.Module): A neural network model that can compute embeddings for input sentences.
+    :return: A tensor representing the average word embedding for each input sentence.
+    """
+    unsqueezed_sentence = torch.cat(list(map(lambda embedding: embedding.unsqueeze(0), sentence_ids)))
+    embedding_sum = model(unsqueezed_sentence).sum(dim=1)
+    non_zero_embeddings = torch.sum((unsqueezed_sentence != 0).float(), dim=1).unsqueeze(1)
+    non_zero_embeddings[non_zero_embeddings == 0] = 1
+    return embedding_sum / non_zero_embeddings
 
 
-def calc_acc(pred, cty):
-    acc = (torch.LongTensor(pred) == cty).float().sum() / cty.shape[0]
-    return acc.item()
\ No newline at end of file
+def calc_acc(predicted, correct):
+    """
+    Calculates the accuracy of a model's predictions.
+
+    :param predicted: A list of predicted labels.
+    :param correct: A list of correct labels.
+    :return: The accuracy of the model's predictions.
+    """
+    acc = (torch.LongTensor(predicted) == correct).float().sum() / correct.shape[0]
+    return acc.item()
```
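A short end-to-end sketch of the three refactored utilities, not part of the commit: it assumes utils.py is importable and torch is installed. The word_map and sentences are illustrative (pad_encode looks words up verbatim, so every word must appear in word_map exactly as written), and the padding_idx choice is an assumption that makes emb_average a true average, since the function divides the embedding sum by the count of non-zero ids.

```python
import torch
import torch.nn as nn

# Hypothetical driver script; assumes utils.py is on the import path.
from utils import pad_encode, emb_average, calc_acc

word_map = {'I': 1, 'love': 2, 'coding': 3, 'Python': 4, 'great': 5, 'fun': 6, 'is': 7}
sentences = ['I love coding Python', 'Python is great', 'coding is fun']

# Shape (3, 4): shorter sentences are right-padded with 0.
encoded = pad_encode(sentences, word_map)

# padding_idx=0 pins the pad embedding to the zero vector, so summing over
# the sequence dimension and dividing by the number of non-zero ids (what
# emb_average does internally) averages over the real words only.
embedding = nn.Embedding(num_embeddings=len(word_map) + 1, embedding_dim=8, padding_idx=0)

# emb_average expects a list of equal-length 1-D id tensors.
averages = emb_average(list(encoded), embedding)
print(averages.shape)  # torch.Size([3, 8])

# calc_acc takes a plain list of predictions and a LongTensor of gold labels.
print(calc_acc([1, 0, 1], torch.LongTensor([1, 1, 1])))  # ~0.667
```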