Commit 79045f46, authored 2 years ago by Georges Da Costa

Refactors JSR part

Parent: cebf5d40

Showing 3 changed files, with 175 additions and 172 deletions:

  get_rankings/get_rankings.py  (76 additions, 172 deletions)
  get_rankings/hash_cache.py    (30 additions, 0 deletions)
  get_rankings/tools.py         (69 additions, 0 deletions)
get_rankings/get_rankings.py (+76, −172)
@@ -4,63 +4,20 @@ import logging
 from tqdm import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 import os
-import requests
 import datetime
 from dateutil.parser import parse as parsedate
 from bs4 import BeautifulSoup
 import pandas as pd
-import numpy
-import json
 import argparse
+from hash_cache import load_hash_caches, save_hash_caches, default_cache
+from tools import levenshtein, download, get_in_ordered_list

 LOG = logging.getLogger(__name__)

-def getwithpb(url):
-    LOG.info(f"fetching {url}")
-    r = requests.get(url, stream=True)
-    data = b""
-    total_size = int(r.headers.get("content-length", 0))
-    for chunk in tqdm(
-        r.iter_content(32 * 1024),
-        total=total_size,
-        unit="B",
-        unit_scale=True,
-        leave=False,
-    ):
-        if chunk:
-            data += chunk
-    return data
-
-def fgetwithpb(url, filename):
-    LOG.info(f"fetching {url}")
-    r = requests.get(url, stream=True)
-    data = b""
-    total_size = int(r.headers.get("content-length", 0))
-    with open(filename, "wb") as file:
-        for chunk in tqdm(
-            r.iter_content(32 * 1024),
-            total=total_size,
-            unit="B",
-            unit_scale=True,
-            leave=False,
-        ):
-            if chunk:
-                file.write(chunk)
-                data += chunk
-    return data
-
 def comp_lower(a, b):
     return isinstance(a, str) and isinstance(b, str) and a.lower() == b.lower()

-def default_cache():
-    return os.environ["HOME"] + "/.local/state/pyrank"
-
 def get_dblp(url, cache=True, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
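The two progress-bar fetchers deleted above differed only in whether the downloaded bytes were also written to disk; the commit folds both into the single tools.download helper, whose optional filename argument covers the fgetwithpb case. A minimal sketch of how the old call sites map onto the new helper (the URL and path here are placeholders, not from the commit):

from tools import download

# formerly getwithpb(url): fetch into memory, with a progress bar
data = download("https://dblp.org/pid/example.xml")

# formerly fgetwithpb(url, filename): same fetch, plus an on-disk copy
data = download("https://dblp.org/pid/example.xml", "/tmp/example.xml")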
@@ -68,7 +25,7 @@ def get_dblp(url, cache=True, cache_dir=None):
     filename = "%s/%s" % (cache_dir, target.replace("/", "_"))
     os.makedirs(cache_dir, exist_ok=True)
     if not os.path.exists(filename) or not cache:
-        data = fgetwithpb(url, filename)
+        data = download(url, filename)
     else:
         with open(filename, "rb") as file:
             data = file.read()
@@ -115,7 +72,7 @@ def get_core_rank(name, year):
         source,
     )
-    data = getwithpb(url)
+    data = download(url)
    cc_soup = BeautifulSoup(data, "html.parser")
    table = cc_soup.find_all("table")
    if len(table) == 0:
@@ -129,125 +86,82 @@ def get_core_rank(name, year):
     return None

-def levenshteinDistanceDP(token1, token2):
-    distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
-    for t1 in range(len(token1) + 1):
-        distances[t1][0] = t1
-    for t2 in range(len(token2) + 1):
-        distances[0][t2] = t2
-    a = 0
-    b = 0
-    c = 0
-    for t1 in range(1, len(token1) + 1):
-        for t2 in range(1, len(token2) + 1):
-            if token1[t1 - 1] == token2[t2 - 1]:
-                distances[t1][t2] = distances[t1 - 1][t2 - 1]
-            else:
-                a = distances[t1][t2 - 1]
-                b = distances[t1 - 1][t2]
-                c = distances[t1 - 1][t2 - 1]
-                if a <= b and a <= c:
-                    distances[t1][t2] = a + 1
-                elif b <= a and b <= c:
-                    distances[t1][t2] = b + 1
-                else:
-                    distances[t1][t2] = c + 1
-    return distances[len(token1)][len(token2)]
-
-def list_to_hash(content):
-    return {tuple(elem[0]): elem[1] for elem in content}
-
-def load_ranking_caches(basename, cache_dir=None):
-    if cache_dir is None:
-        cache_dir = default_cache()
-    core = "%s/%s.json" % (cache_dir, basename)
-    if os.path.exists(core):
-        with open(core, "r") as fid:
-            # for elem in
-            return list_to_hash(json.load(fid))
-    return {}
-
-def hash_to_list(content):
-    return [[a, content[a]] for a in content]
-
-def save_ranking_caches(cache, basename, cache_dir=None):
-    if cache_dir is None:
-        cache_dir = default_cache()
-    os.makedirs(cache_dir, exist_ok=True)
-    core = "%s/%s.json" % (cache_dir, basename)
-    with open(core, "w") as fid:
-        json.dump(hash_to_list(cache), fid)
-
-def get_sjr_in_cache(rankings, str_year):
-    year = int(str_year)
-    if rankings == []:
-        return None
-    current = rankings[0]
-    for elem in rankings[1:]:
-        if year < elem[0]:
-            return current
-        current = elem
-    return current
-
-def get_sjr_rank(name):
-    url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
-    data = getwithpb(url)
-    sjr_soup = BeautifulSoup(data, "html.parser")
-    revues = sjr_soup.find("div", class_="search_results")
-    dist = -1
-    reference = None
-    best_name = None
-    for revue in revues.find_all("a"):
-        tmp = revue.find("span").text
-        lev = levenshteinDistanceDP(tmp, name)
-        if dist == -1 or lev < dist:
-            dist = lev
-            best_name = tmp
-            reference = "https://www.scimagojr.com/%s" % revue["href"]
-    if reference is None:
-        return []
-    data = getwithpb(reference)
-    sjr_soup = BeautifulSoup(data, "html.parser")
-    table = sjr_soup.find_all("table")
-    if len(table) == 0:
-        return []
-    df = pd.read_html(str(table))[0]
-    if "Quartile" in df:
-        df["Rank"] = [int(val[1]) for val in df.Quartile]
-    else:
-        return []
-    mins = df.groupby("Year").min().Rank
-    maxs = df.groupby("Year").max().Rank.to_dict()
-    result = []
-    for (y, v) in mins.items():
-        if v == maxs[y]:
-            ranking = "Q%s" % v
-        else:
-            ranking = "Q%s-Q%s" % (v, maxs[y])
-        result.append((y, best_name, ranking))
-    return result
+class Sjr:
+    def __init__(self):
+        self.ranking_caches = load_hash_caches("sjr")
+
+    def close(self):
+        save_hash_caches(self.ranking_caches, "sjr")
+
+    def get(self, name, second_name, year):
+        if (name, second_name) in self.ranking_caches:
+            rankings = self.ranking_caches[(name, second_name)]
+        else:
+            rankings = self.get_sjr_rank(name)
+            self.ranking_caches[(name, second_name)] = rankings
+        rank = get_in_ordered_list(rankings, int(year))
+        if rank is None:
+            return ["J", name, second_name, int(year), None, None, None]
+        else:
+            return ["J", name, second_name, int(year), rank[1], None, rank[2]]
+
+    def get_sjr_rank(self, name):
+        url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
+        data = download(url)
+        sjr_soup = BeautifulSoup(data, "html.parser")
+        revues = sjr_soup.find("div", class_="search_results")
+        dist = -1
+        reference = None
+        best_name = None
+        for revue in revues.find_all("a"):
+            tmp = revue.find("span").text
+            lev = levenshtein(tmp, name)
+            if dist == -1 or lev < dist:
+                dist = lev
+                best_name = tmp
+                reference = "https://www.scimagojr.com/%s" % revue["href"]
+        if reference is None:
+            return []
+        data = download(reference)
+        sjr_soup = BeautifulSoup(data, "html.parser")
+        table = sjr_soup.find_all("table")
+        if len(table) == 0:
+            return []
+        df = pd.read_html(str(table))[0]
+        if "Quartile" in df:
+            df["Rank"] = [int(val[1]) for val in df.Quartile]
+        else:
+            return []
+        mins = df.groupby("Year").min().Rank
+        maxs = df.groupby("Year").max().Rank.to_dict()
+        result = []
+        for (y, v) in mins.items():
+            if v == maxs[y]:
+                ranking = "Q%s" % v
+            else:
+                ranking = "Q%s-Q%s" % (v, maxs[y])
+            result.append((y, best_name, ranking))
+        return result

 def main():
-    sjr_ranking_caches = load_ranking_caches("sjr")
-    core_ranking_caches = load_ranking_caches("core")
+    sjr = Sjr()
+    #sjr_ranking_caches = load_hash_caches("sjr")
+    core_ranking_caches = load_hash_caches("core")

     parser = argparse.ArgumentParser(
         description="Get ranking from DBLP and show a small summary"
@@ -287,12 +201,14 @@ def main():
     logging.basicConfig(level=args.loglevel, format="%(levelname)s %(message)s")

     username, elements = get_dblp(url)
     print(username)
+    # Keeps only elements in the requested range
+    elements = [elem for elem in elements if start_year <= int(elem[-1]) <= end_year]

     result = []
     with logging_redirect_tqdm():
         for venue, name, second_name, year in tqdm(elements):
-            if start_year <= int(year) <= end_year:
-                if venue == "conf":
-                    if (name, second_name, year) in core_ranking_caches:
-                        rank = core_ranking_caches[(name, second_name, year)]
+            if venue == "conf":
+                if (name, second_name, year) in core_ranking_caches:
+                    rank = core_ranking_caches[(name, second_name, year)]
@@ -319,25 +235,13 @@ def main():
                     )
             elif venue == "journals":
-                if (name, second_name) in sjr_ranking_caches:
-                    rankings = sjr_ranking_caches[(name, second_name)]
-                else:
-                    rankings = get_sjr_rank(name)
-                    sjr_ranking_caches[(name, second_name)] = rankings
-                rank = get_sjr_in_cache(rankings, year)
-                if rank is None:
-                    result.append(["J", name, second_name, int(year), None, None, None])
-                else:
-                    result.append(["J", name, second_name, int(year), rank[1], None, rank[2]])
+                result.append(sjr.get(name, second_name, year))
             else:
                 tqdm.write(f"venue: {venue} ?")

-    save_ranking_caches(sjr_ranking_caches, "sjr")
-    save_ranking_caches(core_ranking_caches, "core")
+    #save_hash_caches(sjr_ranking_caches, "sjr")
+    save_hash_caches(core_ranking_caches, "core")
+    sjr.close()

     df = pd.DataFrame(
         result, columns=["type", "name", "short", "year", "longname", "acronym", "rank"]
     )
get_rankings/hash_cache.py (new file, +30, −0)
import json
import os


def default_cache():
    return os.environ["HOME"] + "/.local/state/pyrank"


def list_to_hash(content):
    return {tuple(elem[0]): elem[1] for elem in content}


def hash_to_list(content):
    return [[a, content[a]] for a in content]


def load_hash_caches(basename, cache_dir=None):
    if cache_dir is None:
        cache_dir = default_cache()
    core = "%s/%s.json" % (cache_dir, basename)
    if os.path.exists(core):
        with open(core, "r") as fid:
            # for elem in
            return list_to_hash(json.load(fid))
    return {}


def save_hash_caches(cache, basename, cache_dir=None):
    if cache_dir is None:
        cache_dir = default_cache()
    os.makedirs(cache_dir, exist_ok=True)
    core = "%s/%s.json" % (cache_dir, basename)
    with open(core, "w") as fid:
        json.dump(hash_to_list(cache), fid)
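Because JSON objects cannot have tuple keys, the caches are serialized as [key_list, value] pairs: hash_to_list flattens the dict on save and list_to_hash rebuilds the tuple keys on load. A round-trip sketch (the "demo" basename and the entry are invented for illustration):

cache = load_hash_caches("demo")                  # {} on a first run
cache[("CCGRID", "ccgrid")] = [[2019, "CCGRID", "A"]]
save_hash_caches(cache, "demo")                   # writes <cache_dir>/demo.json
assert load_hash_caches("demo") == cache          # tuple keys survive the round trip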
get_rankings/tools.py (new file, +69, −0)
import numpy
import requests
import logging
from tqdm import tqdm

LOG = logging.getLogger(__name__)


def levenshtein(token1, token2):
    distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1
    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
    a = 0
    b = 0
    c = 0
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if token1[t1 - 1] == token2[t2 - 1]:
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                if a <= b and a <= c:
                    distances[t1][t2] = a + 1
                elif b <= a and b <= c:
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1
    return distances[len(token1)][len(token2)]


def download(url, filename=None):
    LOG.info(f"fetching {url}")
    r = requests.get(url, stream=True)
    data = b""
    total_size = int(r.headers.get("content-length", 0))
    for chunk in tqdm(
        r.iter_content(32 * 1024),
        total=total_size,
        unit="B",
        unit_scale=True,
        leave=False,
    ):
        if chunk:
            data += chunk
    if not filename is None:
        with open(filename, "wb") as file:
            file.write(data)
    return data


def get_in_ordered_list(ordered_list, year):
    if ordered_list == []:
        return None
    current = ordered_list[0]
    for elem in ordered_list[1:]:
        if year < elem[0]:
            return current
        current = elem
    return current
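get_in_ordered_list assumes a list of (year, ...) entries sorted by year and returns the entry in force for the queried year, i.e. the last one starting at or before it; note that a query earlier than the first entry still returns that first entry. A small sketch of the two pure helpers (values invented for illustration):

rankings = [(2015, "FGCS", "Q2"), (2018, "FGCS", "Q1")]
get_in_ordered_list(rankings, 2016)   # -> (2015, "FGCS", "Q2")
get_in_ordered_list(rankings, 2020)   # -> (2018, "FGCS", "Q1")
levenshtein("kitten", "sitting")      # -> 3.0 (edit distance, as a numpy float)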