Commit cebf5d40, authored 2 years ago by Georges Da Costa

Merge branch 'nopid' into 'main'

Improves date behavior

See merge request !1

Parents: 2a3146a8, d4dca0bb
Related merge request: !1 "Improves date behavior"
Changes: 2 changed files, with 239 additions and 129 deletions

  get_rankings/get_rankings.py   +238 −128
  setup.py                       +1 −1
get_rankings/get_rankings.py  +238 −128

-#!/usr/bin/python3
+#!/usr/bin/env python3
+import logging
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
 import os
 import requests
 import datetime
...
@@ -10,74 +13,122 @@ import numpy
 import json
 import argparse

+LOG = logging.getLogger(__name__)
+
+
+def getwithpb(url):
+    LOG.info(f"fetching {url}")
+    r = requests.get(url, stream=True)
+    data = b""
+    total_size = int(r.headers.get("content-length", 0))
+    for chunk in tqdm(
+        r.iter_content(32 * 1024),
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        leave=False,
+    ):
+        if chunk:
+            data += chunk
+    return data
+
+
+def fgetwithpb(url, filename):
+    LOG.info(f"fetching {url}")
+    r = requests.get(url, stream=True)
+    data = b""
+    total_size = int(r.headers.get("content-length", 0))
+    with open(filename, "wb") as file:
+        for chunk in tqdm(
+            r.iter_content(32 * 1024),
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            leave=False,
+        ):
+            if chunk:
+                file.write(chunk)
+                data += chunk
+    return data
+
+
+def comp_lower(a, b):
+    return isinstance(a, str) and isinstance(b, str) and a.lower() == b.lower()
+
+
 def default_cache():
-    return os.environ['HOME'] + '/.local/state/pyrank'
+    return os.environ["HOME"] + "/.local/state/pyrank"


 def get_dblp(url, cache=True, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
-    _, target = url.split('//')
-    filename = '%s/%s' % (cache_dir, target.replace('/', '_'))
+    _, target = url.split("//")
+    filename = "%s/%s" % (cache_dir, target.replace("/", "_"))
     os.makedirs(cache_dir, exist_ok=True)
     if not os.path.exists(filename) or not cache:
-        with open(filename, "wb") as file:
-            response = requests.get(url)
-            data = response.content
-            file.write(data)
+        data = fgetwithpb(url, filename)
     else:
         with open(filename, "rb") as file:
             data = file.read()

-    soup = BeautifulSoup(data, 'html.parser')
+    soup = BeautifulSoup(data, "html.parser")
     articles = soup.find_all("li", class_="entry")
     res = []
     for a in articles:
-        if 'inproceedings' in a['class'] or 'article' in a['class']:
-            name = a.find("span", itemprop='isPartOf').find("span", itemprop='name').text
-            year = a.find("span", itemprop='datePublished').text
-            venue, second_name, _ = a['id'].split('/')
+        if "inproceedings" in a["class"] or "article" in a["class"]:
+            name = (
+                a.find("span", itemprop="isPartOf").find("span", itemprop="name").text
+            )
+            year = a.find("span", itemprop="datePublished").text
+            venue, second_name, _ = a["id"].split("/")
             res.append([venue, name, second_name, year])
     return soup.title.text, res


 def get_core_year(year):
     if year >= 2021:
-        return 'CORE2021'
+        return "CORE2021"
     if year >= 2020:
-        return 'CORE2020'
+        return "CORE2020"
     if year >= 2018:
-        return 'CORE2018'
+        return "CORE2018"
     if year >= 2017:
-        return 'CORE2017'
+        return "CORE2017"
     if year >= 2014:
-        return 'CORE2014'
+        return "CORE2014"
     if year >= 2013:
-        return 'CORE2013'
+        return "CORE2013"
     if year >= 2010:
-        return 'ERA2010'
+        return "ERA2010"
     return "CORE2008"


 def get_core_rank(name, year):
     source = get_core_year(int(year))
-    url = "http://portal.core.edu.au/conf-ranks/?search=%s&by=all&source=%s&page=1" % (name, source)
-    response = requests.get(url)
-    data = response.content
-    cc_soup = BeautifulSoup(data, 'html.parser')
-    table = cc_soup.find_all('table')
+    url = "http://portal.core.edu.au/conf-ranks/?search=%s&by=all&source=%s&page=1" % (
+        name,
+        source,
+    )
+    data = getwithpb(url)
+    cc_soup = BeautifulSoup(data, "html.parser")
+    table = cc_soup.find_all("table")
     if len(table) == 0:
         return None
     df = pd.read_html(str(table))[0]
     for index, row in df.iterrows():
-        #print(name, year, ' ', row.Title, row.Acronym, row.Rank)
-        if row.Title.lower() == name.lower() or row.Acronym.lower() == name.lower():
+        # print(name, year, ' ', row.Title, row.Acronym, row.Rank)
+        if comp_lower(row.Title, name) or comp_lower(row.Acronym, name):
            return row.Rank, row.Title, row.Acronym
     return None


 def levenshteinDistanceDP(token1, token2):
     distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
...
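Note: the two helpers added above (getwithpb, fgetwithpb) wrap requests' streaming mode in a tqdm progress bar: both read the body in 32 KiB chunks and report progress against the Content-Length header, with fgetwithpb additionally mirroring the bytes to a cache file. A minimal sketch of how the rest of the module uses them (URL and path are illustrative, not from the commit):

    # Illustrative only: fetch a page with a progress bar, optionally caching it.
    page = getwithpb("https://dblp.org/pid/example.html")   # bytes, kept in memory
    data = fgetwithpb("https://dblp.org/pid/example.html",
                      "/tmp/pyrank/example.html")           # bytes, plus on-disk copy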
@@ -86,23 +137,23 @@ def levenshteinDistanceDP(token1, token2):
     for t2 in range(len(token2) + 1):
         distances[0][t2] = t2

     a = 0
     b = 0
     c = 0

     for t1 in range(1, len(token1) + 1):
         for t2 in range(1, len(token2) + 1):
-            if (token1[t1 - 1] == token2[t2 - 1]):
+            if token1[t1 - 1] == token2[t2 - 1]:
                 distances[t1][t2] = distances[t1 - 1][t2 - 1]
             else:
                 a = distances[t1][t2 - 1]
                 b = distances[t1 - 1][t2]
                 c = distances[t1 - 1][t2 - 1]

-                if (a <= b and a <= c):
+                if a <= b and a <= c:
                     distances[t1][t2] = a + 1
-                elif (b <= a and b <= c):
+                elif b <= a and b <= c:
                     distances[t1][t2] = b + 1
                 else:
                     distances[t1][t2] = c + 1
...
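Note: this loop is the standard dynamic-programming edit distance over a numpy table; assuming the elided tail returns distances[len(token1)][len(token2)] as in the usual formulation, it behaves like this (values follow from the algorithm, not from the commit):

    levenshteinDistanceDP("kitten", "sitting")  # -> 3.0 (numpy float)
    levenshteinDistanceDP("IPDPS", "ipdps")     # -> 5.0: comparison is case-sensitive

The case sensitivity is why exact matches elsewhere go through the new comp_lower helper, while get_sjr_rank only uses the distance to pick the closest journal title.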
@@ -111,29 +162,33 @@ def levenshteinDistanceDP(token1, token2):
 def list_to_hash(content):
     return {tuple(elem[0]): elem[1] for elem in content}


 def load_ranking_caches(basename, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
-    core = '%s/%s.json' % (cache_dir, basename)
+    core = "%s/%s.json" % (cache_dir, basename)
     if os.path.exists(core):
-        with open(core, 'r') as fid:
-            #for elem in
+        with open(core, "r") as fid:
+            # for elem in
             return list_to_hash(json.load(fid))
     return {}


 def hash_to_list(content):
     return [[a, content[a]] for a in content]


 def save_ranking_caches(cache, basename, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
     os.makedirs(cache_dir, exist_ok=True)
-    core = '%s/%s.json' % (cache_dir, basename)
-    with open(core, 'w') as fid:
+    core = "%s/%s.json" % (cache_dir, basename)
+    with open(core, "w") as fid:
         json.dump(hash_to_list(cache), fid)


 def get_sjr_in_cache(rankings, str_year):
     year = int(str_year)
     if rankings == []:
...
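Note: JSON objects cannot be keyed by tuples, so the ranking cache is serialised as a list of [key, value] pairs (hash_to_list) and rebuilt into a tuple-keyed dict on load (list_to_hash). A sketch of the round trip, with an invented entry:

    # Hypothetical cache entry: (name, acronym, year) -> CORE rank triple.
    cache = {("Euro-Par", "europar", "2019"): ["A", "Euro-Par", "Euro-Par"]}
    encoded = json.dumps(hash_to_list(cache))      # tuples become JSON arrays
    restored = list_to_hash(json.loads(encoded))   # tuple keys are restored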
@@ -145,58 +200,83 @@ def get_sjr_in_cache(rankings, str_year):
             current = elem
     return current


 def get_sjr_rank(name):
-    url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(' ', '+')
-    response = requests.get(url)
-    data = response.content
-    sjr_soup = BeautifulSoup(data, 'html.parser')
+    url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
+    data = getwithpb(url)
+    sjr_soup = BeautifulSoup(data, "html.parser")

-    revues = sjr_soup.find('div', class_='search_results')
+    revues = sjr_soup.find("div", class_="search_results")
     dist = -1
     reference = None
     best_name = None
-    for revue in revues.find_all('a'):
-        tmp = revue.find('span').text
+    for revue in revues.find_all("a"):
+        tmp = revue.find("span").text
         lev = levenshteinDistanceDP(tmp, name)
         if dist == -1 or lev < dist:
             dist = lev
             best_name = tmp
-            reference = "https://www.scimagojr.com/%s" % revue['href']
+            reference = "https://www.scimagojr.com/%s" % revue["href"]

     if reference is None:
         return []

-    response = requests.get(reference)
-    data = response.content
-    sjr_soup = BeautifulSoup(data, 'html.parser')
-    table = sjr_soup.find_all('table')
+    data = getwithpb(reference)
+    sjr_soup = BeautifulSoup(data, "html.parser")
+    table = sjr_soup.find_all("table")

     if len(table) == 0:
         return []

     df = pd.read_html(str(table))[0]
-    df['Rank'] = [int(val[1]) for val in df.Quartile]
-    mins = df.groupby('Year').min().Rank
-    maxs = df.groupby('Year').max().Rank.to_dict()
+    if "Quartile" in df:
+        df["Rank"] = [int(val[1]) for val in df.Quartile]
+    else:
+        return []
+    mins = df.groupby("Year").min().Rank
+    maxs = df.groupby("Year").max().Rank.to_dict()
     result = []
     for (y, v) in mins.items():
         if v == maxs[y]:
-            ranking = 'Q%s' % v
+            ranking = "Q%s" % v
         else:
-            ranking = 'Q%s-Q%s' % (v, maxs[y])
+            ranking = "Q%s-Q%s" % (v, maxs[y])
         result.append((y, best_name, ranking))
     return result


 def main():
-    sjr_ranking_caches = load_ranking_caches('sjr')
-    core_ranking_caches = load_ranking_caches('core')
-    parser = argparse.ArgumentParser(description='Get ranking from DBLP and show a small summary')
-    parser.add_argument('url', help='DBLP url')
-    parser.add_argument('--start', type=int, default=-1, help='starting year')
-    parser.add_argument('--end', type=int, default=10000, help='ending year')
-    parser.add_argument('-o', metavar=('output.csv'), default=None, help='output csv file')
-    parser.add_argument('-d', action='store_true', help='display conference and journal list')
+    sjr_ranking_caches = load_ranking_caches("sjr")
+    core_ranking_caches = load_ranking_caches("core")
+    parser = argparse.ArgumentParser(
+        description="Get ranking from DBLP and show a small summary"
+    )
+    parser.add_argument("url", help="DBLP url")
+    parser.add_argument("--start", type=int, default=-1, help="starting year")
+    parser.add_argument("--end", type=int, default=10000, help="ending year")
+    parser.add_argument(
+        "-o", metavar=("output.csv"), default=None, help="output csv file"
+    )
+    parser.add_argument(
+        "-d", action="store_true", help="display conference and journal list"
+    )
+    parser.add_argument(
+        "--debug",
+        help="Print lots of debugging statements",
+        action="store_const",
+        dest="loglevel",
+        const=logging.DEBUG,
+        default=logging.WARNING,
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        help="Be verbose",
+        action="store_const",
+        dest="loglevel",
+        const=logging.INFO,
+    )
     args = parser.parse_args()
     url = args.url
...
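Note: the two new flags feed the same loglevel destination via store_const, so -v selects logging.INFO, --debug selects logging.DEBUG, and the default stays logging.WARNING; main() then hands the value to logging.basicConfig (see the next hunk). A sketch (the URL is a placeholder):

    args = parser.parse_args(["https://dblp.org/pid/x.html", "-v"])
    assert args.loglevel == logging.INFO   # WARNING by default, DEBUG with --debug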
@@ -204,69 +284,99 @@ def main():
     csv_output = args.o
     start_year = args.start
     display_list = args.d
+    logging.basicConfig(level=args.loglevel, format="%(levelname)s %(message)s")

     username, elements = get_dblp(url)
     print(username)
     result = []
-    for venue, name, second_name, year in elements:
-        if venue == 'conf':
-            if (name, second_name, year) in core_ranking_caches:
-                rank = core_ranking_caches[(name, second_name, year)]
-            else:
-                rank = get_core_rank(name, year)
-                if rank is None:
-                    rank = get_core_rank(second_name, year)
-                core_ranking_caches[(name, second_name, year)] = rank
-            if rank is None:
-                result.append(['C', name, second_name, int(year), None, None, None])
-            else:
-                result.append(['C', name, second_name, int(year), rank[1], rank[2], rank[0]])
-        else:
-            if (name, second_name) in sjr_ranking_caches:
-                rankings = sjr_ranking_caches[(name, second_name)]
-            else:
-                rankings = get_sjr_rank(name)
-                sjr_ranking_caches[(name, second_name)] = rankings
-            rank = get_sjr_in_cache(rankings, year)
-            if rank is None:
-                result.append(['J', name, second_name, int(year), None, None, None])
-            else:
-                result.append(['J', name, second_name, int(year), rank[1], None, rank[2]])
-    save_ranking_caches(sjr_ranking_caches, 'sjr')
-    save_ranking_caches(core_ranking_caches, 'core')
-    df = pd.DataFrame(result, columns=['type', 'name', 'short', 'year', 'longname', 'acronym', 'rank'])
+    with logging_redirect_tqdm():
+        for venue, name, second_name, year in tqdm(elements):
+            if start_year <= int(year) <= end_year:
+                if venue == "conf":
+                    if (name, second_name, year) in core_ranking_caches:
+                        rank = core_ranking_caches[(name, second_name, year)]
+                    else:
+                        rank = get_core_rank(name, year)
+                        if rank is None:
+                            rank = get_core_rank(second_name, year)
+                        core_ranking_caches[(name, second_name, year)] = rank
+                    if rank is None:
+                        result.append(
+                            ["C", name, second_name, int(year), None, None, None]
+                        )
+                    else:
+                        result.append(
+                            [
+                                "C",
+                                name,
+                                second_name,
+                                int(year),
+                                rank[1],
+                                rank[2],
+                                rank[0],
+                            ]
+                        )
+                elif venue == "journals":
+                    if (name, second_name) in sjr_ranking_caches:
+                        rankings = sjr_ranking_caches[(name, second_name)]
+                    else:
+                        rankings = get_sjr_rank(name)
+                        sjr_ranking_caches[(name, second_name)] = rankings
+                    rank = get_sjr_in_cache(rankings, year)
+                    if rank is None:
+                        result.append(
+                            ["J", name, second_name, int(year), None, None, None]
+                        )
+                    else:
+                        result.append(
+                            ["J", name, second_name, int(year), rank[1], None, rank[2]]
+                        )
+                else:
+                    tqdm.write(f"venue: {venue}?")
+    save_ranking_caches(sjr_ranking_caches, "sjr")
+    save_ranking_caches(core_ranking_caches, "core")
+    df = pd.DataFrame(
+        result,
+        columns=["type", "name", "short", "year", "longname", "acronym", "rank"],
+    )
+    df = df.fillna(value="")
     if start_year != -1:
-        print('Starting year', start_year)
+        print("Starting year", start_year)
     else:
-        print('Starting year', min(df['year']))
+        print("Starting year", min(df["year"]))
     if end_year != 10000:
-        print('Ending year', end_year)
+        print("Ending year", end_year)
     else:
-        print('Ending year', max(df['year']))
-    selection = df[(df['year'] >= start_year) & (df['year'] <= end_year)]
-    print('Not found', len(selection) - selection['rank'].count(), 'out of a total of', len(selection))
-    evaluation = selection.groupby('rank').count()
-    print(evaluation.drop(['name', 'short', 'year', 'longname', 'acronym'], axis=1).rename(columns={'type': 'number'}))
+        print("Ending year", max(df["year"]))
+    selection = df[(df["year"] >= start_year) & (df["year"] <= end_year)]
+    print(
+        "Not found",
+        len(selection) - selection["rank"].count(),
+        "out of a total of",
+        len(selection),
+    )
+    evaluation = selection.groupby("rank").count()
+    print(
+        evaluation.drop(["name", "short", "year", "longname", "acronym"], axis=1).rename(
+            columns={"type": "number"}
+        )
+    )
     if not csv_output is None:
         selection.to_csv(csv_output, index=False)
     if display_list:
-        pd.set_option('display.max_rows', len(selection))
+        pd.set_option("display.max_rows", len(selection))
         print(selection)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
setup.py  +1 −1
...
@@ -19,7 +19,7 @@ setuptools.setup(
         "Operating System :: OS Independent",
     ],
     python_requires='>=3.6',
-    install_requires=['requests', 'BeautifulSoup4', 'datetime', 'parsedate', 'pandas', 'numpy', 'argparse', 'lxml'],
+    install_requires=['requests', 'BeautifulSoup4', 'datetime', 'parsedate', 'pandas', 'numpy', 'argparse', 'lxml', 'tqdm'],
     entry_points={
         'console_scripts': [
             'get_rankings = get_rankings.get_rankings:main',
...