Commit 19e53068 authored by Georges Da Costa

initial commit

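README.md 0 → 100644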
# DBLP ranking using CORE Rank and SJR
## Install
``` bash
pip install get_rankings
```
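Requires Python 3.6 or later (see `setup.py` below).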
## Run
```
get_rankings DBLP_URL
```
Gives an overview of a DBLP account. The first run is slow because ranking data has to be fetched; it is then cached (by default under `~/.local/state/pyrank`), so later runs are fast.
For example:
```
get_rankings https://dblp.org/pid/37/2282.html
```
## Usage
```
usage: get_rankings [-h] [--start START] [--end END] [-o output.csv] [-d] url

Get ranking from DBLP and show a small summary

positional arguments:
  url            DBLP url

options:
  -h, --help     show this help message and exit
  --start START  starting year
  --end END      ending year
  -o output.csv  output csv file
  -d             display conference and journal list
```
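For instance, to summarize only the years 2015 to 2020, export the selection to a CSV file, and print the per-publication list (using the example author URL from above):
```
get_rankings --start 2015 --end 2020 -o rankings.csv -d https://dblp.org/pid/37/2282.html
```
The CSV contains one row per publication, with the columns `type`, `name`, `short`, `year`, `longname`, `acronym`, and `rank`.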
## Thanks
Thanks to Laurent Reveillere ([dblp_ranker](https://github.com/reveillere/dblp_ranker)) and Xavier Blanc ([dblp_ranker](https://github.com/xblanc33/dblp_ranker)) for their initial versions in Node.js.
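get_rankings/get_rankings.py 0 → 100644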
#!/usr/bin/python3
import os
import json
import argparse

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy

def default_cache():
    # Cache directory for DBLP pages and ranking lookups.
    return os.environ['HOME'] + '/.local/state/pyrank'

def get_dblp(url, cache=True, cache_dir=None):
    # Download a DBLP author page, caching the HTML locally; return the
    # page title and a list of [venue_type, venue, venue_acronym, year].
    if cache_dir is None:
        cache_dir = default_cache()
    _, target = url.split('//')
    filename = '%s/%s' % (cache_dir, target.replace('/', '_'))
    os.makedirs(cache_dir, exist_ok=True)
    if not os.path.exists(filename) or not cache:
        # Fetch before opening the file so a failed request does not
        # leave an empty cache entry behind.
        response = requests.get(url)
        data = response.content
        with open(filename, "wb") as file:
            file.write(data)
    else:
        with open(filename, "rb") as file:
            data = file.read()
    soup = BeautifulSoup(data, 'html.parser')
    articles = soup.find_all("li", class_="entry")
    res = []
    for a in articles:
        # Keep only conference papers and journal articles.
        if 'inproceedings' in a['class'] or 'article' in a['class']:
            name = a.find("span", itemprop='isPartOf').find("span", itemprop='name').text
            year = a.find("span", itemprop='datePublished').text
            venue, second_name, _ = a['id'].split('/')
            res.append([venue, name, second_name, year])
    return soup.title.text, res

def get_core_year(year):
    if year >= 2021:
        return 'CORE2021'
    if year >= 2020:
        return 'CORE2020'
    if year >= 2018:
        return 'CORE2018'
    if year >= 2017:
        return 'CORE2017'
    if year >= 2014:
        return 'CORE2014'
    if year >= 2013:
        return 'CORE2013'
    if year >= 2010:
        return 'ERA2010'
    return 'CORE2008'

def get_core_rank(name, year):
    # Query the CORE portal with the ranking edition matching the year;
    # return (rank, title, acronym) on an exact title or acronym match.
    source = get_core_year(int(year))
    url = "http://portal.core.edu.au/conf-ranks/?search=%s&by=all&source=%s&page=1" % (name, source)
    response = requests.get(url)
    data = response.content
    cc_soup = BeautifulSoup(data, 'html.parser')
    table = cc_soup.find_all('table')
    if len(table) == 0:
        return None
    df = pd.read_html(str(table))[0]
    for index, row in df.iterrows():
        if row.Title.lower() == name.lower() or row.Acronym.lower() == name.lower():
            return row.Rank, row.Title, row.Acronym
    return None

def levenshteinDistanceDP(token1, token2):
    # Dynamic-programming edit distance between two strings.
    distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1
    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if token1[t1 - 1] == token2[t2 - 1]:
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                # 1 + min(insertion, deletion, substitution)
                distances[t1][t2] = 1 + min(distances[t1][t2 - 1],
                                            distances[t1 - 1][t2],
                                            distances[t1 - 1][t2 - 1])
    return distances[len(token1)][len(token2)]

def list_to_hash(content):
    # JSON cannot store tuple keys, so caches are saved as
    # [[key, value], ...] pairs; rebuild the {tuple(key): value} dict.
    return {tuple(elem[0]): elem[1] for elem in content}

def load_ranking_caches(basename, cache_dir=None):
    if cache_dir is None:
        cache_dir = default_cache()
    core = '%s/%s.json' % (cache_dir, basename)
    if os.path.exists(core):
        with open(core, 'r') as fid:
            return list_to_hash(json.load(fid))
    return {}

def hash_to_list(content):
    return [[a, content[a]] for a in content]

def save_ranking_caches(cache, basename, cache_dir=None):
    if cache_dir is None:
        cache_dir = default_cache()
    os.makedirs(cache_dir, exist_ok=True)
    core = '%s/%s.json' % (cache_dir, basename)
    with open(core, 'w') as fid:
        json.dump(hash_to_list(cache), fid)

def get_sjr_in_cache(rankings, str_year):
    # rankings is a year-sorted list of (year, journal, quartile) entries;
    # return the entry in force for the requested year.
    year = int(str_year)
    if not rankings:
        return None
    current = rankings[0]
    for elem in rankings[1:]:
        if year < elem[0]:
            return current
        current = elem
    return current

def get_sjr_rank(name):
    # Search SCImago for the journal, pick the closest title by
    # Levenshtein distance, and return its quartile history as a
    # list of (year, journal_name, quartile) tuples.
    url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(' ', '+')
    response = requests.get(url)
    data = response.content
    sjr_soup = BeautifulSoup(data, 'html.parser')
    revues = sjr_soup.find('div', class_='search_results')
    if revues is None:
        return []
    dist = -1
    reference = None
    best_name = None
    for revue in revues.find_all('a'):
        tmp = revue.find('span').text
        lev = levenshteinDistanceDP(tmp, name)
        if dist == -1 or lev < dist:
            dist = lev
            best_name = tmp
            reference = "https://www.scimagojr.com/%s" % revue['href']
    if reference is None:
        return []
    response = requests.get(reference)
    data = response.content
    sjr_soup = BeautifulSoup(data, 'html.parser')
    table = sjr_soup.find_all('table')
    if len(table) == 0:
        return []
    df = pd.read_html(str(table))[0]
    # Quartile cells look like 'Q1'; keep the digit for per-year min/max.
    df['Rank'] = [int(val[1]) for val in df.Quartile]
    mins = df.groupby('Year').Rank.min()
    maxs = df.groupby('Year').Rank.max().to_dict()
    result = []
    for (y, v) in mins.items():
        if v == maxs[y]:
            ranking = 'Q%s' % v
        else:
            ranking = 'Q%s-Q%s' % (v, maxs[y])
        result.append((y, best_name, ranking))
    return result

def main():
    sjr_ranking_caches = load_ranking_caches('sjr')
    core_ranking_caches = load_ranking_caches('core')

    parser = argparse.ArgumentParser(description='Get ranking from DBLP and show a small summary')
    parser.add_argument('url', help='DBLP url')
    parser.add_argument('--start', type=int, default=-1, help='starting year')
    parser.add_argument('--end', type=int, default=10000, help='ending year')
    parser.add_argument('-o', metavar='output.csv', default=None, help='output csv file')
    parser.add_argument('-d', action='store_true', help='display conference and journal list')
    args = parser.parse_args()

    url = args.url
    end_year = args.end
    csv_output = args.o
    start_year = args.start
    display_list = args.d

    username, elements = get_dblp(url)
    print(username)

    result = []
    for venue, name, second_name, year in elements:
        if venue == 'conf':
            # Conferences: CORE rank, looked up by name and then by acronym.
            if (name, second_name, year) in core_ranking_caches:
                rank = core_ranking_caches[(name, second_name, year)]
            else:
                rank = get_core_rank(name, year)
                if rank is None:
                    rank = get_core_rank(second_name, year)
                core_ranking_caches[(name, second_name, year)] = rank
            if rank is None:
                result.append(['C', name, second_name, int(year), None, None, None])
            else:
                result.append(['C', name, second_name, int(year), rank[1], rank[2], rank[0]])
        else:
            # Journals: SJR quartile in force for the publication year.
            if (name, second_name) in sjr_ranking_caches:
                rankings = sjr_ranking_caches[(name, second_name)]
            else:
                rankings = get_sjr_rank(name)
                sjr_ranking_caches[(name, second_name)] = rankings
            rank = get_sjr_in_cache(rankings, year)
            if rank is None:
                result.append(['J', name, second_name, int(year), None, None, None])
            else:
                result.append(['J', name, second_name, int(year), rank[1], None, rank[2]])

    save_ranking_caches(sjr_ranking_caches, 'sjr')
    save_ranking_caches(core_ranking_caches, 'core')

    df = pd.DataFrame(result, columns=['type', 'name', 'short', 'year', 'longname', 'acronym', 'rank'])
    if start_year != -1:
        print('Starting year', start_year)
    else:
        print('Starting year', min(df['year']))
    if end_year != 10000:
        print('Ending year', end_year)
    else:
        print('Ending year', max(df['year']))

    selection = df[(df['year'] >= start_year) & (df['year'] <= end_year)]
    print('Not found',
          len(selection) - selection['rank'].count(),
          'out of a total of',
          len(selection))
    evaluation = selection.groupby('rank').count()
    print(evaluation.drop(['name', 'short', 'year', 'longname', 'acronym'], axis=1).rename(columns={'type': 'number'}))
    if csv_output is not None:
        selection.to_csv(csv_output, index=False)
    if display_list:
        pd.set_option('display.max_rows', len(selection))
        print(selection)

if __name__ == '__main__':
    main()
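The following commands rebuild the distribution archives and upload them with twine: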
\rm dist/*
python3 setup.py sdist bdist_wheel
python3 -m twine upload dist/*
setup.py 0 → 100644
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="get_rankings",
    version="0.3",
    author="Georges Da Costa",
    author_email="georges.da-costa@irit.fr",
    description="DBLP ranking using CORE Rank and SJR",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://gitlab.irit.fr/sepia-pub/dacosta/get-rankings",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    # The script also imports pandas and numpy, so declare them here.
    install_requires=['requests', 'BeautifulSoup4', 'pandas', 'numpy'],
    entry_points={
        'console_scripts': [
            'get_rankings = get_rankings.get_rankings:main',
        ]
    },
)