Commit 12702c6f authored by nicolas.ollinger

reformat & patch

parent 2a3146a8
1 merge request: !1 Improves date behavior
-#!/usr/bin/python3
+#!/usr/bin/env python3
 import os
 import requests
@@ -10,14 +10,20 @@ import numpy
 import json
 import argparse
 
+def comp_lower(a, b):
+    return isinstance(a, str) and isinstance(b, str) and a.lower() == b.lower()
+
 def default_cache():
-    return os.environ['HOME']+'/.local/state/pyrank'
+    return os.environ["HOME"] + "/.local/state/pyrank"
 
-def get_dblp(url, cache=True, cache_dir = None):
+def get_dblp(url, cache=True, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
-    _, target = url.split('//')
-    filename = '%s/%s' % (cache_dir, target.replace('/', '_'))
+    _, target = url.split("//")
+    filename = "%s/%s" % (cache_dir, target.replace("/", "_"))
     os.makedirs(cache_dir, exist_ok=True)
     if not os.path.exists(filename) or not cache:
         with open(filename, "wb") as file:
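Note on the hunk above: the new comp_lower helper compares two values case-insensitively only when both are strings, so a NaN cell coming out of a pandas table yields False instead of raising AttributeError on .lower(). A minimal sketch of the intended behavior, values illustrative:

    comp_lower("ICALP", "icalp")        # True
    comp_lower(float("nan"), "icalp")   # False: a NaN cell is a float, not a str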
@@ -27,57 +33,64 @@ def get_dblp(url, cache=True, cache_dir = None):
     else:
         with open(filename, "rb") as file:
             data = file.read()
-    soup = BeautifulSoup(data, 'html.parser')
+    soup = BeautifulSoup(data, "html.parser")
     articles = soup.find_all("li", class_="entry")
     res = []
     for a in articles:
-        if 'inproceedings' in a['class'] or 'article' in a['class']:
-            name = a.find("span", itemprop = 'isPartOf').find("span", itemprop = 'name').text
-            year = a.find("span", itemprop = 'datePublished').text
-            venue, second_name, _ = a['id'].split('/')
+        if "inproceedings" in a["class"] or "article" in a["class"]:
+            name = (
+                a.find("span", itemprop="isPartOf").find("span", itemprop="name").text
+            )
+            year = a.find("span", itemprop="datePublished").text
+            venue, second_name, _ = a["id"].split("/")
             res.append([venue, name, second_name, year])
     return soup.title.text, res
 
 def get_core_year(year):
     if year >= 2021:
-        return 'CORE2021'
+        return "CORE2021"
     if year >= 2020:
-        return 'CORE2020'
+        return "CORE2020"
     if year >= 2018:
-        return 'CORE2018'
+        return "CORE2018"
     if year >= 2017:
-        return 'CORE2017'
+        return "CORE2017"
     if year >= 2014:
-        return 'CORE2014'
+        return "CORE2014"
     if year >= 2013:
-        return 'CORE2013'
+        return "CORE2013"
     if year >= 2010:
-        return 'ERA2010'
+        return "ERA2010"
     return "CORE2008"
 
 def get_core_rank(name, year):
     source = get_core_year(int(year))
-    url = "http://portal.core.edu.au/conf-ranks/?search=%s&by=all&source=%s&page=1" % (name, source)
+    url = "http://portal.core.edu.au/conf-ranks/?search=%s&by=all&source=%s&page=1" % (
+        name,
+        source,
+    )
     response = requests.get(url)
     data = response.content
-    cc_soup = BeautifulSoup(data, 'html.parser')
-    table = cc_soup.find_all('table')
+    cc_soup = BeautifulSoup(data, "html.parser")
+    table = cc_soup.find_all("table")
     if len(table) == 0:
         return None
     df = pd.read_html(str(table))[0]
     for index, row in df.iterrows():
-        #print(name, year, ' ', row.Title, row.Acronym, row.Rank)
-        if row.Title.lower() == name.lower() or row.Acronym.lower() == name.lower():
+        # print(name, year, ' ', row.Title, row.Acronym, row.Rank)
+        if comp_lower(row.Title, name) or comp_lower(row.Acronym, name):
             return row.Rank, row.Title, row.Acronym
     return None
 
 def levenshteinDistanceDP(token1, token2):
     distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
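For orientation, get_dblp scrapes a DBLP page and returns the page title plus one [venue, name, second_name, year] record per inproceedings/article entry, and get_core_rank now matches CORE table rows through comp_lower. A hedged usage sketch (URL and return values illustrative, not taken from the commit):

    title, entries = get_dblp("https://dblp.org/pid/XX/YYYY.html")
    # entries ~ [["conf", "ICALP", "icalp", "2021"], ["journals", "Theor. Comput. Sci.", "tcs", "2019"], ...]
    get_core_rank("icalp", "2021")
    # ~ (rank, full_title, acronym) from the matching CORE row, or None if no table matches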
@@ -86,23 +99,23 @@ def levenshteinDistanceDP(token1, token2):
     for t2 in range(len(token2) + 1):
         distances[0][t2] = t2
 
     a = 0
     b = 0
     c = 0
 
     for t1 in range(1, len(token1) + 1):
         for t2 in range(1, len(token2) + 1):
-            if (token1[t1-1] == token2[t2-1]):
+            if token1[t1 - 1] == token2[t2 - 1]:
                 distances[t1][t2] = distances[t1 - 1][t2 - 1]
             else:
                 a = distances[t1][t2 - 1]
                 b = distances[t1 - 1][t2]
                 c = distances[t1 - 1][t2 - 1]
 
-                if (a <= b and a <= c):
+                if a <= b and a <= c:
                     distances[t1][t2] = a + 1
-                elif (b <= a and b <= c):
+                elif b <= a and b <= c:
                     distances[t1][t2] = b + 1
                 else:
                     distances[t1][t2] = c + 1
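The hunk above only strips redundant parentheses: levenshteinDistanceDP stays the classic dynamic program filling a (len(token1)+1) x (len(token2)+1) numpy matrix. The return statement falls outside the hunk, but presumably yields distances[len(token1)][len(token2)]. A quick sanity check under that assumption:

    levenshteinDistanceDP("kitten", "sitting")  # 3.0 (a numpy float, since distances comes from numpy.zeros)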
@@ -111,29 +124,33 @@ def levenshteinDistanceDP(token1, token2):
 def list_to_hash(content):
-    return {tuple(elem[0]):elem[1] for elem in content}
+    return {tuple(elem[0]): elem[1] for elem in content}
 
-def load_ranking_caches(basename, cache_dir = None):
+def load_ranking_caches(basename, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
-    core = '%s/%s.json' % (cache_dir, basename)
+    core = "%s/%s.json" % (cache_dir, basename)
     if os.path.exists(core):
-        with open(core, 'r') as fid:
-            #for elem in
+        with open(core, "r") as fid:
+            # for elem in
             return list_to_hash(json.load(fid))
     return {}
 
 def hash_to_list(content):
-    return [[a,content[a]] for a in content]
+    return [[a, content[a]] for a in content]
 
-def save_ranking_caches(cache, basename, cache_dir = None):
+def save_ranking_caches(cache, basename, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
     os.makedirs(cache_dir, exist_ok=True)
-    core = '%s/%s.json' % (cache_dir, basename)
-    with open(core, 'w') as fid:
+    core = "%s/%s.json" % (cache_dir, basename)
+    with open(core, "w") as fid:
         json.dump(hash_to_list(cache), fid)
 
 def get_sjr_in_cache(rankings, str_year):
     year = int(str_year)
     if rankings == []:
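Context for the two converters above: JSON has no tuple keys, so the ranking caches are stored on disk as lists of [key, value] pairs and rebuilt as tuple-keyed dicts on load. A round-trip sketch with an illustrative cache entry:

    cache = {("ICALP", "icalp", "2021"): ["A*", "Long Title", "ICALP"]}
    hash_to_list(cache)   # [[("ICALP", "icalp", "2021"), ["A*", "Long Title", "ICALP"]]]
    # json.dump writes the tuple key as a JSON array; list_to_hash turns it back on load:
    list_to_hash([[["ICALP", "icalp", "2021"], ["A*", "Long Title", "ICALP"]]])
    # {("ICALP", "icalp", "2021"): ["A*", "Long Title", "ICALP"]}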
@@ -145,58 +162,69 @@ def get_sjr_in_cache(rankings, str_year):
         current = elem
     return current
 
 def get_sjr_rank(name):
-    url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(' ', '+')
+    url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
     response = requests.get(url)
     data = response.content
-    sjr_soup = BeautifulSoup(data, 'html.parser')
-    revues = sjr_soup.find('div', class_='search_results')
+    sjr_soup = BeautifulSoup(data, "html.parser")
+    revues = sjr_soup.find("div", class_="search_results")
     dist = -1
     reference = None
     best_name = None
-    for revue in revues.find_all('a'):
-        tmp = revue.find('span').text
+    for revue in revues.find_all("a"):
+        tmp = revue.find("span").text
         lev = levenshteinDistanceDP(tmp, name)
         if dist == -1 or lev < dist:
             dist = lev
             best_name = tmp
-            reference = "https://www.scimagojr.com/%s" % revue['href']
+            reference = "https://www.scimagojr.com/%s" % revue["href"]
     if reference is None:
         return []
     response = requests.get(reference)
     data = response.content
-    sjr_soup = BeautifulSoup(data, 'html.parser')
-    table = sjr_soup.find_all('table')
+    sjr_soup = BeautifulSoup(data, "html.parser")
+    table = sjr_soup.find_all("table")
     if len(table) == 0:
         return []
     df = pd.read_html(str(table))[0]
-    df['Rank'] = [int(val[1]) for val in df.Quartile]
-    mins = df.groupby('Year').min().Rank
-    maxs = df.groupby('Year').max().Rank.to_dict()
+    if "Quartile" in df:
+        df["Rank"] = [int(val[1]) for val in df.Quartile]
+    else:
+        return []
+    mins = df.groupby("Year").min().Rank
+    maxs = df.groupby("Year").max().Rank.to_dict()
     result = []
     for (y, v) in mins.items():
         if v == maxs[y]:
-            ranking = 'Q%s' % v
+            ranking = "Q%s" % v
         else:
-            ranking = 'Q%s-Q%s' % (v, maxs[y])
+            ranking = "Q%s-Q%s" % (v, maxs[y])
         result.append((y, best_name, ranking))
     return result
 
 def main():
-    sjr_ranking_caches = load_ranking_caches('sjr')
-    core_ranking_caches = load_ranking_caches('core')
-    parser = argparse.ArgumentParser(description='Get ranking from DBLP and show a small summary')
-    parser.add_argument('url', help='DBLP url')
-    parser.add_argument('--start', type=int, default = -1, help='starting year')
-    parser.add_argument('--end', type=int, default = 10000, help='ending year')
-    parser.add_argument('-o', metavar=('output.csv'), default = None, help='output csv file')
-    parser.add_argument('-d', action='store_true', help='display conference and journal list')
+    sjr_ranking_caches = load_ranking_caches("sjr")
+    core_ranking_caches = load_ranking_caches("core")
+    parser = argparse.ArgumentParser(
+        description="Get ranking from DBLP and show a small summary"
+    )
+    parser.add_argument("url", help="DBLP url")
+    parser.add_argument("--start", type=int, default=-1, help="starting year")
+    parser.add_argument("--end", type=int, default=10000, help="ending year")
+    parser.add_argument(
+        "-o", metavar=("output.csv"), default=None, help="output csv file"
+    )
+    parser.add_argument(
+        "-d", action="store_true", help="display conference and journal list"
+    )
     args = parser.parse_args()
 
     url = args.url
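The Quartile guard above is the behavioral part of this hunk: a SCImago result page without a Quartile column now returns [] instead of raising. When the column is present, get_sjr_rank collapses the per-year quartiles into a single label per year. An illustrative result (journal and years not from the commit):

    get_sjr_rank("Theoretical Computer Science")
    # ~ [(2019, "Theoretical Computer Science", "Q2"), (2020, "Theoretical Computer Science", "Q1-Q2")]

A plain "Q1" means every category the journal belongs to sits in the same quartile that year; "Q1-Q2" spans the best and worst quartiles.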
@@ -204,13 +232,13 @@ def main():
     csv_output = args.o
     start_year = args.start
     display_list = args.d
 
     username, elements = get_dblp(url)
     print(username)
 
     result = []
     for venue, name, second_name, year in elements:
-        if venue == 'conf':
+        if venue == "conf":
             if (name, second_name, year) in core_ranking_caches:
                 rank = core_ranking_caches[(name, second_name, year)]
             else:
@@ -219,9 +247,11 @@ def main():
                 rank = get_core_rank(second_name, year)
                 core_ranking_caches[(name, second_name, year)] = rank
             if rank is None:
-                result.append(['C', name, second_name, int(year), None, None, None])
+                result.append(["C", name, second_name, int(year), None, None, None])
             else:
-                result.append(['C', name, second_name, int(year), rank[1], rank[2], rank[0]])
+                result.append(
+                    ["C", name, second_name, int(year), rank[1], rank[2], rank[0]]
+                )
         else:
             if (name, second_name) in sjr_ranking_caches:
@@ -231,42 +261,53 @@ def main():
                 sjr_ranking_caches[(name, second_name)] = rankings
             rank = get_sjr_in_cache(rankings, year)
             if rank is None:
-                result.append(['J', name, second_name, int(year), None, None, None])
+                result.append(["J", name, second_name, int(year), None, None, None])
             else:
-                result.append(['J', name, second_name, int(year), rank[1], None, rank[2]])
-    save_ranking_caches(sjr_ranking_caches, 'sjr')
-    save_ranking_caches(core_ranking_caches, 'core')
+                result.append(
+                    ["J", name, second_name, int(year), rank[1], None, rank[2]]
+                )
+    save_ranking_caches(sjr_ranking_caches, "sjr")
+    save_ranking_caches(core_ranking_caches, "core")
 
-    df = pd.DataFrame(result, columns=['type', 'name', 'short', 'year', 'longname', 'acronym', 'rank'])
-    if start_year != -1 :
-        print('Starting year', start_year)
+    df = pd.DataFrame(
+        result, columns=["type", "name", "short", "year", "longname", "acronym", "rank"]
+    )
+    df = df.fillna(value="")
+    if start_year != -1:
+        print("Starting year", start_year)
     else:
-        print('Starting year', min(df['year']))
+        print("Starting year", min(df["year"]))
     if end_year != 10000:
-        print('Ending year', end_year)
+        print("Ending year", end_year)
     else:
-        print('Ending year', max(df['year']))
+        print("Ending year", max(df["year"]))
 
-    selection = df[(df['year'] >= start_year) & (df['year'] <= end_year)]
-    print('Not found',
-          len(selection) - selection['rank'].count(),
-          'out of a total of',
-          len(selection))
+    selection = df[(df["year"] >= start_year) & (df["year"] <= end_year)]
+    print(
+        "Not found",
+        len(selection) - selection["rank"].count(),
+        "out of a total of",
+        len(selection),
+    )
 
-    evaluation = selection.groupby('rank').count()
-    print(evaluation.drop(['name', 'short', 'year', 'longname', 'acronym'], axis=1).rename(columns={'type':'number'}))
+    evaluation = selection.groupby("rank").count()
+    print(
+        evaluation.drop(
+            ["name", "short", "year", "longname", "acronym"], axis=1
+        ).rename(columns={"type": "number"})
+    )
 
     if not csv_output is None:
         selection.to_csv(csv_output, index=False)
     if display_list:
-        pd.set_option('display.max_rows', len(selection))
+        pd.set_option("display.max_rows", len(selection))
         print(selection)
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
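Taken together, a typical run of the patched script would look like this (the file name is hypothetical; the commit page does not show it, and the DBLP URL is illustrative):

    python3 pyrank.py https://dblp.org/pid/XX/YYYY.html --start 2018 --end 2023 -o ranks.csv -d

This prints the DBLP page title, the effective year range, the number of entries with no rank found, and a per-rank tally; with -d it also displays the full conference/journal list, and with -o it writes the selected rows to ranks.csv.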