diff --git a/get_rankings/get_rankings.py b/get_rankings/get_rankings.py
index ec4354607172ab3834339c686aeb9e8ec45472f9..3bcf288467c0e1a7d88f58d5ed6f65b41fda4d24 100755
--- a/get_rankings/get_rankings.py
+++ b/get_rankings/get_rankings.py
@@ -1,5 +1,8 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
+import logging
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
 import os
 import requests
 import datetime
@@ -10,74 +13,122 @@ import numpy
 import json
 import argparse
 
+
+LOG = logging.getLogger(__name__)
+
+
+def getwithpb(url):
+    LOG.info(f"fetching {url}")
+    r = requests.get(url, stream=True)
+    data = b""
+    total_size = int(r.headers.get("content-length", 0))
+    with tqdm(
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        leave=False,
+    ) as pbar:
+        for chunk in r.iter_content(32 * 1024):
+            if chunk:
+                data += chunk
+                pbar.update(len(chunk))
+    return data
+
+
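+# Variant of getwithpb that streams the payload into `filename` while
+# downloading, so the cached copy on disk and the returned bytes stay in sync.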
+def fgetwithpb(url, filename):
+    LOG.info(f"fetching {url}")
+    r = requests.get(url, stream=True)
+    data = b""
+    total_size = int(r.headers.get("content-length", 0))
+    with open(filename, "wb") as file, tqdm(
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        leave=False,
+    ) as pbar:
+        for chunk in r.iter_content(32 * 1024):
+            if chunk:
+                file.write(chunk)
+                data += chunk
+                pbar.update(len(chunk))
+    return data
+
+
+def comp_lower(a, b):
+    return isinstance(a, str) and isinstance(b, str) and a.lower() == b.lower()
+
+
 def default_cache():
-    return os.environ['HOME']+'/.local/state/pyrank'
+    return os.environ["HOME"] + "/.local/state/pyrank"
+
 
-def get_dblp(url, cache=True, cache_dir = None):
+def get_dblp(url, cache=True, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
-    _, target = url.split('//')
-    filename = '%s/%s' % (cache_dir, target.replace('/', '_'))
+    _, target = url.split("//")
+    filename = "%s/%s" % (cache_dir, target.replace("/", "_"))
     os.makedirs(cache_dir, exist_ok=True)
 
     if not os.path.exists(filename) or not cache:
-        with open(filename, "wb") as file:
-            response = requests.get(url)
-            data = response.content
-            file.write(data)
+        data = fgetwithpb(url, filename)
     else:
         with open(filename, "rb") as file:
             data = file.read()
-
-    soup = BeautifulSoup(data, 'html.parser')
+
+    soup = BeautifulSoup(data, "html.parser")
     articles = soup.find_all("li", class_="entry")
     res = []
     for a in articles:
-        if 'inproceedings' in a['class'] or 'article' in a['class']:
-            name = a.find("span", itemprop = 'isPartOf').find("span", itemprop = 'name').text
-            year = a.find("span", itemprop = 'datePublished').text
-            venue, second_name, _ = a['id'].split('/')
+        if "inproceedings" in a["class"] or "article" in a["class"]:
+            name = (
+                a.find("span", itemprop="isPartOf").find("span", itemprop="name").text
+            )
+            year = a.find("span", itemprop="datePublished").text
+            venue, second_name, _ = a["id"].split("/")
             res.append([venue, name, second_name, year])
     return soup.title.text, res
 
+
 def get_core_year(year):
     if year >= 2021:
-        return 'CORE2021'
+        return "CORE2021"
     if year >= 2020:
-        return 'CORE2020'
+        return "CORE2020"
     if year >= 2018:
-        return 'CORE2018'
+        return "CORE2018"
     if year >= 2017:
-        return 'CORE2017'
+        return "CORE2017"
     if year >= 2014:
-        return 'CORE2014'
+        return "CORE2014"
     if year >= 2013:
-        return 'CORE2013'
+        return "CORE2013"
     if year >= 2010:
-        return 'ERA2010'
+        return "ERA2010"
     return "CORE2008"
 
+
 def get_core_rank(name, year):
     source = get_core_year(int(year))
-    url = "http://portal.core.edu.au/conf-ranks/?search=%s&by=all&source=%s&page=1" % (name, source)
+    url = "http://portal.core.edu.au/conf-ranks/?search=%s&by=all&source=%s&page=1" % (
+        name,
+        source,
+    )
 
-    response = requests.get(url)
-    data = response.content
-    cc_soup = BeautifulSoup(data, 'html.parser')
-    table = cc_soup.find_all('table')
+    data = getwithpb(url)
+    cc_soup = BeautifulSoup(data, "html.parser")
+    table = cc_soup.find_all("table")
     if len(table) == 0:
         return None
     df = pd.read_html(str(table))[0]
     for index, row in df.iterrows():
-        #print(name, year, ' ', row.Title, row.Acronym, row.Rank)
-
-        if row.Title.lower() == name.lower() or row.Acronym.lower() == name.lower():
+        # print(name, year, ' ', row.Title, row.Acronym, row.Rank)
+        if comp_lower(row.Title, name) or comp_lower(row.Acronym, name):
             return row.Rank, row.Title, row.Acronym
     return None
 
+
 def levenshteinDistanceDP(token1, token2):
     distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
 
@@ -86,23 +137,23 @@ def levenshteinDistanceDP(token1, token2):
     for t2 in range(len(token2) + 1):
         distances[0][t2] = t2
-    
+
     a = 0
     b = 0
     c = 0
-    
+
     for t1 in range(1, len(token1) + 1):
         for t2 in range(1, len(token2) + 1):
-            if (token1[t1-1] == token2[t2-1]):
+            if token1[t1 - 1] == token2[t2 - 1]:
                 distances[t1][t2] = distances[t1 - 1][t2 - 1]
             else:
                 a = distances[t1][t2 - 1]
                 b = distances[t1 - 1][t2]
                 c = distances[t1 - 1][t2 - 1]
-                
-                if (a <= b and a <= c):
+
+                if a <= b and a <= c:
                     distances[t1][t2] = a + 1
-                elif (b <= a and b <= c):
+                elif b <= a and b <= c:
                     distances[t1][t2] = b + 1
                 else:
                     distances[t1][t2] = c + 1
@@ -111,29 +162,33 @@ def list_to_hash(content):
-    return {tuple(elem[0]):elem[1] for elem in content}
-
-def load_ranking_caches(basename, cache_dir = None):
+    return {tuple(elem[0]): elem[1] for elem in content}
+
+
+def load_ranking_caches(basename, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
-    core = '%s/%s.json' % (cache_dir, basename)
+    core = "%s/%s.json" % (cache_dir, basename)
     if os.path.exists(core):
-        with open(core, 'r') as fid:
-            #for elem in
+        with open(core, "r") as fid:
+            # for elem in
             return list_to_hash(json.load(fid))
     return {}
 
+
 def hash_to_list(content):
-    return [[a,content[a]] for a in content]
+    return [[a, content[a]] for a in content]
+
 
-def save_ranking_caches(cache, basename, cache_dir = None):
+def save_ranking_caches(cache, basename, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
-    os.makedirs(cache_dir, exist_ok=True)    
-    core = '%s/%s.json' % (cache_dir, basename)
-    with open(core, 'w') as fid:
+    os.makedirs(cache_dir, exist_ok=True)
+    core = "%s/%s.json" % (cache_dir, basename)
+    with open(core, "w") as fid:
         json.dump(hash_to_list(cache), fid)
-    
+
+
 def get_sjr_in_cache(rankings, str_year):
     year = int(str_year)
     if rankings == []:
@@ -145,58 +200,83 @@
         current = elem
     return current
 
+
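+# Scimago's journal search can return several candidates; the closest title by
+# Levenshtein distance is kept, and its per-year quartiles are read off the
+# journal page (Q1..Q4, or a Qx-Qy range when categories disagree).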
"https://www.scimagojr.com/%s" % revue['href'] + reference = "https://www.scimagojr.com/%s" % revue["href"] if reference is None: return [] - response = requests.get(reference) - data = response.content - sjr_soup = BeautifulSoup(data, 'html.parser') - table = sjr_soup.find_all('table') + data = getwithpb(reference) + sjr_soup = BeautifulSoup(data, "html.parser") + table = sjr_soup.find_all("table") if len(table) == 0: return [] df = pd.read_html(str(table))[0] - df['Rank'] = [int(val[1]) for val in df.Quartile] + if "Quartile" in df: + df["Rank"] = [int(val[1]) for val in df.Quartile] + else: + return [] - mins = df.groupby('Year').min().Rank - maxs = df.groupby('Year').max().Rank.to_dict() + mins = df.groupby("Year").min().Rank + maxs = df.groupby("Year").max().Rank.to_dict() result = [] for (y, v) in mins.items(): if v == maxs[y]: - ranking = 'Q%s' % v + ranking = "Q%s" % v else: - ranking = 'Q%s-Q%s' % (v, maxs[y]) + ranking = "Q%s-Q%s" % (v, maxs[y]) result.append((y, best_name, ranking)) return result + def main(): - sjr_ranking_caches = load_ranking_caches('sjr') - core_ranking_caches = load_ranking_caches('core') - - parser = argparse.ArgumentParser(description='Get ranking from DBLP and show a small summary') - parser.add_argument('url', help='DBLP url') - parser.add_argument('--start', type=int, default = -1, help='starting year') - parser.add_argument('--end', type=int, default = 10000, help='ending year') - parser.add_argument('-o', metavar=('output.csv'), default = None, help='output csv file') - parser.add_argument('-d', action='store_true', help='display conference and journal list') + sjr_ranking_caches = load_ranking_caches("sjr") + core_ranking_caches = load_ranking_caches("core") + + parser = argparse.ArgumentParser( + description="Get ranking from DBLP and show a small summary" + ) + parser.add_argument("url", help="DBLP url") + parser.add_argument("--start", type=int, default=-1, help="starting year") + parser.add_argument("--end", type=int, default=10000, help="ending year") + parser.add_argument( + "-o", metavar=("output.csv"), default=None, help="output csv file" + ) + parser.add_argument( + "-d", action="store_true", help="display conference and journal list" + ) + parser.add_argument( + "--debug", + help="Print lots of debugging statements", + action="store_const", + dest="loglevel", + const=logging.DEBUG, + default=logging.WARNING, + ) + parser.add_argument( + "-v", + "--verbose", + help="Be verbose", + action="store_const", + dest="loglevel", + const=logging.INFO, + ) args = parser.parse_args() url = args.url @@ -204,69 +284,99 @@ def main(): csv_output = args.o start_year = args.start display_list = args.d - + logging.basicConfig(level=args.loglevel, format="%(levelname)s %(message)s") + username, elements = get_dblp(url) print(username) - + result = [] - for venue, name, second_name, year in elements: - if venue == 'conf': - if (name, second_name, year) in core_ranking_caches: - rank = core_ranking_caches[(name, second_name, year)] - else: - rank = get_core_rank(name, year) - if rank is None: - rank = get_core_rank(second_name, year) - core_ranking_caches[(name, second_name, year)] = rank - if rank is None: - result.append(['C', name, second_name, int(year), None, None, None]) - else: - result.append(['C', name, second_name, int(year), rank[1], rank[2], rank[0]]) + with logging_redirect_tqdm(): + for venue, name, second_name, year in tqdm(elements): + if start_year <= int(year) <= end_year: + if venue == "conf": + if (name, second_name, year) in 
+    with logging_redirect_tqdm():
+        for venue, name, second_name, year in tqdm(elements):
+            if start_year <= int(year) <= end_year:
+                if venue == "conf":
+                    if (name, second_name, year) in core_ranking_caches:
+                        rank = core_ranking_caches[(name, second_name, year)]
+                    else:
+                        rank = get_core_rank(name, year)
+                        if rank is None:
+                            rank = get_core_rank(second_name, year)
+                        core_ranking_caches[(name, second_name, year)] = rank
+                    if rank is None:
+                        result.append(
+                            ["C", name, second_name, int(year), None, None, None]
+                        )
+                    else:
+                        result.append(
+                            [
+                                "C",
+                                name,
+                                second_name,
+                                int(year),
+                                rank[1],
+                                rank[2],
+                                rank[0],
+                            ]
+                        )
-        else:
-            if (name, second_name) in sjr_ranking_caches:
-                rankings = sjr_ranking_caches[(name, second_name)]
-            else:
-                rankings = get_sjr_rank(name)
-                sjr_ranking_caches[(name, second_name)] = rankings
-            rank = get_sjr_in_cache(rankings, year)
-            if rank is None:
-                result.append(['J', name, second_name, int(year), None, None, None])
-            else:
-                result.append(['J', name, second_name, int(year), rank[1], None, rank[2]])
-    save_ranking_caches(sjr_ranking_caches, 'sjr')
-    save_ranking_caches(core_ranking_caches, 'core')
-
-    df = pd.DataFrame(result, columns=['type', 'name', 'short', 'year', 'longname', 'acronym', 'rank'])
-
-    if start_year != -1 :
-        print('Starting year', start_year)
+                elif venue == "journals":
+                    if (name, second_name) in sjr_ranking_caches:
+                        rankings = sjr_ranking_caches[(name, second_name)]
+                    else:
+                        rankings = get_sjr_rank(name)
+                        sjr_ranking_caches[(name, second_name)] = rankings
+                    rank = get_sjr_in_cache(rankings, year)
+                    if rank is None:
+                        result.append(
+                            ["J", name, second_name, int(year), None, None, None]
+                        )
+                    else:
+                        result.append(
+                            ["J", name, second_name, int(year), rank[1], None, rank[2]]
+                        )
+                else:
+                    tqdm.write(f"venue: {venue} ?")
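+    # Persist the updated caches so the next run only hits the network for
+    # entries it has not seen before.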
+    save_ranking_caches(sjr_ranking_caches, "sjr")
+    save_ranking_caches(core_ranking_caches, "core")
+
+    df = pd.DataFrame(
+        result, columns=["type", "name", "short", "year", "longname", "acronym", "rank"]
+    )
+
+    df = df.fillna(value="")
+
+    if start_year != -1:
+        print("Starting year", start_year)
     else:
-        print('Starting year', min(df['year']))
-    
+        print("Starting year", min(df["year"]))
+
     if end_year != 10000:
-        print('Ending year', end_year)
+        print("Ending year", end_year)
     else:
-        print('Ending year', max(df['year']))
-
-    selection = df[(df['year'] >= start_year) & (df['year'] <= end_year)]
-
-    print('Not found',
-          len(selection) - selection['rank'].count(),
-          'out of a total of',
-          len(selection))
-
-
-    evaluation = selection.groupby('rank').count()
-    print(evaluation.drop(['name', 'short', 'year', 'longname', 'acronym'], axis=1).rename(columns={'type':'number'}))
-    
+        print("Ending year", max(df["year"]))
+
+    selection = df[(df["year"] >= start_year) & (df["year"] <= end_year)]
+
+    print(
+        "Not found",
+        len(selection) - selection["rank"].count(),
+        "out of a total of",
+        len(selection),
+    )
+
+    evaluation = selection.groupby("rank").count()
+    print(
+        evaluation.drop(
+            ["name", "short", "year", "longname", "acronym"], axis=1
+        ).rename(columns={"type": "number"})
+    )
+
     if not csv_output is None:
         selection.to_csv(csv_output, index=False)
-    
+
     if display_list:
-        pd.set_option('display.max_rows', len(selection))
+        pd.set_option("display.max_rows", len(selection))
         print(selection)
-
-if __name__ == '__main__':
+
+
+if __name__ == "__main__":
     main()
-
diff --git a/setup.py b/setup.py
index f0a6fcaee35e7dc36d7bf1a769121d1f1ae3357c..01ea904025ef3358d65d6b77d92a7c3f60ccabac 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ setuptools.setup(
         "Operating System :: OS Independent",
     ],
     python_requires='>=3.6',
-    install_requires=['requests', 'BeautifulSoup4', 'datetime', 'parsedate', 'pandas', 'numpy', 'argparse', 'lxml'],
+    install_requires=['requests', 'BeautifulSoup4', 'datetime', 'parsedate', 'pandas', 'numpy', 'argparse', 'lxml', 'tqdm'],
     entry_points={
         'console_scripts': [
             'get_rankings = get_rankings.get_rankings:main',