def getwithpb(url):
    """Fetch *url* and return the response body as bytes, showing a tqdm
    progress bar while the download streams in.

    The request is streamed (``stream=True``) so chunks arrive
    incrementally; they are accumulated and joined into one bytes object.
    """
    # Lazy %-args: the message is only formatted if INFO is enabled.
    LOG.info("fetching %s", url)
    r = requests.get(url, stream=True)
    total = int(r.headers.get("content-length", 0))
    parts = []
    # total is in BYTES, so the bar must also advance in bytes: tick by
    # len(chunk) instead of once per 32 KiB chunk (the original mixed a
    # byte total with per-chunk ticks, leaving the bar almost empty).
    with tqdm(total=total, unit="B", unit_scale=True, leave=False) as bar:
        for chunk in r.iter_content(32 * 1024):
            if chunk:
                parts.append(chunk)
                bar.update(len(chunk))
    # join once: linear instead of quadratic += accumulation
    return b"".join(parts)


def fgetwithpb(url, filename):
    """Fetch *url*, stream it into *filename*, and also return the body
    as bytes.

    Mirrors :func:`getwithpb` but additionally keeps an on-disk copy at
    *filename* (used to populate the local cache).
    """
    LOG.info("fetching %s", url)
    r = requests.get(url, stream=True)
    total = int(r.headers.get("content-length", 0))
    parts = []
    # Same unit fix as getwithpb: byte total, byte-sized updates.
    with open(filename, "wb") as file, tqdm(
        total=total, unit="B", unit_scale=True, leave=False
    ) as bar:
        for chunk in r.iter_content(32 * 1024):
            if chunk:
                file.write(chunk)
                parts.append(chunk)
                bar.update(len(chunk))
    return b"".join(parts)


def comp_lower(a, b):
    """Case-insensitive string equality; False unless both args are str."""
    return isinstance(a, str) and isinstance(b, str) and a.lower() == b.lower()
get_sjr_in_cache(rankings, str_year): def get_sjr_rank(name): url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+") - response = requests.get(url) - data = response.content + data = getwithpb(url) sjr_soup = BeautifulSoup(data, "html.parser") revues = sjr_soup.find("div", class_="search_results") @@ -184,8 +220,7 @@ def get_sjr_rank(name): if reference is None: return [] - response = requests.get(reference) - data = response.content + data = getwithpb(reference) sjr_soup = BeautifulSoup(data, "html.parser") table = sjr_soup.find_all("table") if len(table) == 0: @@ -226,6 +261,22 @@ def main(): parser.add_argument( "-d", action="store_true", help="display conference and journal list" ) + parser.add_argument( + "--debug", + help="Print lots of debugging statements", + action="store_const", + dest="loglevel", + const=logging.DEBUG, + default=logging.WARNING, + ) + parser.add_argument( + "-v", + "--verbose", + help="Be verbose", + action="store_const", + dest="loglevel", + const=logging.INFO, + ) args = parser.parse_args() url = args.url @@ -233,40 +284,57 @@ def main(): csv_output = args.o start_year = args.start display_list = args.d + logging.basicConfig(level=args.loglevel, format="%(levelname)s %(message)s") username, elements = get_dblp(url) print(username) result = [] - for venue, name, second_name, year in tqdm(elements): - if venue == "conf": - if (name, second_name, year) in core_ranking_caches: - rank = core_ranking_caches[(name, second_name, year)] - else: - rank = get_core_rank(name, year) - if rank is None: - rank = get_core_rank(second_name, year) - core_ranking_caches[(name, second_name, year)] = rank - if rank is None: - result.append(["C", name, second_name, int(year), None, None, None]) - else: - result.append( - ["C", name, second_name, int(year), rank[1], rank[2], rank[0]] - ) - - else: - if (name, second_name) in sjr_ranking_caches: - rankings = sjr_ranking_caches[(name, second_name)] - else: - rankings = 
get_sjr_rank(name) - sjr_ranking_caches[(name, second_name)] = rankings - rank = get_sjr_in_cache(rankings, year) - if rank is None: - result.append(["J", name, second_name, int(year), None, None, None]) - else: - result.append( - ["J", name, second_name, int(year), rank[1], None, rank[2]] - ) + with logging_redirect_tqdm(): + for venue, name, second_name, year in tqdm(elements): + if start_year <= int(year) <= end_year: + if venue == "conf": + if (name, second_name, year) in core_ranking_caches: + rank = core_ranking_caches[(name, second_name, year)] + else: + rank = get_core_rank(name, year) + if rank is None: + rank = get_core_rank(second_name, year) + core_ranking_caches[(name, second_name, year)] = rank + if rank is None: + result.append( + ["C", name, second_name, int(year), None, None, None] + ) + else: + result.append( + [ + "C", + name, + second_name, + int(year), + rank[1], + rank[2], + rank[0], + ] + ) + + elif venue == "journals": + if (name, second_name) in sjr_ranking_caches: + rankings = sjr_ranking_caches[(name, second_name)] + else: + rankings = get_sjr_rank(name) + sjr_ranking_caches[(name, second_name)] = rankings + rank = get_sjr_in_cache(rankings, year) + if rank is None: + result.append( + ["J", name, second_name, int(year), None, None, None] + ) + else: + result.append( + ["J", name, second_name, int(year), rank[1], None, rank[2]] + ) + else: + tqdm.write(f"venue: {venue} ?") save_ranking_caches(sjr_ranking_caches, "sjr") save_ranking_caches(core_ranking_caches, "core")