From 79045f461da828585b3f9e2253aca8b2d4eaeefd Mon Sep 17 00:00:00 2001 From: Georges Da Costa <dacosta@irit.fr> Date: Sun, 25 Sep 2022 19:13:56 +0200 Subject: [PATCH] Refactors JSR part --- get_rankings/get_rankings.py | 248 +++++++++++------------------------ get_rankings/hash_cache.py | 30 +++++ get_rankings/tools.py | 69 ++++++++++ 3 files changed, 175 insertions(+), 172 deletions(-) create mode 100644 get_rankings/hash_cache.py create mode 100644 get_rankings/tools.py diff --git a/get_rankings/get_rankings.py b/get_rankings/get_rankings.py index 3bcf288..93fe945 100755 --- a/get_rankings/get_rankings.py +++ b/get_rankings/get_rankings.py @@ -4,63 +4,20 @@ import logging from tqdm import tqdm from tqdm.contrib.logging import logging_redirect_tqdm import os -import requests import datetime from dateutil.parser import parse as parsedate from bs4 import BeautifulSoup import pandas as pd -import numpy -import json import argparse +from hash_cache import load_hash_caches, save_hash_caches, default_cache +from tools import levenshtein, download, get_in_ordered_list -LOG = logging.getLogger(__name__) - - -def getwithpb(url): - LOG.info(f"fetching {url}") - r = requests.get(url, stream=True) - data = b"" - total_size = int(r.headers.get("content-length", 0)) - for chunk in tqdm( - r.iter_content(32 * 1024), - total=total_size, - unit="B", - unit_scale=True, - leave=False, - ): - if chunk: - data += chunk - return data - - -def fgetwithpb(url, filename): - LOG.info(f"fetching {url}") - r = requests.get(url, stream=True) - data = b"" - total_size = int(r.headers.get("content-length", 0)) - with open(filename, "wb") as file: - for chunk in tqdm( - r.iter_content(32 * 1024), - total=total_size, - unit="B", - unit_scale=True, - leave=False, - ): - if chunk: - file.write(chunk) - data += chunk - return data def comp_lower(a, b): return isinstance(a, str) and isinstance(b, str) and a.lower() == b.lower() - -def default_cache(): - return os.environ["HOME"] + "/.local/state/pyrank" - - def get_dblp(url, cache=True, cache_dir=None): if cache_dir is None: cache_dir = default_cache() @@ -68,7 +25,7 @@ def get_dblp(url, cache=True, cache_dir=None): filename = "%s/%s" % (cache_dir, target.replace("/", "_")) os.makedirs(cache_dir, exist_ok=True) if not os.path.exists(filename) or not cache: - data = fgetwithpb(url, filename) + data = download(url, filename) else: with open(filename, "rb") as file: data = file.read() @@ -115,7 +72,7 @@ def get_core_rank(name, year): source, ) - data = getwithpb(url) + data = download(url) cc_soup = BeautifulSoup(data, "html.parser") table = cc_soup.find_all("table") if len(table) == 0: @@ -129,125 +86,82 @@ def get_core_rank(name, year): return None -def levenshteinDistanceDP(token1, token2): - distances = numpy.zeros((len(token1) + 1, len(token2) + 1)) - - for t1 in range(len(token1) + 1): - distances[t1][0] = t1 - - for t2 in range(len(token2) + 1): - distances[0][t2] = t2 - a = 0 - b = 0 - c = 0 - for t1 in range(1, len(token1) + 1): - for t2 in range(1, len(token2) + 1): - if token1[t1 - 1] == token2[t2 - 1]: - distances[t1][t2] = distances[t1 - 1][t2 - 1] - else: - a = distances[t1][t2 - 1] - b = distances[t1 - 1][t2] - c = distances[t1 - 1][t2 - 1] - - if a <= b and a <= c: - distances[t1][t2] = a + 1 - elif b <= a and b <= c: - distances[t1][t2] = b + 1 - else: - distances[t1][t2] = c + 1 - - return distances[len(token1)][len(token2)] -def list_to_hash(content): - return {tuple(elem[0]): elem[1] for elem in content} -def load_ranking_caches(basename, cache_dir=None): - 
if cache_dir is None: - cache_dir = default_cache() - core = "%s/%s.json" % (cache_dir, basename) - if os.path.exists(core): - with open(core, "r") as fid: - # for elem in - return list_to_hash(json.load(fid)) - return {} +class Sjr: + def __init__(self): + self.ranking_caches = load_hash_caches("sjr") -def hash_to_list(content): - return [[a, content[a]] for a in content] - - -def save_ranking_caches(cache, basename, cache_dir=None): - if cache_dir is None: - cache_dir = default_cache() - os.makedirs(cache_dir, exist_ok=True) - core = "%s/%s.json" % (cache_dir, basename) - with open(core, "w") as fid: - json.dump(hash_to_list(cache), fid) - - -def get_sjr_in_cache(rankings, str_year): - year = int(str_year) - if rankings == []: - return None - current = rankings[0] - for elem in rankings[1:]: - if year < elem[0]: - return current - current = elem - return current - - -def get_sjr_rank(name): - url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+") - data = getwithpb(url) - sjr_soup = BeautifulSoup(data, "html.parser") - - revues = sjr_soup.find("div", class_="search_results") - dist = -1 - reference = None - best_name = None - for revue in revues.find_all("a"): - tmp = revue.find("span").text - lev = levenshteinDistanceDP(tmp, name) - if dist == -1 or lev < dist: - dist = lev - best_name = tmp - reference = "https://www.scimagojr.com/%s" % revue["href"] - if reference is None: - return [] - - data = getwithpb(reference) - sjr_soup = BeautifulSoup(data, "html.parser") - table = sjr_soup.find_all("table") - if len(table) == 0: - return [] - - df = pd.read_html(str(table))[0] - if "Quartile" in df: - df["Rank"] = [int(val[1]) for val in df.Quartile] - else: - return [] + def close(self): + save_hash_caches(self.ranking_caches, "sjr") - mins = df.groupby("Year").min().Rank - maxs = df.groupby("Year").max().Rank.to_dict() - result = [] - for (y, v) in mins.items(): - if v == maxs[y]: - ranking = "Q%s" % v + + def get(self, name, second_name, year): + if (name, second_name) in self.ranking_caches: + rankings = self.ranking_caches[(name, second_name)] else: - ranking = "Q%s-Q%s" % (v, maxs[y]) - result.append((y, best_name, ranking)) - - return result + rankings = self.get_sjr_rank(name) + self.ranking_caches[(name, second_name)] = rankings + rank = get_in_ordered_list(rankings, int(year)) + if rank is None: + return ["J", name, second_name, int(year), None, None, None] + else: + return ["J", name, second_name, int(year), rank[1], None, rank[2]] + + def get_sjr_rank(self, name): + url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+") + data = download(url) + sjr_soup = BeautifulSoup(data, "html.parser") + + revues = sjr_soup.find("div", class_="search_results") + dist = -1 + reference = None + best_name = None + for revue in revues.find_all("a"): + tmp = revue.find("span").text + lev = levenshtein(tmp, name) + if dist == -1 or lev < dist: + dist = lev + best_name = tmp + reference = "https://www.scimagojr.com/%s" % revue["href"] + if reference is None: + return [] + + data = download(reference) + sjr_soup = BeautifulSoup(data, "html.parser") + table = sjr_soup.find_all("table") + if len(table) == 0: + return [] + + df = pd.read_html(str(table))[0] + if "Quartile" in df: + df["Rank"] = [int(val[1]) for val in df.Quartile] + else: + return [] + + mins = df.groupby("Year").min().Rank + maxs = df.groupby("Year").max().Rank.to_dict() + result = [] + for (y, v) in mins.items(): + if v == maxs[y]: + ranking = "Q%s" % v + else: + ranking = "Q%s-Q%s" 
% (v, maxs[y])
+            result.append((y, best_name, ranking))
+        return result
+
+
 def main():
-    sjr_ranking_caches = load_ranking_caches("sjr")
-    core_ranking_caches = load_ranking_caches("core")
+    sjr = Sjr()
+    #sjr_ranking_caches = load_hash_caches("sjr")
+    core_ranking_caches = load_hash_caches("core")
 
     parser = argparse.ArgumentParser(
         description="Get ranking from DBLP and show a small summary"
@@ -287,12 +201,14 @@ def main():
     logging.basicConfig(level=args.loglevel, format="%(levelname)s %(message)s")
 
     username, elements = get_dblp(url)
-    print(username)
 
+    # Keeps only elements in the requested range
+    elements = [elem for elem in elements if start_year <= int(elem[-1]) <= end_year]
+
+    print(username)
     result = []
     with logging_redirect_tqdm():
         for venue, name, second_name, year in tqdm(elements):
-            if start_year <= int(year) <= end_year:
                 if venue == "conf":
                     if (name, second_name, year) in core_ranking_caches:
                         rank = core_ranking_caches[(name, second_name, year)]
@@ -319,25 +235,13 @@ def main():
                        )
                elif venue == "journals":
-                    if (name, second_name) in sjr_ranking_caches:
-                        rankings = sjr_ranking_caches[(name, second_name)]
-                    else:
-                        rankings = get_sjr_rank(name)
-                        sjr_ranking_caches[(name, second_name)] = rankings
-                    rank = get_sjr_in_cache(rankings, year)
-                    if rank is None:
-                        result.append(
-                            ["J", name, second_name, int(year), None, None, None]
-                        )
-                    else:
-                        result.append(
-                            ["J", name, second_name, int(year), rank[1], None, rank[2]]
-                        )
+                    result.append(sjr.get(name, second_name, year))
                else:
                    tqdm.write(f"venue: {venue} ?")
 
-    save_ranking_caches(sjr_ranking_caches, "sjr")
-    save_ranking_caches(core_ranking_caches, "core")
-
+    #save_hash_caches(sjr_ranking_caches, "sjr")
+    save_hash_caches(core_ranking_caches, "core")
+    sjr.close()
+
     df = pd.DataFrame(
         result, columns=["type", "name", "short", "year", "longname", "acronym", "rank"]
     )
diff --git a/get_rankings/hash_cache.py b/get_rankings/hash_cache.py
new file mode 100644
index 0000000..78bbf58
--- /dev/null
+++ b/get_rankings/hash_cache.py
@@ -0,0 +1,30 @@
+import json
+import os
+
+def default_cache():
+    return os.environ["HOME"] + "/.local/state/pyrank"
+
+def list_to_hash(content):
+    return {tuple(elem[0]): elem[1] for elem in content}
+
+def hash_to_list(content):
+    return [[a, content[a]] for a in content]
+
+
+def load_hash_caches(basename, cache_dir=None):
+    if cache_dir is None:
+        cache_dir = default_cache()
+    core = "%s/%s.json" % (cache_dir, basename)
+    if os.path.exists(core):
+        with open(core, "r") as fid:
+            # for elem in
+            return list_to_hash(json.load(fid))
+    return {}
+
+def save_hash_caches(cache, basename, cache_dir=None):
+    if cache_dir is None:
+        cache_dir = default_cache()
+    os.makedirs(cache_dir, exist_ok=True)
+    core = "%s/%s.json" % (cache_dir, basename)
+    with open(core, "w") as fid:
+        json.dump(hash_to_list(cache), fid)
diff --git a/get_rankings/tools.py b/get_rankings/tools.py
new file mode 100644
index 0000000..607fea1
--- /dev/null
+++ b/get_rankings/tools.py
@@ -0,0 +1,69 @@
+import numpy
+import requests
+
+import logging
+from tqdm import tqdm
+
+LOG = logging.getLogger(__name__)
+
+def levenshtein(token1, token2):
+    distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
+
+    for t1 in range(len(token1) + 1):
+        distances[t1][0] = t1
+
+    for t2 in range(len(token2) + 1):
+        distances[0][t2] = t2
+
+    a = 0
+    b = 0
+    c = 0
+
+    for t1 in range(1, len(token1) + 1):
+        for t2 in range(1, len(token2) + 1):
+            if token1[t1 - 1] == token2[t2 - 1]:
+                distances[t1][t2] = distances[t1 - 1][t2 - 1]
+            else:
+                a = distances[t1][t2 - 1]
+                b = distances[t1 - 1][t2]
+                c = distances[t1 - 1][t2 - 1]
+
+                if a <= b and a <= c:
+                    distances[t1][t2] = a + 1
+                elif b <= a and b <= c:
+                    distances[t1][t2] = b + 1
+                else:
+                    distances[t1][t2] = c + 1
+
+    return distances[len(token1)][len(token2)]
+
+
+def download(url, filename=None):
+    LOG.info(f"fetching {url}")
+    r = requests.get(url, stream=True)
+    data = b""
+    total_size = int(r.headers.get("content-length", 0))
+    for chunk in tqdm(
+        r.iter_content(32 * 1024),
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        leave=False,
+    ):
+        if chunk:
+            data += chunk
+    if not filename is None:
+        with open(filename, "wb") as file:
+            file.write(data)
+
+    return data
+
+def get_in_ordered_list(ordered_list, year):
+    if ordered_list == []:
+        return None
+    current = ordered_list[0]
+    for elem in ordered_list[1:]:
+        if year < elem[0]:
+            return current
+        current = elem
+    return current
--
GitLab
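A minimal usage sketch (not part of the commit) of the helpers this patch moves into tools.py and hash_cache.py; the journal tuples and the /tmp cache directory below are made-up illustration values:

```python
# Run from the get_rankings/ directory so the new modules import flatly,
# mirroring the flat imports used by get_rankings.py after this patch.
from tools import levenshtein, get_in_ordered_list
from hash_cache import load_hash_caches, save_hash_caches

# Edit distance used to pick the closest SJR search hit for a journal title.
print(levenshtein("Future Generation Computer Systems",
                  "Future Generation Comp. Syst."))  # prints a numpy float

# Rankings are (year, name, quartile) tuples in ascending year order; the lookup
# returns the entry considered current for the requested year.
rankings = [(2015, "Some Journal", "Q2"), (2019, "Some Journal", "Q1")]
print(get_in_ordered_list(rankings, 2017))  # -> (2015, 'Some Journal', 'Q2')

# Caches round-trip through JSON lists and come back as dicts keyed by tuples.
cache = load_hash_caches("demo", cache_dir="/tmp/pyrank-demo")   # {} on first run
cache[("some journal", "sj")] = rankings
save_hash_caches(cache, "demo", cache_dir="/tmp/pyrank-demo")    # writes demo.json
```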
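Similarly, a rough sketch of the new Sjr wrapper that main() now calls for journal entries. It assumes get_rankings.py is importable without side effects and that scimagojr.com is reachable; the journal name and short name are only example values:

```python
from get_rankings import Sjr  # the class added by this patch

sjr = Sjr()                    # loads ~/.local/state/pyrank/sjr.json if it exists
row = sjr.get("Future Generation Computer Systems", "fgcs", 2020)
# row has the shape ["J", name, short, year, matched SJR title or None, None, quartile or None];
# the last field is a quartile string such as "Q1" or "Q1-Q2" when the journal is found.
print(row)
sjr.close()                    # saves the (possibly updated) cache for the next run
```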