Commit 79045f46 authored by Georges Da Costa's avatar Georges Da Costa
Refactors SJR part

parent cebf5d40
@@ -4,63 +4,20 @@ import logging
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
import os
import requests
import datetime
from dateutil.parser import parse as parsedate
from bs4 import BeautifulSoup
import pandas as pd
import numpy
import json
import argparse
from hash_cache import load_hash_caches, save_hash_caches, default_cache
from tools import levenshtein, download, get_in_ordered_list
LOG = logging.getLogger(__name__)
def getwithpb(url):
LOG.info(f"fetching {url}")
r = requests.get(url, stream=True)
data = b""
total_size = int(r.headers.get("content-length", 0))
for chunk in tqdm(
r.iter_content(32 * 1024),
total=total_size,
unit="B",
unit_scale=True,
leave=False,
):
if chunk:
data += chunk
return data
def fgetwithpb(url, filename):
LOG.info(f"fetching {url}")
r = requests.get(url, stream=True)
data = b""
total_size = int(r.headers.get("content-length", 0))
with open(filename, "wb") as file:
for chunk in tqdm(
r.iter_content(32 * 1024),
total=total_size,
unit="B",
unit_scale=True,
leave=False,
):
if chunk:
file.write(chunk)
data += chunk
return data
def comp_lower(a, b):
return isinstance(a, str) and isinstance(b, str) and a.lower() == b.lower()
def default_cache():
return os.environ["HOME"] + "/.local/state/pyrank"
def get_dblp(url, cache=True, cache_dir=None):
if cache_dir is None:
cache_dir = default_cache()
@@ -68,7 +25,7 @@ def get_dblp(url, cache=True, cache_dir=None):
filename = "%s/%s" % (cache_dir, target.replace("/", "_"))
os.makedirs(cache_dir, exist_ok=True)
if not os.path.exists(filename) or not cache:
data = fgetwithpb(url, filename)
data = download(url, filename)
else:
with open(filename, "rb") as file:
data = file.read()
@@ -115,7 +72,7 @@ def get_core_rank(name, year):
source,
)
data = getwithpb(url)
data = download(url)
cc_soup = BeautifulSoup(data, "html.parser")
table = cc_soup.find_all("table")
if len(table) == 0:
@@ -129,125 +86,82 @@ def get_core_rank(name, year):
return None
def levenshteinDistanceDP(token1, token2):
distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
for t1 in range(len(token1) + 1):
distances[t1][0] = t1
for t2 in range(len(token2) + 1):
distances[0][t2] = t2
a = 0
b = 0
c = 0
for t1 in range(1, len(token1) + 1):
for t2 in range(1, len(token2) + 1):
if token1[t1 - 1] == token2[t2 - 1]:
distances[t1][t2] = distances[t1 - 1][t2 - 1]
else:
a = distances[t1][t2 - 1]
b = distances[t1 - 1][t2]
c = distances[t1 - 1][t2 - 1]
if a <= b and a <= c:
distances[t1][t2] = a + 1
elif b <= a and b <= c:
distances[t1][t2] = b + 1
else:
distances[t1][t2] = c + 1
return distances[len(token1)][len(token2)]
def list_to_hash(content):
return {tuple(elem[0]): elem[1] for elem in content}
def load_ranking_caches(basename, cache_dir=None):
if cache_dir is None:
cache_dir = default_cache()
core = "%s/%s.json" % (cache_dir, basename)
if os.path.exists(core):
with open(core, "r") as fid:
# for elem in
return list_to_hash(json.load(fid))
return {}
class Sjr:
def __init__(self):
self.ranking_caches = load_hash_caches("sjr")
def hash_to_list(content):
return [[a, content[a]] for a in content]
def save_ranking_caches(cache, basename, cache_dir=None):
if cache_dir is None:
cache_dir = default_cache()
os.makedirs(cache_dir, exist_ok=True)
core = "%s/%s.json" % (cache_dir, basename)
with open(core, "w") as fid:
json.dump(hash_to_list(cache), fid)
def get_sjr_in_cache(rankings, str_year):
year = int(str_year)
if rankings == []:
return None
current = rankings[0]
for elem in rankings[1:]:
if year < elem[0]:
return current
current = elem
return current
def get_sjr_rank(name):
url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
data = getwithpb(url)
sjr_soup = BeautifulSoup(data, "html.parser")
revues = sjr_soup.find("div", class_="search_results")
dist = -1
reference = None
best_name = None
for revue in revues.find_all("a"):
tmp = revue.find("span").text
lev = levenshteinDistanceDP(tmp, name)
if dist == -1 or lev < dist:
dist = lev
best_name = tmp
reference = "https://www.scimagojr.com/%s" % revue["href"]
if reference is None:
return []
data = getwithpb(reference)
sjr_soup = BeautifulSoup(data, "html.parser")
table = sjr_soup.find_all("table")
if len(table) == 0:
return []
df = pd.read_html(str(table))[0]
if "Quartile" in df:
df["Rank"] = [int(val[1]) for val in df.Quartile]
else:
return []
def close(self):
save_hash_caches(self.ranking_caches, "sjr")
mins = df.groupby("Year").min().Rank
maxs = df.groupby("Year").max().Rank.to_dict()
result = []
for (y, v) in mins.items():
if v == maxs[y]:
ranking = "Q%s" % v
def get(self, name, second_name, year):
if (name, second_name) in self.ranking_caches:
rankings = self.ranking_caches[(name, second_name)]
else:
ranking = "Q%s-Q%s" % (v, maxs[y])
result.append((y, best_name, ranking))
return result
rankings = self.get_sjr_rank(name)
self.ranking_caches[(name, second_name)] = rankings
rank = get_in_ordered_list(rankings, int(year))
if rank is None:
return ["J", name, second_name, int(year), None, None, None]
else:
return ["J", name, second_name, int(year), rank[1], None, rank[2]]
def get_sjr_rank(self, name):
url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
data = download(url)
sjr_soup = BeautifulSoup(data, "html.parser")
revues = sjr_soup.find("div", class_="search_results")
dist = -1
reference = None
best_name = None
for revue in revues.find_all("a"):
tmp = revue.find("span").text
lev = levenshtein(tmp, name)
if dist == -1 or lev < dist:
dist = lev
best_name = tmp
reference = "https://www.scimagojr.com/%s" % revue["href"]
if reference is None:
return []
data = download(reference)
sjr_soup = BeautifulSoup(data, "html.parser")
table = sjr_soup.find_all("table")
if len(table) == 0:
return []
df = pd.read_html(str(table))[0]
if "Quartile" in df:
df["Rank"] = [int(val[1]) for val in df.Quartile]
else:
return []
mins = df.groupby("Year").min().Rank
maxs = df.groupby("Year").max().Rank.to_dict()
result = []
for (y, v) in mins.items():
if v == maxs[y]:
ranking = "Q%s" % v
else:
ranking = "Q%s-Q%s" % (v, maxs[y])
result.append((y, best_name, ranking))
return result
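# Usage sketch for the Sjr class above (editorial illustration; the journal name
# and year are examples, not values from the commit). get() queries scimagojr.com
# on a cache miss and memoises the result per (name, second_name) pair.
def _sjr_usage_sketch():
    sjr = Sjr()
    # Returns ["J", name, second_name, year, matched_journal, None, quartile],
    # with None fields when no SJR entry covers that journal or year.
    row = sjr.get("Future Generation Computer Systems", "FGCS", 2020)
    print(row)
    # Persist the ranking cache to ~/.local/state/pyrank/sjr.json for later runs.
    sjr.close()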
def main():
sjr_ranking_caches = load_ranking_caches("sjr")
core_ranking_caches = load_ranking_caches("core")
sjr = Sjr()
#sjr_ranking_caches = load_hash_caches("sjr")
core_ranking_caches = load_hash_caches("core")
parser = argparse.ArgumentParser(
description="Get ranking from DBLP and show a small summary"
@@ -287,12 +201,14 @@ def main():
logging.basicConfig(level=args.loglevel, format="%(levelname)s %(message)s")
username, elements = get_dblp(url)
print(username)
# Keeps only elements in the requested range
elements = [elem for elem in elements if start_year <= int(elem[-1]) <= end_year]
print(username)
result = []
with logging_redirect_tqdm():
for venue, name, second_name, year in tqdm(elements):
if start_year <= int(year) <= end_year:
if venue == "conf":
if (name, second_name, year) in core_ranking_caches:
rank = core_ranking_caches[(name, second_name, year)]
@@ -319,25 +235,13 @@
)
elif venue == "journals":
if (name, second_name) in sjr_ranking_caches:
rankings = sjr_ranking_caches[(name, second_name)]
else:
rankings = get_sjr_rank(name)
sjr_ranking_caches[(name, second_name)] = rankings
rank = get_sjr_in_cache(rankings, year)
if rank is None:
result.append(
["J", name, second_name, int(year), None, None, None]
)
else:
result.append(
["J", name, second_name, int(year), rank[1], None, rank[2]]
)
result.append(sjr.get(name, second_name, year))
else:
tqdm.write(f"venue: {venue} ?")
save_ranking_caches(sjr_ranking_caches, "sjr")
save_ranking_caches(core_ranking_caches, "core")
#save_hash_caches(sjr_ranking_caches, "sjr")
save_hash_caches(core_ranking_caches, "core")
sjr.close()
df = pd.DataFrame(
result, columns=["type", "name", "short", "year", "longname", "acronym", "rank"]
)
hash_cache.py (new file)
import json
import os
def default_cache():
return os.environ["HOME"] + "/.local/state/pyrank"
def list_to_hash(content):
return {tuple(elem[0]): elem[1] for elem in content}
def hash_to_list(content):
return [[a, content[a]] for a in content]
def load_hash_caches(basename, cache_dir=None):
if cache_dir is None:
cache_dir = default_cache()
core = "%s/%s.json" % (cache_dir, basename)
if os.path.exists(core):
with open(core, "r") as fid:
# for elem in
return list_to_hash(json.load(fid))
return {}
def save_hash_caches(cache, basename, cache_dir=None):
if cache_dir is None:
cache_dir = default_cache()
os.makedirs(cache_dir, exist_ok=True)
core = "%s/%s.json" % (cache_dir, basename)
with open(core, "w") as fid:
json.dump(hash_to_list(cache), fid)
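# Round-trip sketch for the helpers above (illustrative basename and entry):
# cache keys are tuples, which JSON cannot encode, hence the list form on disk.
def _hash_cache_sketch():
    cache = load_hash_caches("example")              # {} on the first run
    cache[("NIPS", "2020")] = ["A*"]                 # hypothetical cached ranking
    save_hash_caches(cache, "example")               # written as [[["NIPS", "2020"], ["A*"]]]
    assert load_hash_caches("example")[("NIPS", "2020")] == ["A*"]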
tools.py (new file)
import numpy
import requests
import logging
from tqdm import tqdm
LOG = logging.getLogger(__name__)
def levenshtein(token1, token2):
distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
for t1 in range(len(token1) + 1):
distances[t1][0] = t1
for t2 in range(len(token2) + 1):
distances[0][t2] = t2
a = 0
b = 0
c = 0
for t1 in range(1, len(token1) + 1):
for t2 in range(1, len(token2) + 1):
if token1[t1 - 1] == token2[t2 - 1]:
distances[t1][t2] = distances[t1 - 1][t2 - 1]
else:
a = distances[t1][t2 - 1]
b = distances[t1 - 1][t2]
c = distances[t1 - 1][t2 - 1]
if a <= b and a <= c:
distances[t1][t2] = a + 1
elif b <= a and b <= c:
distances[t1][t2] = b + 1
else:
distances[t1][t2] = c + 1
return distances[len(token1)][len(token2)]
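# Quick sanity check for the edit-distance helper above (hand-checked values).
def _levenshtein_sketch():
    assert levenshtein("kitten", "sitting") == 3   # two substitutions and one insertion
    assert levenshtein("ICDCS", "ICDCS") == 0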
def download(url, filename=None):
LOG.info(f"fetching {url}")
r = requests.get(url, stream=True)
data = b""
    total_size = int(r.headers.get("content-length", 0))
    # content-length is in bytes, so advance the bar by each chunk's size
    with tqdm(total=total_size, unit="B", unit_scale=True, leave=False) as bar:
        for chunk in r.iter_content(32 * 1024):
            if chunk:
                data += chunk
                bar.update(len(chunk))
    if filename is not None:
with open(filename, "wb") as file:
file.write(data)
return data
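# Usage sketch for download(): it always returns the fetched bytes and, when a
# filename is given, also mirrors them to disk (URLs below are illustrative).
def _download_sketch():
    page = download("https://www.scimagojr.com/journalsearch.php?q=computing")
    data = download("https://example.org/data.bin", filename="/tmp/data.bin")
    return len(page), len(data)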
def get_in_ordered_list(ordered_list, year):
if ordered_list == []:
return None
current = ordered_list[0]
for elem in ordered_list[1:]:
if year < elem[0]:
return current
current = elem
return current
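# get_in_ordered_list() picks the entry in force for a given year: the last one
# dated at or before it, falling back to the earliest entry. Values illustrative.
def _ordered_list_sketch():
    rankings = [(2018, "FGCS", "Q1"), (2021, "FGCS", "Q1-Q2")]
    assert get_in_ordered_list(rankings, 2019) == (2018, "FGCS", "Q1")
    assert get_in_ordered_list(rankings, 2022) == (2021, "FGCS", "Q1-Q2")
    assert get_in_ordered_list([], 2022) is None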