Commit 79045f46 authored by Georges Da Costa
Refactors SJR part

parent cebf5d40
@@ -4,63 +4,20 @@ import logging
 from tqdm import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 import os
-import requests
 import datetime
 from dateutil.parser import parse as parsedate
 from bs4 import BeautifulSoup
 import pandas as pd
-import numpy
-import json
 import argparse
+from hash_cache import load_hash_caches, save_hash_caches, default_cache
+from tools import levenshtein, download, get_in_ordered_list
-LOG = logging.getLogger(__name__)
-def getwithpb(url):
-    LOG.info(f"fetching {url}")
-    r = requests.get(url, stream=True)
-    data = b""
-    total_size = int(r.headers.get("content-length", 0))
-    for chunk in tqdm(
-        r.iter_content(32 * 1024),
-        total=total_size,
-        unit="B",
-        unit_scale=True,
-        leave=False,
-    ):
-        if chunk:
-            data += chunk
-    return data
-def fgetwithpb(url, filename):
-    LOG.info(f"fetching {url}")
-    r = requests.get(url, stream=True)
-    data = b""
-    total_size = int(r.headers.get("content-length", 0))
-    with open(filename, "wb") as file:
-        for chunk in tqdm(
-            r.iter_content(32 * 1024),
-            total=total_size,
-            unit="B",
-            unit_scale=True,
-            leave=False,
-        ):
-            if chunk:
-                file.write(chunk)
-                data += chunk
-    return data
 def comp_lower(a, b):
     return isinstance(a, str) and isinstance(b, str) and a.lower() == b.lower()
-def default_cache():
-    return os.environ["HOME"] + "/.local/state/pyrank"
 def get_dblp(url, cache=True, cache_dir=None):
     if cache_dir is None:
         cache_dir = default_cache()
@@ -68,7 +25,7 @@ def get_dblp(url, cache=True, cache_dir=None):
     filename = "%s/%s" % (cache_dir, target.replace("/", "_"))
     os.makedirs(cache_dir, exist_ok=True)
     if not os.path.exists(filename) or not cache:
-        data = fgetwithpb(url, filename)
+        data = download(url, filename)
     else:
         with open(filename, "rb") as file:
            data = file.read()
@@ -115,7 +72,7 @@ def get_core_rank(name, year):
        source,
    )
-    data = getwithpb(url)
+    data = download(url)
    cc_soup = BeautifulSoup(data, "html.parser")
    table = cc_soup.find_all("table")
    if len(table) == 0:
@@ -129,125 +86,82 @@ def get_core_rank(name, year):
     return None
-def levenshteinDistanceDP(token1, token2):
-    distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
-    for t1 in range(len(token1) + 1):
-        distances[t1][0] = t1
-    for t2 in range(len(token2) + 1):
-        distances[0][t2] = t2
-    a = 0
-    b = 0
-    c = 0
-    for t1 in range(1, len(token1) + 1):
-        for t2 in range(1, len(token2) + 1):
-            if token1[t1 - 1] == token2[t2 - 1]:
-                distances[t1][t2] = distances[t1 - 1][t2 - 1]
-            else:
-                a = distances[t1][t2 - 1]
-                b = distances[t1 - 1][t2]
-                c = distances[t1 - 1][t2 - 1]
-                if a <= b and a <= c:
-                    distances[t1][t2] = a + 1
-                elif b <= a and b <= c:
-                    distances[t1][t2] = b + 1
-                else:
-                    distances[t1][t2] = c + 1
-    return distances[len(token1)][len(token2)]
-def list_to_hash(content):
-    return {tuple(elem[0]): elem[1] for elem in content}
-def load_ranking_caches(basename, cache_dir=None):
-    if cache_dir is None:
-        cache_dir = default_cache()
-    core = "%s/%s.json" % (cache_dir, basename)
-    if os.path.exists(core):
-        with open(core, "r") as fid:
-            # for elem in
-            return list_to_hash(json.load(fid))
-    return {}
-def hash_to_list(content):
-    return [[a, content[a]] for a in content]
-def save_ranking_caches(cache, basename, cache_dir=None):
-    if cache_dir is None:
-        cache_dir = default_cache()
-    os.makedirs(cache_dir, exist_ok=True)
-    core = "%s/%s.json" % (cache_dir, basename)
-    with open(core, "w") as fid:
-        json.dump(hash_to_list(cache), fid)
-def get_sjr_in_cache(rankings, str_year):
-    year = int(str_year)
-    if rankings == []:
-        return None
-    current = rankings[0]
-    for elem in rankings[1:]:
-        if year < elem[0]:
-            return current
-        current = elem
-    return current
-def get_sjr_rank(name):
-    url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
-    data = getwithpb(url)
-    sjr_soup = BeautifulSoup(data, "html.parser")
-    revues = sjr_soup.find("div", class_="search_results")
-    dist = -1
-    reference = None
-    best_name = None
-    for revue in revues.find_all("a"):
-        tmp = revue.find("span").text
-        lev = levenshteinDistanceDP(tmp, name)
-        if dist == -1 or lev < dist:
-            dist = lev
-            best_name = tmp
-            reference = "https://www.scimagojr.com/%s" % revue["href"]
-    if reference is None:
-        return []
-    data = getwithpb(reference)
-    sjr_soup = BeautifulSoup(data, "html.parser")
-    table = sjr_soup.find_all("table")
-    if len(table) == 0:
-        return []
-    df = pd.read_html(str(table))[0]
-    if "Quartile" in df:
-        df["Rank"] = [int(val[1]) for val in df.Quartile]
-    else:
-        return []
-    mins = df.groupby("Year").min().Rank
-    maxs = df.groupby("Year").max().Rank.to_dict()
-    result = []
-    for (y, v) in mins.items():
-        if v == maxs[y]:
-            ranking = "Q%s" % v
-        else:
-            ranking = "Q%s-Q%s" % (v, maxs[y])
-        result.append((y, best_name, ranking))
-    return result
+class Sjr:
+    def __init__(self):
+        self.ranking_caches = load_hash_caches("sjr")
+    def close(self):
+        save_hash_caches(self.ranking_caches, "sjr")
+    def get(self, name, second_name, year):
+        if (name, second_name) in self.ranking_caches:
+            rankings = self.ranking_caches[(name, second_name)]
+        else:
+            rankings = self.get_sjr_rank(name)
+            self.ranking_caches[(name, second_name)] = rankings
+        rank = get_in_ordered_list(rankings, int(year))
+        if rank is None:
+            return ["J", name, second_name, int(year), None, None, None]
+        else:
+            return ["J", name, second_name, int(year), rank[1], None, rank[2]]
+    def get_sjr_rank(self, name):
+        url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
+        data = download(url)
+        sjr_soup = BeautifulSoup(data, "html.parser")
+        revues = sjr_soup.find("div", class_="search_results")
+        dist = -1
+        reference = None
+        best_name = None
+        for revue in revues.find_all("a"):
+            tmp = revue.find("span").text
+            lev = levenshtein(tmp, name)
+            if dist == -1 or lev < dist:
+                dist = lev
+                best_name = tmp
+                reference = "https://www.scimagojr.com/%s" % revue["href"]
+        if reference is None:
+            return []
+        data = download(reference)
+        sjr_soup = BeautifulSoup(data, "html.parser")
+        table = sjr_soup.find_all("table")
+        if len(table) == 0:
+            return []
+        df = pd.read_html(str(table))[0]
+        if "Quartile" in df:
+            df["Rank"] = [int(val[1]) for val in df.Quartile]
+        else:
+            return []
+        mins = df.groupby("Year").min().Rank
+        maxs = df.groupby("Year").max().Rank.to_dict()
+        result = []
+        for (y, v) in mins.items():
+            if v == maxs[y]:
+                ranking = "Q%s" % v
+            else:
+                ranking = "Q%s-Q%s" % (v, maxs[y])
+            result.append((y, best_name, ranking))
+        return result
 def main():
-    sjr_ranking_caches = load_ranking_caches("sjr")
-    core_ranking_caches = load_ranking_caches("core")
+    sjr = Sjr()
+    #sjr_ranking_caches = load_hash_caches("sjr")
+    core_ranking_caches = load_hash_caches("core")
     parser = argparse.ArgumentParser(
         description="Get ranking from DBLP and show a small summary"
@@ -287,12 +201,14 @@ def main():
     logging.basicConfig(level=args.loglevel, format="%(levelname)s %(message)s")
     username, elements = get_dblp(url)
-    print(username)
+    # Keeps only elements in the requested range
+    elements = [elem for elem in elements if start_year <= int(elem[-1]) <= end_year]
+    print(username)
     result = []
     with logging_redirect_tqdm():
         for venue, name, second_name, year in tqdm(elements):
-            if start_year <= int(year) <= end_year:
             if venue == "conf":
                 if (name, second_name, year) in core_ranking_caches:
                     rank = core_ranking_caches[(name, second_name, year)]
@@ -319,25 +235,13 @@ def main():
                )
            elif venue == "journals":
-                if (name, second_name) in sjr_ranking_caches:
-                    rankings = sjr_ranking_caches[(name, second_name)]
-                else:
-                    rankings = get_sjr_rank(name)
-                    sjr_ranking_caches[(name, second_name)] = rankings
-                rank = get_sjr_in_cache(rankings, year)
-                if rank is None:
-                    result.append(
-                        ["J", name, second_name, int(year), None, None, None]
-                    )
-                else:
-                    result.append(
-                        ["J", name, second_name, int(year), rank[1], None, rank[2]]
-                    )
+                result.append(sjr.get(name, second_name, year))
            else:
                tqdm.write(f"venue: {venue} ?")
-    save_ranking_caches(sjr_ranking_caches, "sjr")
-    save_ranking_caches(core_ranking_caches, "core")
+    #save_hash_caches(sjr_ranking_caches, "sjr")
+    save_hash_caches(core_ranking_caches, "core")
+    sjr.close()
     df = pd.DataFrame(
         result, columns=["type", "name", "short", "year", "longname", "acronym", "rank"]
     )
hash_cache.py (new file)

import json
import os


def default_cache():
    return os.environ["HOME"] + "/.local/state/pyrank"


def list_to_hash(content):
    return {tuple(elem[0]): elem[1] for elem in content}


def hash_to_list(content):
    return [[a, content[a]] for a in content]


def load_hash_caches(basename, cache_dir=None):
    if cache_dir is None:
        cache_dir = default_cache()
    core = "%s/%s.json" % (cache_dir, basename)
    if os.path.exists(core):
        with open(core, "r") as fid:
            return list_to_hash(json.load(fid))
    return {}


def save_hash_caches(cache, basename, cache_dir=None):
    if cache_dir is None:
        cache_dir = default_cache()
    os.makedirs(cache_dir, exist_ok=True)
    core = "%s/%s.json" % (cache_dir, basename)
    with open(core, "w") as fid:
        json.dump(hash_to_list(cache), fid)
tools.py (new file)

import numpy
import requests
import logging
from tqdm import tqdm

LOG = logging.getLogger(__name__)


def levenshtein(token1, token2):
    distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1
    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
    a = 0
    b = 0
    c = 0
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if token1[t1 - 1] == token2[t2 - 1]:
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                if a <= b and a <= c:
                    distances[t1][t2] = a + 1
                elif b <= a and b <= c:
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1
    return distances[len(token1)][len(token2)]


def download(url, filename=None):
    LOG.info(f"fetching {url}")
    r = requests.get(url, stream=True)
    data = b""
    total_size = int(r.headers.get("content-length", 0))
    for chunk in tqdm(
        r.iter_content(32 * 1024),
        total=total_size,
        unit="B",
        unit_scale=True,
        leave=False,
    ):
        if chunk:
            data += chunk
    if not filename is None:
        with open(filename, "wb") as file:
            file.write(data)
    return data


def get_in_ordered_list(ordered_list, year):
    if ordered_list == []:
        return None
    current = ordered_list[0]
    for elem in ordered_list[1:]:
        if year < elem[0]:
            return current
        current = elem
    return current
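
After the refactor, journal lookups go through a small object with an explicit lifecycle: construct it (which loads the on-disk cache), call get() once per publication, and close() to persist the cache. A minimal usage sketch, assuming the Sjr class from the main script is in scope, a live connection to scimagojr.com, and a placeholder journal name and year:

import logging

logging.basicConfig(level=logging.INFO)

sjr = Sjr()  # loads ~/.local/state/pyrank/sjr.json when it exists
# Returns ["J", name, short_name, year, matched_name, None, quartile],
# with None in the last three fields when no ranking is found.
row = sjr.get("Future Generation Computer Systems", "fgcs", 2021)
print(row)
sjr.close()  # saves the (possibly updated) ranking cache for the next run
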