Skip to content
Snippets Groups Projects
Commit 3af91ee4 authored by Georges Da Costa
Browse files

Uses ISSN for SJR

parent 79045f46
No related branches found
No related tags found
No related merge requests found
......@@ -9,15 +9,16 @@ from dateutil.parser import parse as parsedate
from bs4 import BeautifulSoup
import pandas as pd
import argparse
import re
from hash_cache import load_hash_caches, save_hash_caches, default_cache
from tools import levenshtein, download, get_in_ordered_list
def comp_lower(a, b):
    """Return True when *a* and *b* are both strings that match caselessly.

    Any non-string argument (e.g. ``None``) makes the result ``False``
    instead of raising.

    Args:
        a: first value to compare.
        b: second value to compare.

    Returns:
        bool: ``True`` only if both values are ``str`` and are equal once
        case differences are ignored.
    """
    if not (isinstance(a, str) and isinstance(b, str)):
        return False
    # casefold() is the caseless-matching form of lower(): it also folds
    # characters such as "ß" -> "ss" that lower() leaves untouched.
    return a.casefold() == b.casefold()
def get_dblp(url, cache=True, cache_dir=None):
if cache_dir is None:
cache_dir = default_cache()
......@@ -86,13 +87,6 @@ def get_core_rank(name, year):
return None
class Sjr:
def __init__(self):
self.ranking_caches = load_hash_caches("sjr")
......@@ -100,12 +94,21 @@ class Sjr:
def close(self):
    """Pass the instance's ranking caches to ``save_hash_caches`` as "sjr"."""
    caches = self.ranking_caches
    save_hash_caches(caches, "sjr")
def get_issn(self, acronym):
    """Fetch a journal's dblp index page and extract its name and ISSN.

    Downloads ``https://dblp.org/db/journals/<acronym>/index.html`` and
    parses it with BeautifulSoup's ``html.parser``.

    Args:
        acronym: value substituted into the dblp journal URL.

    Returns:
        tuple: ``(full_name, issn)`` where ``full_name`` is the text of the
        first ``<h1>`` element and ``issn`` is the text of the first ``<a>``
        whose ``href`` starts with the ISSN portal resource URL.

    Raises:
        AttributeError: if the page has no ``<h1>`` or no matching ISSN link
            (``find`` returns ``None`` and ``.text`` is then accessed).
    """
    data = download(f"https://dblp.org/db/journals/{acronym}/index.html")
    soup = BeautifulSoup(data, "html.parser")
    full_name = soup.find("h1").text
    # Raw string with escaped dots so that "." only matches a literal dot
    # in the host name rather than any character.
    issn_href = re.compile(r"^https://portal\.issn\.org/resource/ISSN/")
    issn = soup.find("a", attrs={"href": issn_href}).text
    return (full_name, issn)
def get(self, name, second_name, year):
if (name, second_name) in self.ranking_caches:
rankings = self.ranking_caches[(name, second_name)]
else:
rankings = self.get_sjr_rank(name)
_ , issn = self.get_issn(second_name)
rankings = self.get_sjr_rank(issn)
self.ranking_caches[(name, second_name)] = rankings
rank = get_in_ordered_list(rankings, int(year))
if rank is None:
......@@ -114,7 +117,9 @@ class Sjr:
return ["J", name, second_name, int(year), rank[1], None, rank[2]]
def get_sjr_rank(self, name):
url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(
" ", "+"
)
data = download(url)
sjr_soup = BeautifulSoup(data, "html.parser")
......@@ -129,6 +134,9 @@ class Sjr:
dist = lev
best_name = tmp
reference = "https://www.scimagojr.com/%s" % revue["href"]
if dist == 0:
break
if reference is None:
return []
......@@ -155,12 +163,10 @@ class Sjr:
result.append((y, best_name, ranking))
return result
def main():
sjr = Sjr()
#sjr_ranking_caches = load_hash_caches("sjr")
core_ranking_caches = load_hash_caches("core")
parser = argparse.ArgumentParser(
......@@ -209,39 +215,37 @@ def main():
result = []
with logging_redirect_tqdm():
for venue, name, second_name, year in tqdm(elements):
if venue == "conf":
if (name, second_name, year) in core_ranking_caches:
rank = core_ranking_caches[(name, second_name, year)]
else:
rank = get_core_rank(name, year)
if rank is None:
rank = get_core_rank(second_name, year)
core_ranking_caches[(name, second_name, year)] = rank
if venue == "conf":
if (name, second_name, year) in core_ranking_caches:
rank = core_ranking_caches[(name, second_name, year)]
else:
rank = get_core_rank(name, year)
if rank is None:
result.append(
["C", name, second_name, int(year), None, None, None]
)
else:
result.append(
[
"C",
name,
second_name,
int(year),
rank[1],
rank[2],
rank[0],
]
)
elif venue == "journals":
result.append(sjr.get(name, second_name, year))
rank = get_core_rank(second_name, year)
core_ranking_caches[(name, second_name, year)] = rank
if rank is None:
result.append(["C", name, second_name, int(year), None, None, None])
else:
tqdm.write(f"venue: {venue} ?")
#save_hash_caches(sjr_ranking_caches, "sjr")
result.append(
[
"C",
name,
second_name,
int(year),
rank[1],
rank[2],
rank[0],
]
)
elif venue == "journals":
result.append(sjr.get(name, second_name, year))
else:
tqdm.write(f"venue: {venue} ?")
save_hash_caches(core_ranking_caches, "core")
sjr.close()
df = pd.DataFrame(
result, columns=["type", "name", "short", "year", "longname", "acronym", "rank"]
)
......@@ -258,16 +262,14 @@ def main():
else:
print("Ending year", max(df["year"]))
selection = df[(df["year"] >= start_year) & (df["year"] <= end_year)]
print(
"Not found",
len(selection) - selection["rank"].count(),
len(df) - df["rank"].count(),
"out of a total of",
len(selection),
len(df),
)
evaluation = selection.groupby("rank").count()
evaluation = df.groupby("rank").count()
print(
evaluation.drop(
["name", "short", "year", "longname", "acronym"], axis=1
......@@ -275,12 +277,11 @@ def main():
)
if not csv_output is None:
selection.to_csv(csv_output, index=False)
df.to_csv(csv_output, index=False)
if display_list:
pd.set_option("display.max_rows", len(selection))
print(selection)
pd.set_option("display.max_rows", len(df))
print(df)
if __name__ == "__main__":
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment