Commit cebf5d40 authored by Georges Da Costa

Merge branch 'nopid' into 'main'

Improves date behavior

See merge request !1
parents 2a3146a8 d4dca0bb
#!/usr/bin/env python3
import logging
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
import os
import requests
import datetime
@@ -10,74 +13,122 @@ import numpy
import json
import argparse
LOG = logging.getLogger(__name__)
def getwithpb(url):
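    """Download url and return the response body as bytes, showing a tqdm progress bar."""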
    LOG.info(f"fetching {url}")
    r = requests.get(url, stream=True)
    data = b""
    total_size = int(r.headers.get("content-length", 0))
    for chunk in tqdm(
        r.iter_content(32 * 1024),
        total=total_size,
        unit="B",
        unit_scale=True,
        leave=False,
    ):
        if chunk:
            data += chunk
    return data
def fgetwithpb(url, filename):
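    """Stream url into filename while also returning the downloaded bytes, with a tqdm progress bar."""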
    LOG.info(f"fetching {url}")
    r = requests.get(url, stream=True)
    data = b""
    total_size = int(r.headers.get("content-length", 0))
    with open(filename, "wb") as file:
        for chunk in tqdm(
            r.iter_content(32 * 1024),
            total=total_size,
            unit="B",
            unit_scale=True,
            leave=False,
        ):
            if chunk:
                file.write(chunk)
                data += chunk
    return data
def comp_lower(a, b):
    return isinstance(a, str) and isinstance(b, str) and a.lower() == b.lower()
def default_cache():
    return os.environ["HOME"] + "/.local/state/pyrank"
def get_dblp(url, cache=True, cache_dir=None):
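    """Fetch a DBLP page (through the on-disk cache unless cache is False) and return
    its title plus a list of [venue, name, second_name, year] publication entries."""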
    if cache_dir is None:
        cache_dir = default_cache()
    _, target = url.split("//")
    filename = "%s/%s" % (cache_dir, target.replace("/", "_"))
    os.makedirs(cache_dir, exist_ok=True)
    if not os.path.exists(filename) or not cache:
        data = fgetwithpb(url, filename)
    else:
        with open(filename, "rb") as file:
            data = file.read()
    soup = BeautifulSoup(data, "html.parser")
    articles = soup.find_all("li", class_="entry")
    res = []
    for a in articles:
        if "inproceedings" in a["class"] or "article" in a["class"]:
            name = (
                a.find("span", itemprop="isPartOf").find("span", itemprop="name").text
            )
            year = a.find("span", itemprop="datePublished").text
            venue, second_name, _ = a["id"].split("/")
            res.append([venue, name, second_name, year])
    return soup.title.text, res
def get_core_year(year):
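    """Map a publication year to the CORE/ERA ranking edition to query."""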
    if year >= 2021:
        return "CORE2021"
    if year >= 2020:
        return "CORE2020"
    if year >= 2018:
        return "CORE2018"
    if year >= 2017:
        return "CORE2017"
    if year >= 2014:
        return "CORE2014"
    if year >= 2013:
        return "CORE2013"
    if year >= 2010:
        return "ERA2010"
    return "CORE2008"
def get_core_rank(name, year):
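    """Search the CORE portal for name in the edition matching year; return
    (rank, title, acronym) or None when nothing matches."""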
    source = get_core_year(int(year))
    url = "http://portal.core.edu.au/conf-ranks/?search=%s&by=all&source=%s&page=1" % (
        name,
        source,
    )
    data = getwithpb(url)
    cc_soup = BeautifulSoup(data, "html.parser")
    table = cc_soup.find_all("table")
    if len(table) == 0:
        return None
    df = pd.read_html(str(table))[0]
    for index, row in df.iterrows():
        # print(name, year, ' ', row.Title, row.Acronym, row.Rank)
        if comp_lower(row.Title, name) or comp_lower(row.Acronym, name):
            return row.Rank, row.Title, row.Acronym
    return None
def levenshteinDistanceDP(token1, token2):
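    """Compute the Levenshtein edit distance between token1 and token2 with dynamic programming."""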
    distances = numpy.zeros((len(token1) + 1, len(token2) + 1))
@@ -86,23 +137,23 @@ def levenshteinDistanceDP(token1, token2):
    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
    a = 0
    b = 0
    c = 0
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if token1[t1 - 1] == token2[t2 - 1]:
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                if a <= b and a <= c:
                    distances[t1][t2] = a + 1
                elif b <= a and b <= c:
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1
@@ -111,29 +162,33 @@ def levenshteinDistanceDP(token1, token2):
def list_to_hash(content):
    return {tuple(elem[0]): elem[1] for elem in content}
def load_ranking_caches(basename, cache_dir=None):
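    """Load a ranking cache saved as a JSON list of [key, value] pairs and return it as a dict keyed by tuples."""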
    if cache_dir is None:
        cache_dir = default_cache()
    core = "%s/%s.json" % (cache_dir, basename)
    if os.path.exists(core):
        with open(core, "r") as fid:
            # for elem in
            return list_to_hash(json.load(fid))
    return {}
def hash_to_list(content):
    return [[a, content[a]] for a in content]
def save_ranking_caches(cache, basename, cache_dir=None):
    if cache_dir is None:
        cache_dir = default_cache()
    os.makedirs(cache_dir, exist_ok=True)
    core = "%s/%s.json" % (cache_dir, basename)
    with open(core, "w") as fid:
        json.dump(hash_to_list(cache), fid)
def get_sjr_in_cache(rankings, str_year):
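    """Pick from the cached SJR entries the one that applies to str_year, or None."""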
    year = int(str_year)
    if rankings == []:
@@ -145,58 +200,83 @@ def get_sjr_in_cache(rankings, str_year):
            current = elem
    return current
def get_sjr_rank(name):
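    """Search Scimago (SJR) for name, pick the closest title by Levenshtein distance and
    return a list of (year, journal name, quartile) tuples."""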
    url = "https://www.scimagojr.com/journalsearch.php?q=%s" % name.replace(" ", "+")
    data = getwithpb(url)
    sjr_soup = BeautifulSoup(data, "html.parser")
    revues = sjr_soup.find("div", class_="search_results")
    dist = -1
    reference = None
    best_name = None
    for revue in revues.find_all("a"):
        tmp = revue.find("span").text
        lev = levenshteinDistanceDP(tmp, name)
        if dist == -1 or lev < dist:
            dist = lev
            best_name = tmp
            reference = "https://www.scimagojr.com/%s" % revue["href"]
    if reference is None:
        return []
    data = getwithpb(reference)
    sjr_soup = BeautifulSoup(data, "html.parser")
    table = sjr_soup.find_all("table")
    if len(table) == 0:
        return []
    df = pd.read_html(str(table))[0]
    if "Quartile" in df:
        df["Rank"] = [int(val[1]) for val in df.Quartile]
    else:
        return []
    mins = df.groupby("Year").min().Rank
    maxs = df.groupby("Year").max().Rank.to_dict()
    result = []
    for (y, v) in mins.items():
        if v == maxs[y]:
            ranking = "Q%s" % v
        else:
            ranking = "Q%s-Q%s" % (v, maxs[y])
        result.append((y, best_name, ranking))
    return result
def main():
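    """Fetch the DBLP page given on the command line, rank every entry via CORE
    (conferences) or SJR (journals), and print a summary,
    e.g. get_rankings <dblp url> --start 2015 -o out.csv."""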
    sjr_ranking_caches = load_ranking_caches("sjr")
    core_ranking_caches = load_ranking_caches("core")
    parser = argparse.ArgumentParser(
        description="Get ranking from DBLP and show a small summary"
    )
    parser.add_argument("url", help="DBLP url")
    parser.add_argument("--start", type=int, default=-1, help="starting year")
    parser.add_argument("--end", type=int, default=10000, help="ending year")
    parser.add_argument(
        "-o", metavar=("output.csv"), default=None, help="output csv file"
    )
    parser.add_argument(
        "-d", action="store_true", help="display conference and journal list"
    )
    parser.add_argument(
        "--debug",
        help="Print lots of debugging statements",
        action="store_const",
        dest="loglevel",
        const=logging.DEBUG,
        default=logging.WARNING,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Be verbose",
        action="store_const",
        dest="loglevel",
        const=logging.INFO,
    )
    args = parser.parse_args()
    url = args.url
@@ -204,69 +284,99 @@ def main():
    csv_output = args.o
    start_year = args.start
    display_list = args.d
    logging.basicConfig(level=args.loglevel, format="%(levelname)s %(message)s")
    username, elements = get_dblp(url)
    print(username)
    result = []
    with logging_redirect_tqdm():
        for venue, name, second_name, year in tqdm(elements):
            if start_year <= int(year) <= end_year:
                if venue == "conf":
                    if (name, second_name, year) in core_ranking_caches:
                        rank = core_ranking_caches[(name, second_name, year)]
                    else:
                        rank = get_core_rank(name, year)
                        if rank is None:
                            rank = get_core_rank(second_name, year)
                        core_ranking_caches[(name, second_name, year)] = rank
                    if rank is None:
                        result.append(
                            ["C", name, second_name, int(year), None, None, None]
                        )
                    else:
                        result.append(
                            [
                                "C",
                                name,
                                second_name,
                                int(year),
                                rank[1],
                                rank[2],
                                rank[0],
                            ]
                        )
                elif venue == "journals":
                    if (name, second_name) in sjr_ranking_caches:
                        rankings = sjr_ranking_caches[(name, second_name)]
                    else:
                        rankings = get_sjr_rank(name)
                        sjr_ranking_caches[(name, second_name)] = rankings
                    rank = get_sjr_in_cache(rankings, year)
                    if rank is None:
                        result.append(
                            ["J", name, second_name, int(year), None, None, None]
                        )
                    else:
                        result.append(
                            ["J", name, second_name, int(year), rank[1], None, rank[2]]
                        )
                else:
                    tqdm.write(f"venue: {venue} ?")
save_ranking_caches(sjr_ranking_caches, "sjr")
save_ranking_caches(core_ranking_caches, "core")
df = pd.DataFrame(
result, columns=["type", "name", "short", "year", "longname", "acronym", "rank"]
)
df = df.fillna(value="")
if start_year != -1:
print("Starting year", start_year)
    else:
        print("Starting year", min(df["year"]))
    if end_year != 10000:
        print("Ending year", end_year)
    else:
        print("Ending year", max(df["year"]))
    selection = df[(df["year"] >= start_year) & (df["year"] <= end_year)]
    print(
        "Not found",
        len(selection) - selection["rank"].count(),
        "out of a total of",
        len(selection),
    )
    evaluation = selection.groupby("rank").count()
    print(
        evaluation.drop(
            ["name", "short", "year", "longname", "acronym"], axis=1
        ).rename(columns={"type": "number"})
    )
    if not csv_output is None:
        selection.to_csv(csv_output, index=False)
    if display_list:
        pd.set_option("display.max_rows", len(selection))
        print(selection)
if __name__ == "__main__":
    main()
@@ -19,7 +19,7 @@ setuptools.setup(
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    install_requires=['requests', 'BeautifulSoup4', 'datetime', 'parsedate', 'pandas', 'numpy', 'argparse', 'lxml', 'tqdm'],
    entry_points={
        'console_scripts': [
            'get_rankings = get_rankings.get_rankings:main',