class AuthenticationError(AssertionError):
    """Raised when the SSO login flow does not follow the expected script.

    Derives from ``AssertionError`` because the previous implementation
    reported every failure through ``assert``; existing
    ``except AssertionError`` handlers therefore keep working, while the
    checks are no longer stripped when Python runs with ``-O``.
    """


# Every POST of the login flow submits a classic url-encoded HTML form.
_FORM_HEADERS = {"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"}


def parse_credentials_from_file(credentials_filename):
    """Read a username/password pair from a JSON credentials file.

    The file must contain a JSON object with ``username`` and ``password``
    keys; any other key is ignored.

    Args:
        credentials_filename: Path of the JSON file to read.

    Returns:
        A ``(username, password)`` tuple.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If ``username`` or ``password`` is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding (which would mangle non-ASCII passwords on some systems).
    with open(credentials_filename, 'r', encoding='utf-8') as f:
        credentials_dict = json.load(f)
    return credentials_dict['username'], credentials_dict['password']


def _require(condition, message):
    """Raise ``AuthenticationError(message)`` unless *condition* is truthy."""
    if not condition:
        raise AuthenticationError(message)


def _require_ok(response, step):
    """Fail with a descriptive error if *response* is not a success status."""
    _require(
        response.ok,
        f"{step}: HTTP request failed "
        f"(status code {response.status_code}): {response.reason}",
    )


def _single_form(response, step, expected_action, expected_title=None):
    """Parse *response* as HTML and return its one and only ``<form>``.

    The page title (when *expected_title* is given), the number of forms,
    the form ``action`` and the form ``method`` are all validated, so that
    any change in the remote login pages is reported instead of silently
    posting data to the wrong place.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    if expected_title is not None:
        # A page without <title> must fail the check, not raise AttributeError.
        title = soup.title.string if soup.title is not None else None
        _require(
            title == expected_title,
            f"{step}: unexpected page title {title!r} (expected {expected_title!r})",
        )
    forms = soup.find_all('form')
    _require(len(forms) == 1, f"1 form expected in html document but {len(forms)} were found")
    form = forms[0]
    action = form.attrs.get('action')
    _require(
        action == expected_action,
        f"{step}: unexpected form action {action!r} (expected {expected_action!r})",
    )
    method = form.attrs.get('method') or ''
    _require(method.lower() == 'post', f"{step}: unexpected form method {method!r}")
    return form


def _hidden_fields(form):
    """Return the ``(name, value)`` pairs of the hidden inputs of *form*.

    Inputs without a ``name`` are skipped (a browser would not submit them)
    and a missing ``value`` is sent as an empty string.
    """
    return [
        (tag.attrs['name'], tag.attrs.get('value', ''))
        for tag in form.find_all('input')
        if tag.attrs.get('type') == 'hidden' and 'name' in tag.attrs
    ]


def _post_form(session, url, fields, step):
    """POST *fields* url-encoded to *url* and return the successful response.

    *fields* is a dict or a sequence of ``(name, value)`` pairs.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule is loaded.
    from urllib.parse import urlencode

    response = session.post(url, headers=_FORM_HEADERS, data=urlencode(fields),
                            allow_redirects=True)
    _require_ok(response, step)
    return response


def _run_login_flow(s, username, password):
    """Drive session *s* through the five-step login sequence."""
    # first simple connection. can become useful if it sets cookies
    r0 = s.get('https://edt.univ-tlse3.fr/calendar2')
    _require_ok(r0, "initial request")

    # start login procedure
    r1 = s.get('https://edt.univ-tlse3.fr/calendar2/Login', allow_redirects=True)
    _require_ok(r1, "login start")
    form1 = _single_form(
        r1, "login start",
        expected_action='/idp/profile/SAML2/Redirect/SSO?execution=e1s1',
        expected_title="Service Web d'Authentification - Loading Session Information",
    )

    # click continue...
    r2 = _post_form(
        s, 'https://idp.univ-tlse3.fr/idp/profile/SAML2/Redirect/SSO?execution=e1s1',
        dict(_hidden_fields(form1)), "session information")
    form2 = _single_form(
        r2, "session information",
        expected_action='login',
        expected_title='CAS - Central Authentication Service',
    )

    # give login information
    r3_payload = dict(_hidden_fields(form2))
    r3_payload['username'] = username
    r3_payload['password'] = password
    r3 = _post_form(s, 'https://cas.univ-tlse3.fr/cas/login', r3_payload,
                    "credentials submission (wrong username or password?)")
    form3 = _single_form(
        r3, "credentials submission (wrong username or password?)",
        expected_action='/idp/profile/SAML2/Redirect/SSO?execution=e1s3',
        expected_title='Informations Transmises',
    )

    # do not remember data forwarding consent
    r4_payload = _hidden_fields(form3)
    r4_payload.append(('_shib_idp_consentOptions', '_shib_idp_doNotRememberConsent'))
    r4_payload.append(('_eventId_proceed', 'Accepter'))
    # Url-encoded like every other step: hidden token values may contain
    # characters such as '+', '=' or '&' that must not be sent raw.
    r4 = _post_form(
        s, 'https://idp.univ-tlse3.fr/idp/profile/SAML2/Redirect/SSO?execution=e1s3',
        r4_payload, "consent")
    form4 = _single_form(
        r4, "consent",
        expected_action='https://edt.univ-tlse3.fr/calendar2/Saml/AssertionConsumerService',
    )

    # click continue...
    _post_form(
        s, 'https://edt.univ-tlse3.fr/calendar2/Saml/AssertionConsumerService',
        dict(_hidden_fields(form4)), "assertion consumer service")


def create_authenticated_session(username, password):
    """Log in to the CELCAT calendar and return the authenticated session.

    The function replays what a browser does: it opens the calendar, follows
    the login redirect, then submits in turn the session-information form,
    the CAS login form (with *username* and *password*), the consent form
    (choosing not to remember the consent) and finally the SAML assertion
    form back to the calendar. Each intermediate page is checked against the
    expected title and form so that a change of the remote pages is detected.

    Args:
        username: Login to submit to the CAS form.
        password: Password to submit to the CAS form.

    Returns:
        The ``requests.Session`` holding the cookies set during the login.

    Raises:
        AuthenticationError: If a request fails or a page is not the
            expected one. Error messages never contain the credentials.
    """
    s = requests.Session()
    try:
        _run_login_flow(s, username, password)
    except BaseException:
        # Do not leak the connection pool when the login is aborted.
        s.close()
        raise
    return s
def do_request(self, session, url='https://edt.univ-tlse3.fr/calendar2/Home/GetCalendarData'):
    """Run the calendar request described by this object through *session*.

    The request input is obtained from ``self.generate_request_input()`` and
    handed, together with *session* and *url*, to
    ``fetch.do_celcat_calendar_request``, whose result is returned as is.

    Args:
        session: Session object forwarded to the fetch helper
            (presumably an authenticated ``requests.Session`` -- confirm
            against the caller).
        url: Endpoint forwarded to the fetch helper.
    """
    request_input = self.generate_request_input()
    # Explicit three-way unpacking: a malformed input fails here, before any
    # request is attempted.
    first_day, last_day, codes = request_input
    return fetch.do_celcat_calendar_request(first_day, last_day, codes, session, url)
data=fields_str, headers=headers) if not response.ok: logging.error(f'POST HTTP request failed (status code {response.status_code}): {response.reason}') logging.error(f'Request response text:\n---\n{response.text}\n---') diff --git a/pyproject.toml b/pyproject.toml index b88149d46ba393bc9b4e5d3393842dd980df0cb5..4113012610c4b9318f1fc83c286ab49279396319 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,8 @@ dependencies = [ "ics>=0.7.0", "pandas>=1.3.0", "requests>=2.26.0", - "click>=8.0.0" + "click>=8.0.0", + "beautifulsoup4>=4.10.0" ] [project.scripts]