Last active
November 12, 2019 18:09
-
-
Save mariogeiger/ca07b3d55f99eb91e4f2dce5b57bdc88 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"raw = r'''\n", | |
"<html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"><link rel=\"stylesheet\" type=\"text/css\" href=\"gedpublicreports.css?ww_x_path=Gestac.Moniteur.Style\"></head><body bgcolor=\"#ffffff\" marginheight=\"0\" marginwidth=\"5\" link=\"#666666\" vlink=\"#666666\" alink=\"#666666\"><fieldset style=\"text-align:right; width:40%; position:relative; margin-right: 10px;float:right; border: 0; padding: 0 0 8px 0;\"><a style=\"color:#990033;\" href=\"!GEDREPORTS.html?ww_x_MAT=80890310&ww_x_CLASSE=null&ww_i_reportModel=66627699&ww_x_MATIERE=%2A&ww_x_PERIODE_ACAD=1866894985&ww_i_reportModelXsl=66627723&ww_x_HIVERETE=2936286\">Identification pour accéder aux e-mails<br>Login to access email adresses</a><br>Nous conseillons aux utilisateurs de Safari d'installer le certificat EPFL pour éviter un affichage pleine page de la liste après authentification.</fieldset><h1>Extraction : Inscriptions aux cours par matières</h1><hr style=\"height:0px;visibility: hidden;display:block;width:0px; float:none; clear:both; color: #ffffff;\"><script type=\"text/javascript\">\n", | |
" function mailList(x) {\n", | |
" var vtop = (screen.height-200)/2;\n", | |
" var vleft=(screen.width-600)/2;\n", | |
" var w=open(\"\", \"emaillist\", \"Scrollbars=1,resizable=1,width=600,height=200,top=\"+vtop+\",left=\"+vleft);\n", | |
" w.focus();\n", | |
" w.document.write(x);\n", | |
" w.document.close();\n", | |
" }\n", | |
" function reloadToc() {\n", | |
" var url = top.toc.location.href.replace(\"GEDPUBLICREPORTS\", \"GEDREPORTS\");\n", | |
" top.toc.location.href = url;\n", | |
" }\n", | |
" function reloadEntete() {\n", | |
" var url = \"https://\" + top.entete.location.host,\n", | |
" path = top.entete.location.pathname;\n", | |
" url = url + path.replace(/(\\/.*\\/).*/i, \"$1gestac.entete\");\n", | |
" top.entete.location.href=url;\n", | |
" }\n", | |
" </script><table border=\"0\" width=\"100%\"><tr><td></td></tr><tr><th><font color=\"black\">Advanced biomedical imaging methods and instrumentation</font></th><th>Enseignant-e-(s): Gruetter Rolf</th></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Génie électrique (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (1 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Vilaclara Laura</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Génie électrique (edoc), EDOC</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Neurosciences (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (3 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Beanato Elena</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Ceylan Gizay</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Lin Wei-Hsiang</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Physique (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (5 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Croese Jared</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Ibtisam Aslam Ibtisam</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Kulesz Karolina</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Mosso Jessie Julie</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Physique (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Wiström Emma</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"></table></td></tr></table></body></html>\n", | |
"<!-- OpenXml:0.00s agent ctrl:0.00s xml:0.04s xsl extr&stylesheet:0.00s xsl after parsing:0.00s xsl ctrl data:0.00s transform 2:0.01s xsl process:0.00s -->\n", | |
"'''" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"><link rel=\"stylesheet\" type=\"text/css\" href=\"gedpublicreports.css?ww_x_path=Gestac.Moniteur.Style\"></head><body bgcolor=\"#ffffff\" marginheight=\"0\" marginwidth=\"5\" link=\"#666666\" vlink=\"#666666\" alink=\"#666666\"><fieldset style=\"text-align:right; width:40%; position:relative; margin-right: 10px;float:right; border: 0; padding: 0 0 8px 0;\"><a style=\"color:#990033;\" href=\"!GEDREPORTS.html?ww_x_MAT=80890310&ww_x_CLASSE=null&ww_i_reportModel=66627699&ww_x_MATIERE=%2A&ww_x_PERIODE_ACAD=1866894985&ww_i_reportModelXsl=66627723&ww_x_HIVERETE=2936286\">Identification pour accéder aux e-mails<br>Login to access email adresses</a><br>Nous conseillons aux utilisateurs de Safari d'installer le certificat EPFL pour éviter un affichage pleine page de la liste après authentification.</fieldset><h1>Extraction : Inscriptions aux cours par matières</h1><hr style=\"height:0px;visibility: hidden;display:block;width:0px; float:none; clear:both; color: #ffffff;\"><script type=\"text/javascript\">\n", | |
" function mailList(x) {\n", | |
" var vtop = (screen.height-200)/2;\n", | |
" var vleft=(screen.width-600)/2;\n", | |
" var w=open(\"\", \"emaillist\", \"Scrollbars=1,resizable=1,width=600,height=200,top=\"+vtop+\",left=\"+vleft);\n", | |
" w.focus();\n", | |
" w.document.write(x);\n", | |
" w.document.close();\n", | |
" }\n", | |
" function reloadToc() {\n", | |
" var url = top.toc.location.href.replace(\"GEDPUBLICREPORTS\", \"GEDREPORTS\");\n", | |
" top.toc.location.href = url;\n", | |
" }\n", | |
" function reloadEntete() {\n", | |
" var url = \"https://\" + top.entete.location.host,\n", | |
" path = top.entete.location.pathname;\n", | |
" url = url + path.replace(/(\\/.*\\/).*/i, \"$1gestac.entete\");\n", | |
" top.entete.location.href=url;\n", | |
" }\n", | |
" </script><table border=\"0\" width=\"100%\"><tr><td></td></tr><tr><th><font color=\"black\">Advanced biomedical imaging methods and instrumentation</font></th><th>Enseignant-e-(s): Gruetter Rolf</th></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Génie électrique (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (1 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Vilaclara Laura</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Génie électrique (edoc), EDOC</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Neurosciences (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (3 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Beanato Elena</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Ceylan Gizay</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Lin Wei-Hsiang</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Physique (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (5 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Croese Jared</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Ibtisam Aslam Ibtisam</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Kulesz Karolina</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Mosso Jessie Julie</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Physique (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Wiström Emma</td><td width=\"80\">\n", | |
" \n", | |
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"></table></td></tr></table></body></html>\n", | |
"<!-- OpenXml:0.00s agent ctrl:0.00s xml:0.04s xsl extr&stylesheet:0.00s xsl after parsing:0.00s xsl ctrl data:0.00s transform 2:0.01s xsl process:0.00s -->\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"from IPython.core.display import display, HTML\n", | |
"display(HTML(raw))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from html.parser import HTMLParser\n", | |
"import json\n", | |
"\n", | |
"class ISA_HTMLParser(HTMLParser):\n", | |
" def __init__(self):\n", | |
" super().__init__()\n", | |
" self.data = []\n", | |
" self.tables = []\n", | |
" self.tr = None\n", | |
" self.entry = None\n", | |
" \n", | |
" def handle_starttag(self, tag, attrs):\n", | |
" if tag == 'table':\n", | |
" self.tables.append([]) # start saving a table (you can have a table into another table)\n", | |
" if tag == 'tr':\n", | |
" self.tr = [] # start saving a line (you cannot have a line into another line)\n", | |
" if tag in ['td', 'th']:\n", | |
" self.entry = [] # start saving an entry (you cannot have an entry into another entry)\n", | |
"\n", | |
" def handle_endtag(self, tag):\n", | |
" if tag == 'table' and self.tables:\n", | |
" table = self.tables.pop()\n", | |
" if table: # don't save empty tables\n", | |
" if self.tables:\n", | |
" self.tables[-1].append(table)\n", | |
" else: # root table\n", | |
" self.data.append(table)\n", | |
" \n", | |
" if tag == 'tr' and type(self.tr) == list:\n", | |
" if self.tables:\n", | |
" if self.tr: # don't save empty lines\n", | |
" self.tables[-1].append(self.tr)\n", | |
" self.tr = None\n", | |
" else:\n", | |
" print(\"tr outside a table: {}\".format(self.tr))\n", | |
" \n", | |
" if tag in ['td', 'th'] and self.entry:\n", | |
" if type(self.tr) == list:\n", | |
" self.tr.append(\"\\n\".join(self.entry))\n", | |
" self.entry = None\n", | |
" else:\n", | |
" print(\"td/th outside a tr: {}\".format(self.td))\n", | |
"\n", | |
" def handle_data(self, data):\n", | |
" if type(self.entry) is list:\n", | |
" self.entry.append(data.strip())\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def parse(raw):\n", | |
" parser = ISA_HTMLParser()\n", | |
" parser.feed(raw)\n", | |
" \n", | |
" if len(parser.data) == 0:\n", | |
" return []\n", | |
" \n", | |
" main = parser.data[0]\n", | |
" \n", | |
" if len(main[0]) != 2:\n", | |
" return []\n", | |
" \n", | |
" course, teacher = main[0]\n", | |
" course = course.strip()\n", | |
" \n", | |
" if 'Enseignant' in teacher:\n", | |
" teacher = teacher.split('Enseignant-e-(s):')[1]\n", | |
" if 'Assistant-e-(s):' in teacher:\n", | |
" teacher, assistant = teacher.split(\"Assistant-e-(s):\")\n", | |
" else:\n", | |
" assistant = \"\"\n", | |
" teacher = teacher.strip()\n", | |
" assistant = assistant.strip()\n", | |
" else:\n", | |
" teacher = teacher.strip()\n", | |
" assistant = \"\"\n", | |
"\n", | |
" students = []\n", | |
" \n", | |
" def fmt(x):\n", | |
" if len(x) == 3:\n", | |
" return x\n", | |
" if len(x) < 3:\n", | |
" return x + [\"\"] * (3 - len(x))\n", | |
" assert False, x\n", | |
" \n", | |
" def valid(x):\n", | |
" if type(x[0]) is list:\n", | |
" return False\n", | |
" return True\n", | |
"\n", | |
" i = 1\n", | |
" while i < len(main):\n", | |
" if len(main[i]) == 1:\n", | |
" label = main[i][0]\n", | |
" i += 1\n", | |
"\n", | |
" if 'ét' in main[i][0]:\n", | |
" i += 1\n", | |
"\n", | |
" students += [fmt(x) + [course, teacher, assistant, label] for x in main[i] if valid(x)]\n", | |
" i += 1\n", | |
" else:\n", | |
" i += 0\n", | |
" else:\n", | |
" i += 1 \n", | |
" \n", | |
" return students" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "6a356ae908c0476aa1487c926ed7fdc2", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=8), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "fc21030f694f4182bdf6b309db8f5f56", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(2012, 'autumn')\n", | |
"(2012, 'spring')\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "f42821e79e57417abbd231f0eb789c4d", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2130), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "cb92917606d44042a87ee50b891d4b13", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(2013, 'autumn')\n", | |
"(2013, 'spring')\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "284df389d14845f69a19563bbfabf51b", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2165), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"ename": "KeyboardInterrupt", | |
"evalue": "", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 376\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Python 2.7, use buffering of HTTP responses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 377\u001b[0;31m \u001b[0mhttplib_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 378\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Python 3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mTypeError\u001b[0m: getresponse() got an unexpected keyword argument 'buffering'", | |
"\nDuring handling of the above exception, another exception occurred:\n", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-28-b0774ef2c36a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 49\u001b[0m ww_x_MATIERE[0])\n\u001b[1;32m 50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_mat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0mhtml\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'latin1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0mxs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 75\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 76\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 60\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 61\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 531\u001b[0m }\n\u001b[1;32m 532\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 533\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 534\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 646\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 647\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 648\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 450\u001b[0m )\n\u001b[1;32m 451\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 600\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 601\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Python 3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 379\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 380\u001b[0;31m \u001b[0mhttplib_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 381\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 382\u001b[0m \u001b[0;31m# Remove the TypeError from the exception chain in Python 3;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1319\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1320\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1321\u001b[0;31m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1322\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1323\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36mbegin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;31m# read until we get a non-100 response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 296\u001b[0;31m \u001b[0mversion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 297\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36m_read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 257\u001b[0;31m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_MAXLINE\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"iso-8859-1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 258\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0m_MAXLINE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mLineTooLong\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"status line\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 589\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 590\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/urllib3/contrib/pyopenssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 296\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 297\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 298\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOpenSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSysCallError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msuppress_ragged_eofs\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Unexpected EOF'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/OpenSSL/SSL.py\u001b[0m in \u001b[0;36mrecv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1819\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL_peek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1820\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1821\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1822\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_ssl_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1823\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m: " | |
] | |
} | |
], | |
"source": [ | |
"import urllib.request, urllib.error, urllib.parse, requests\n", | |
"from tqdm import tqdm_notebook as tqdm\n", | |
"import csv\n", | |
"\n", | |
"url_base = [\n", | |
" \"https://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?\" + \n", | |
" \"&ww_b_list={}&ww_i_reportmodel={}&ww_c_langue={}&ww_i_reportModelXsl={}&\" + \n", | |
" \"ww_x_CLASSE={}&ww_x_PERIODE_ACAD={}&ww_x_HIVERETE={}&ww_x_MATIERE={}\", \n", | |
" \n", | |
" \"https://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?\" + \n", | |
" \"ww_x_MAT={}&ww_x_CLASSE={}&ww_i_reportmodel={}&ww_c_langue={}&\" + \n", | |
" \"ww_i_reportModelXsl={}&ww_x_PERIODE_ACAD={}&ww_x_HIVERETE={}&ww_x_MATIERE={}\"\n", | |
"]\n", | |
"\n", | |
"ww_b_list = [\"1\"]\n", | |
"ww_i_reportmodel = [\"66627699\"]\n", | |
"ww_x_MATIERE = [\"%2A\"] #MATIERE ID\n", | |
"ww_c_langue = [\"\"]\n", | |
"ww_i_reportModelXsl = [\"66627723\", \"66627727\"] #htlm, xls\n", | |
"ww_x_CLASSE = [\"null\"]\n", | |
"ww_x_PERIODE_ACAD = [\"123456101\", \"213637754\", \"213637922\", \"213638028\", \"355925344\", \"355925344\", \"1866893861\", \"1866894985\"]\n", | |
"ww_x_PERIODE_ACAD = list(zip(ww_x_PERIODE_ACAD, range(2012, 2090, 1)))\n", | |
"ww_x_HIVERETE = [(\"2936286\", \"autumn\"), (\"2936295\", \"spring\")]\n", | |
"ww_x_MATIERE = [\"*\"]\n", | |
"\n", | |
"#periods = dict()\n", | |
"\n", | |
"for year, year_name in tqdm(ww_x_PERIODE_ACAD):\n", | |
" for season, season_name in tqdm(ww_x_HIVERETE): \n", | |
" key = (year_name, season_name)\n", | |
" print(key)\n", | |
" if key in periods:\n", | |
" continue\n", | |
" \n", | |
" \n", | |
" url_list = url_base[0].format(ww_b_list[0], ww_i_reportmodel[0], ww_c_langue[0], ww_i_reportModelXsl[0], \n", | |
" ww_x_CLASSE[0], year, season, ww_x_MATIERE[0])\n", | |
" \n", | |
" response = urllib.request.urlopen(url_list)\n", | |
" webContent = str(response.read())\n", | |
" page_splits = webContent.split(\"ww_x_MAT=\")\n", | |
" \n", | |
" xs = []\n", | |
"\n", | |
" for k, split in enumerate(tqdm(page_splits)):\n", | |
" ww_x_MAT = [split[:split.index(\"\\\\\")]]\n", | |
" url_mat = url_base[1].format(ww_x_MAT[0], ww_x_CLASSE[0], ww_i_reportmodel[0], ww_c_langue[0],\n", | |
" ww_i_reportModelXsl[0], year, season, \n", | |
" ww_x_MATIERE[0])\n", | |
" \n", | |
" response = requests.get(url_mat)\n", | |
" html = str(response.content, 'latin1')\n", | |
" xs.append(html)\n", | |
"\n", | |
" periods[key] = xs\n", | |
" import pickle \n", | |
" with open('raw.pkl', 'wb') as f: \n", | |
" pickle.dump(periods, f)\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dict_keys([(2012, 'autumn'), (2013, 'autumn')])" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"periods.keys()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pickle \n", | |
"with open('raw.pkl', 'rb') as f:\n", | |
" periods = pickle.load(f)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "fe2379a7d5644502a4291a45519b3f65", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=3), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "ae0125c4e8ef432da933f7c19c36986a", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2130), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "bc19ccce541b4f8cbd22ae5005dcf3d7", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2165), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "ccb65f630117411dbb17c05b420c5d65", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=2130), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"for (year, season), xs in tqdm(periods.items()):\n", | |
" students = []\n", | |
" for html in tqdm(xs):\n", | |
" try:\n", | |
" students += parse(html)\n", | |
" except:\n", | |
" print(url_mat)\n", | |
" print(html)\n", | |
" display(HTML(html))\n", | |
" stop = True\n", | |
" break\n", | |
"\n", | |
" with open(\"data{}{}.csv\".format(year, season), 'w') as f:\n", | |
" writer = csv.writer(f)\n", | |
" for x in students:\n", | |
" writer.writerow(x)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment