Skip to content

Instantly share code, notes, and snippets.

@mariogeiger
Last active November 12, 2019 18:09
Show Gist options
  • Save mariogeiger/ca07b3d55f99eb91e4f2dce5b57bdc88 to your computer and use it in GitHub Desktop.
Save mariogeiger/ca07b3d55f99eb91e4f2dce5b57bdc88 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"raw = r'''\n",
"<html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"><link rel=\"stylesheet\" type=\"text/css\" href=\"gedpublicreports.css?ww_x_path=Gestac.Moniteur.Style\"></head><body bgcolor=\"#ffffff\" marginheight=\"0\" marginwidth=\"5\" link=\"#666666\" vlink=\"#666666\" alink=\"#666666\"><fieldset style=\"text-align:right; width:40%; position:relative; margin-right: 10px;float:right; border: 0; padding: 0 0 8px 0;\"><a style=\"color:#990033;\" href=\"!GEDREPORTS.html?ww_x_MAT=80890310&amp;ww_x_CLASSE=null&amp;ww_i_reportModel=66627699&amp;ww_x_MATIERE=%2A&amp;ww_x_PERIODE_ACAD=1866894985&amp;ww_i_reportModelXsl=66627723&amp;ww_x_HIVERETE=2936286\">Identification pour accéder aux e-mails<br>Login to access email adresses</a><br>Nous conseillons aux utilisateurs de Safari d'installer le certificat EPFL pour éviter un affichage pleine page de la liste après authentification.</fieldset><h1>Extraction : Inscriptions aux cours par matières</h1><hr style=\"height:0px;visibility: hidden;display:block;width:0px; float:none; clear:both; color: #ffffff;\"><script type=\"text/javascript\">\n",
" function mailList(x) {\n",
" var vtop = (screen.height-200)/2;\n",
" var vleft=(screen.width-600)/2;\n",
" var w=open(\"\", \"emaillist\", \"Scrollbars=1,resizable=1,width=600,height=200,top=\"+vtop+\",left=\"+vleft);\n",
" w.focus();\n",
" w.document.write(x);\n",
" w.document.close();\n",
" }\n",
" function reloadToc() {\n",
" var url = top.toc.location.href.replace(\"GEDPUBLICREPORTS\", \"GEDREPORTS\");\n",
" top.toc.location.href = url;\n",
" }\n",
" function reloadEntete() {\n",
" var url = \"https://\" + top.entete.location.host,\n",
" path = top.entete.location.pathname;\n",
" url = url + path.replace(/(\\/.*\\/).*/i, \"$1gestac.entete\");\n",
" top.entete.location.href=url;\n",
" }\n",
" </script><table border=\"0\" width=\"100%\"><tr><td></td></tr><tr><th><font color=\"black\">Advanced biomedical imaging methods and instrumentation</font></th><th>Enseignant-e-(s): Gruetter Rolf</th></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Génie électrique (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (1 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Vilaclara Laura</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Génie électrique (edoc), EDOC</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Neurosciences (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (3 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Beanato Elena</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Ceylan Gizay</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Lin Wei-Hsiang</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Physique (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (5 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Croese Jared</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Ibtisam Aslam Ibtisam</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Kulesz Karolina</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Mosso Jessie Julie</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Physique (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Wiström Emma</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"></table></td></tr></table></body></html>\n",
"<!-- OpenXml:0.00s agent ctrl:0.00s xml:0.04s xsl extr&stylesheet:0.00s xsl after parsing:0.00s xsl ctrl data:0.00s transform 2:0.01s xsl process:0.00s -->\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"><link rel=\"stylesheet\" type=\"text/css\" href=\"gedpublicreports.css?ww_x_path=Gestac.Moniteur.Style\"></head><body bgcolor=\"#ffffff\" marginheight=\"0\" marginwidth=\"5\" link=\"#666666\" vlink=\"#666666\" alink=\"#666666\"><fieldset style=\"text-align:right; width:40%; position:relative; margin-right: 10px;float:right; border: 0; padding: 0 0 8px 0;\"><a style=\"color:#990033;\" href=\"!GEDREPORTS.html?ww_x_MAT=80890310&amp;ww_x_CLASSE=null&amp;ww_i_reportModel=66627699&amp;ww_x_MATIERE=%2A&amp;ww_x_PERIODE_ACAD=1866894985&amp;ww_i_reportModelXsl=66627723&amp;ww_x_HIVERETE=2936286\">Identification pour accéder aux e-mails<br>Login to access email adresses</a><br>Nous conseillons aux utilisateurs de Safari d'installer le certificat EPFL pour éviter un affichage pleine page de la liste après authentification.</fieldset><h1>Extraction : Inscriptions aux cours par matières</h1><hr style=\"height:0px;visibility: hidden;display:block;width:0px; float:none; clear:both; color: #ffffff;\"><script type=\"text/javascript\">\n",
" function mailList(x) {\n",
" var vtop = (screen.height-200)/2;\n",
" var vleft=(screen.width-600)/2;\n",
" var w=open(\"\", \"emaillist\", \"Scrollbars=1,resizable=1,width=600,height=200,top=\"+vtop+\",left=\"+vleft);\n",
" w.focus();\n",
" w.document.write(x);\n",
" w.document.close();\n",
" }\n",
" function reloadToc() {\n",
" var url = top.toc.location.href.replace(\"GEDPUBLICREPORTS\", \"GEDREPORTS\");\n",
" top.toc.location.href = url;\n",
" }\n",
" function reloadEntete() {\n",
" var url = \"https://\" + top.entete.location.host,\n",
" path = top.entete.location.pathname;\n",
" url = url + path.replace(/(\\/.*\\/).*/i, \"$1gestac.entete\");\n",
" top.entete.location.href=url;\n",
" }\n",
" </script><table border=\"0\" width=\"100%\"><tr><td></td></tr><tr><th><font color=\"black\">Advanced biomedical imaging methods and instrumentation</font></th><th>Enseignant-e-(s): Gruetter Rolf</th></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Génie électrique (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (1 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Vilaclara Laura</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Génie électrique (edoc), EDOC</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Neurosciences (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (3 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Beanato Elena</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Ceylan Gizay</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Lin Wei-Hsiang</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Neurosciences (edoc), EDOC</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"2\"><b>Physique (edoc), 2019-2020</b></td></tr><tr><td colspan=\"2\"><font color=\"black\"> (5 ét.)</font></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"><tr><td width=\"300\">Croese Jared</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Ibtisam Aslam Ibtisam</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Kulesz Karolina</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr><tr><td width=\"300\">Mosso Jessie Julie</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Physique (edoc), EDOC</td><td></td></tr><tr><td width=\"300\">Wiström Emma</td><td width=\"80\">\n",
" &nbsp;\n",
" </td><td>Auditeurs EDOC, 2019-2020</td><td></td></tr></table></td></tr><tr><td></td></tr><tr><td colspan=\"3\"><font color=\"red\"></font><table border=\"0\" width=\"100%\"></table></td></tr></table></body></html>\n",
"<!-- OpenXml:0.00s agent ctrl:0.00s xml:0.04s xsl extr&stylesheet:0.00s xsl after parsing:0.00s xsl ctrl data:0.00s transform 2:0.01s xsl process:0.00s -->\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.core.display import display, HTML\n",
"display(HTML(raw))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"from html.parser import HTMLParser\n",
"import json\n",
"\n",
"class ISA_HTMLParser(HTMLParser):\n",
" def __init__(self):\n",
" super().__init__()\n",
" self.data = []\n",
" self.tables = []\n",
" self.tr = None\n",
" self.entry = None\n",
" \n",
" def handle_starttag(self, tag, attrs):\n",
" if tag == 'table':\n",
" self.tables.append([]) # start saving a table (you can have a table into another table)\n",
" if tag == 'tr':\n",
" self.tr = [] # start saving a line (you cannot have a line into another line)\n",
" if tag in ['td', 'th']:\n",
" self.entry = [] # start saving an entry (you cannot have an entry into another entry)\n",
"\n",
" def handle_endtag(self, tag):\n",
" if tag == 'table' and self.tables:\n",
" table = self.tables.pop()\n",
" if table: # don't save empty tables\n",
" if self.tables:\n",
" self.tables[-1].append(table)\n",
" else: # root table\n",
" self.data.append(table)\n",
" \n",
" if tag == 'tr' and type(self.tr) == list:\n",
" if self.tables:\n",
" if self.tr: # don't save empty lines\n",
" self.tables[-1].append(self.tr)\n",
" self.tr = None\n",
" else:\n",
" print(\"tr outside a table: {}\".format(self.tr))\n",
" \n",
" if tag in ['td', 'th'] and self.entry:\n",
" if type(self.tr) == list:\n",
" self.tr.append(\"\\n\".join(self.entry))\n",
" self.entry = None\n",
" else:\n",
" print(\"td/th outside a tr: {}\".format(self.td))\n",
"\n",
" def handle_data(self, data):\n",
" if type(self.entry) is list:\n",
" self.entry.append(data.strip())\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def parse(raw):\n",
" parser = ISA_HTMLParser()\n",
" parser.feed(raw)\n",
" \n",
" if len(parser.data) == 0:\n",
" return []\n",
" \n",
" main = parser.data[0]\n",
" \n",
" if len(main[0]) != 2:\n",
" return []\n",
" \n",
" course, teacher = main[0]\n",
" course = course.strip()\n",
" \n",
" if 'Enseignant' in teacher:\n",
" teacher = teacher.split('Enseignant-e-(s):')[1]\n",
" if 'Assistant-e-(s):' in teacher:\n",
" teacher, assistant = teacher.split(\"Assistant-e-(s):\")\n",
" else:\n",
" assistant = \"\"\n",
" teacher = teacher.strip()\n",
" assistant = assistant.strip()\n",
" else:\n",
" teacher = teacher.strip()\n",
" assistant = \"\"\n",
"\n",
" students = []\n",
" \n",
" def fmt(x):\n",
" if len(x) == 3:\n",
" return x\n",
" if len(x) < 3:\n",
" return x + [\"\"] * (3 - len(x))\n",
" assert False, x\n",
" \n",
" def valid(x):\n",
" if type(x[0]) is list:\n",
" return False\n",
" return True\n",
"\n",
" i = 1\n",
" while i < len(main):\n",
" if len(main[i]) == 1:\n",
" label = main[i][0]\n",
" i += 1\n",
"\n",
" if 'ét' in main[i][0]:\n",
" i += 1\n",
"\n",
" students += [fmt(x) + [course, teacher, assistant, label] for x in main[i] if valid(x)]\n",
" i += 1\n",
" else:\n",
" i += 0\n",
" else:\n",
" i += 1 \n",
" \n",
" return students"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6a356ae908c0476aa1487c926ed7fdc2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=8), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fc21030f694f4182bdf6b309db8f5f56",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=2), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2012, 'autumn')\n",
"(2012, 'spring')\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f42821e79e57417abbd231f0eb789c4d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=2130), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cb92917606d44042a87ee50b891d4b13",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=2), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2013, 'autumn')\n",
"(2013, 'spring')\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "284df389d14845f69a19563bbfabf51b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=2165), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 376\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Python 2.7, use buffering of HTTP responses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 377\u001b[0;31m \u001b[0mhttplib_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 378\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Python 3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: getresponse() got an unexpected keyword argument 'buffering'",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-28-b0774ef2c36a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 49\u001b[0m ww_x_MATIERE[0])\n\u001b[1;32m 50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_mat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0mhtml\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'latin1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0mxs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 75\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 76\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 60\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 61\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 531\u001b[0m }\n\u001b[1;32m 532\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 533\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 534\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 646\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 647\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 648\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 450\u001b[0m )\n\u001b[1;32m 451\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 600\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 601\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Python 3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 379\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 380\u001b[0;31m \u001b[0mhttplib_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 381\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 382\u001b[0m \u001b[0;31m# Remove the TypeError from the exception chain in Python 3;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1319\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1320\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1321\u001b[0;31m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1322\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1323\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36mbegin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;31m# read until we get a non-100 response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 296\u001b[0;31m \u001b[0mversion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 297\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/http/client.py\u001b[0m in \u001b[0;36m_read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 257\u001b[0;31m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_MAXLINE\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"iso-8859-1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 258\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0m_MAXLINE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mLineTooLong\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"status line\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 589\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 590\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/urllib3/contrib/pyopenssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 296\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 297\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 298\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOpenSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSysCallError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msuppress_ragged_eofs\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Unexpected EOF'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/OpenSSL/SSL.py\u001b[0m in \u001b[0;36mrecv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1819\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL_peek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1820\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1821\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1822\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_ssl_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1823\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"import urllib.request, urllib.error, urllib.parse, requests\n",
"from tqdm import tqdm_notebook as tqdm\n",
"import csv\n",
"\n",
"url_base = [\n",
" \"https://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?\" + \n",
" \"&ww_b_list={}&ww_i_reportmodel={}&ww_c_langue={}&ww_i_reportModelXsl={}&\" + \n",
" \"ww_x_CLASSE={}&ww_x_PERIODE_ACAD={}&ww_x_HIVERETE={}&ww_x_MATIERE={}\", \n",
" \n",
" \"https://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?\" + \n",
" \"ww_x_MAT={}&ww_x_CLASSE={}&ww_i_reportmodel={}&ww_c_langue={}&\" + \n",
" \"ww_i_reportModelXsl={}&ww_x_PERIODE_ACAD={}&ww_x_HIVERETE={}&ww_x_MATIERE={}\"\n",
"]\n",
"\n",
"ww_b_list = [\"1\"]\n",
"ww_i_reportmodel = [\"66627699\"]\n",
"ww_x_MATIERE = [\"%2A\"] #MATIERE ID\n",
"ww_c_langue = [\"\"]\n",
"ww_i_reportModelXsl = [\"66627723\", \"66627727\"] #htlm, xls\n",
"ww_x_CLASSE = [\"null\"]\n",
"ww_x_PERIODE_ACAD = [\"123456101\", \"213637754\", \"213637922\", \"213638028\", \"355925344\", \"355925344\", \"1866893861\", \"1866894985\"]\n",
"ww_x_PERIODE_ACAD = list(zip(ww_x_PERIODE_ACAD, range(2012, 2090, 1)))\n",
"ww_x_HIVERETE = [(\"2936286\", \"autumn\"), (\"2936295\", \"spring\")]\n",
"ww_x_MATIERE = [\"*\"]\n",
"\n",
"#periods = dict()\n",
"\n",
"for year, year_name in tqdm(ww_x_PERIODE_ACAD):\n",
" for season, season_name in tqdm(ww_x_HIVERETE): \n",
" key = (year_name, season_name)\n",
" print(key)\n",
" if key in periods:\n",
" continue\n",
" \n",
" \n",
" url_list = url_base[0].format(ww_b_list[0], ww_i_reportmodel[0], ww_c_langue[0], ww_i_reportModelXsl[0], \n",
" ww_x_CLASSE[0], year, season, ww_x_MATIERE[0])\n",
" \n",
" response = urllib.request.urlopen(url_list)\n",
" webContent = str(response.read())\n",
" page_splits = webContent.split(\"ww_x_MAT=\")\n",
" \n",
" xs = []\n",
"\n",
" for k, split in enumerate(tqdm(page_splits)):\n",
" ww_x_MAT = [split[:split.index(\"\\\\\")]]\n",
" url_mat = url_base[1].format(ww_x_MAT[0], ww_x_CLASSE[0], ww_i_reportmodel[0], ww_c_langue[0],\n",
" ww_i_reportModelXsl[0], year, season, \n",
" ww_x_MATIERE[0])\n",
" \n",
" response = requests.get(url_mat)\n",
" html = str(response.content, 'latin1')\n",
" xs.append(html)\n",
"\n",
" periods[key] = xs\n",
" import pickle \n",
" with open('raw.pkl', 'wb') as f: \n",
" pickle.dump(periods, f)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys([(2012, 'autumn'), (2013, 'autumn')])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"periods.keys()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import pickle \n",
"with open('raw.pkl', 'rb') as f:\n",
" periods = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fe2379a7d5644502a4291a45519b3f65",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=3), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ae0125c4e8ef432da933f7c19c36986a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=2130), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bc19ccce541b4f8cbd22ae5005dcf3d7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=2165), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ccb65f630117411dbb17c05b420c5d65",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=2130), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for (year, season), xs in tqdm(periods.items()):\n",
" students = []\n",
" for html in tqdm(xs):\n",
" try:\n",
" students += parse(html)\n",
" except:\n",
" print(url_mat)\n",
" print(html)\n",
" display(HTML(html))\n",
" stop = True\n",
" break\n",
"\n",
" with open(\"data{}{}.csv\".format(year, season), 'w') as f:\n",
" writer = csv.writer(f)\n",
" for x in students:\n",
" writer.writerow(x)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment