Last active
August 29, 2015 13:56
-
-
Save mkramb/8795572 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html><head> | |
<title>Zimski semester 2013/14 (Oddelek za matematiko FMF) (verzija 9. 12. 2013)</title> | |
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-2"> | |
</head> | |
<body bgcolor="Silver"> | |
<font color="Red"> | |
<center><h3>Zimski semester 2013/14 (Oddelek za matematiko FMF) (verzija 9. 12. 2013)</h3></center><p></p> | |
<font color="Black"> | |
<center><h3>Predavatelj: Plestenjak</h3></center><p></p> | |
<table border="" bgcolor="Silver" bordercolor="Black" cols="6"> | |
<tbody><tr> <td> </td> <td bgcolor="AQUA" colspan="1"><h3>PONEDELJEK</h3></td> <td bgcolor="AQUA" colspan="1"><h3>TOREK</h3></td> <td bgcolor="AQUA" colspan="1"><h3>SREDA</h3></td> <td bgcolor="AQUA" colspan="1"><h3>ČETRTEK</h3></td> <td bgcolor="AQUA" colspan="1"><h3>PETEK</h3></td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>7-8</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>8-9</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>9-10</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>10-11</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="2"><small><b>NA SEM</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 3.06</small></td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="2"><small><b>NM 1 (F)</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 2.04</small></td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>11-12</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>12-13</td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="2"><small><b>INMLA+IPN</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 3.07</small></td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="2"><small><b>UNM</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 2.04</small></td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="3"><small><b>SEJA (PLESTENJAK)</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> (2.02)</small></td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="1"><small><b>UNM</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 2.05</small></td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>13-14</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>14-15</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>15-16</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>16-17</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>17-18</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>18-19</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
<tr><td bgcolor="AQUA" align="CENTER"><br>19-20</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr> | |
</tbody></table> | |
<p></p><hr><h4>Bor Plestenjak, IMFM 1997</h4><p></p> | |
<h4><a href="mailto:grega.cigler@fmf.uni-lj.si">Komentarji</a></h4> | |
</font></font></body></html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml.html import parse | |
from collections import defaultdict | |
from pprint import pprint | |
def table_to_list(table):
    """Return the table as a list of rows, each row being a list of cells.

    The table element is first expanded into a row/column grid by
    ``table_to_2d_dict`` and the grid is then flattened, in index order,
    by ``iter_2d_dict``.
    """
    grid = table_to_2d_dict(table)
    return [cells for cells in iter_2d_dict(grid)]
def _span_attr(cell, name):
    """Return a cell's ``colspan``/``rowspan`` attribute as a positive int.

    Missing, non-numeric or non-positive values fall back to 1 so that a
    malformed attribute neither aborts the parse nor drops the cell.
    """
    try:
        span = int(cell.get(name, 1))
    except (TypeError, ValueError):
        return 1
    return span if span > 0 else 1


def table_to_2d_dict(table):
    """Expand a table element into a grid keyed by row and column index.

    Rows are taken from the XPath ``.//tr[position()>1]`` (so the leading
    header row is skipped) and cells from ``.//td | .//th``.  A cell with
    ``colspan``/``rowspan`` greater than one is copied into every grid
    position it covers.

    Args:
        table: an element exposing ``xpath()``; its row elements must expose
            ``xpath()`` and its cell elements ``get()`` and
            ``text_content()`` (as lxml elements do).

    Returns:
        A ``defaultdict`` mapping ``row index -> column index -> value``,
        where the value is the cell's stripped text, or ``None`` when the
        cell has no text.
    """
    # ``unicode`` only exists on Python 2; ``type(u'')`` is the text type on
    # both Python 2 and Python 3, so the default factory works everywhere.
    text_type = type(u'')
    result = defaultdict(lambda: defaultdict(text_type))
    for row_i, row in enumerate(table.xpath('.//tr[position()>1]')):
        for col_i, col in enumerate(row.xpath('.//td | .//th')):
            colspan = _span_attr(col, 'colspan')
            rowspan = _span_attr(col, 'rowspan')
            col_data = col.text_content().strip() or None
            # Positions already filled by an earlier cell of this row, or by
            # a rowspan from a previous row, are skipped.  The membership
            # test on ``result`` avoids creating an empty row as a side
            # effect of the lookup.
            while row_i in result and col_i in result[row_i]:
                col_i += 1
            for i in range(row_i, row_i + rowspan):
                for j in range(col_i, col_i + colspan):
                    result[i][j] = col_data
    return result
def iter_2d_dict(dct):
    """Yield the rows of a two-level mapping as lists of values.

    Rows are produced in ascending order of the outer key; within each row
    the values are ordered by ascending inner key.  The keys themselves are
    not included in the output.
    """
    for row_key in sorted(dct):
        row = dct[row_key]
        yield [row[col_key] for col_key in sorted(row)]
if __name__ == '__main__':
    # Parse the sample page and pretty-print every table it contains as a
    # list of rows.
    document = parse('./test.html')
    tables = document.xpath('//table')
    for tbl in tables:
        rows = table_to_list(tbl)
        pprint(rows)
""" | |
OUTPUT: | |
[['7-8', None, None, None, None, None], | |
['8-9', None, None, None, None, None], | |
['9-10', None, None, None, None, None], | |
['10-11', | |
None, | |
None, | |
'NA SEMPlestenjak 3.06', | |
None, | |
'NM 1 (F)Plestenjak 2.04'], | |
['11-12', | |
None, | |
None, | |
'NA SEMPlestenjak 3.06', | |
None, | |
'NM 1 (F)Plestenjak 2.04'], | |
['12-13', | |
'INMLA+IPNPlestenjak 3.07', | |
'UNMPlestenjak 2.04', | |
'SEJA (PLESTENJAK)Plestenjak (2.02)', | |
None, | |
'UNMPlestenjak 2.05'], | |
['13-14', | |
'INMLA+IPNPlestenjak 3.07', | |
'UNMPlestenjak 2.04', | |
'SEJA (PLESTENJAK)Plestenjak (2.02)', | |
None, | |
None], | |
['14-15', None, None, 'SEJA (PLESTENJAK)Plestenjak (2.02)', None, None], | |
['15-16', None, None, None, None, None], | |
['16-17', None, None, None, None, None], | |
['17-18', None, None, None, None, None], | |
['18-19', None, None, None, None, None], | |
['19-20', None, None, None, None, None]] | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment