Skip to content

Instantly share code, notes, and snippets.

@mkramb
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mkramb/8795572 to your computer and use it in GitHub Desktop.
Save mkramb/8795572 to your computer and use it in GitHub Desktop.
<html><head>
<title>Zimski semester 2013/14 (Oddelek za matematiko FMF) (verzija 9. 12. 2013)</title>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-2">
</head>
<body bgcolor="Silver">
<font color="Red">
<center><h3>Zimski semester 2013/14 (Oddelek za matematiko FMF) (verzija 9. 12. 2013)</h3></center><p></p>
<font color="Black">
<center><h3>Predavatelj: Plestenjak</h3></center><p></p>
<table border="" bgcolor="Silver" bordercolor="Black" cols="6">
<tbody><tr> <td> </td> <td bgcolor="AQUA" colspan="1"><h3>PONEDELJEK</h3></td> <td bgcolor="AQUA" colspan="1"><h3>TOREK</h3></td> <td bgcolor="AQUA" colspan="1"><h3>SREDA</h3></td> <td bgcolor="AQUA" colspan="1"><h3>ČETRTEK</h3></td> <td bgcolor="AQUA" colspan="1"><h3>PETEK</h3></td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>7-8</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>8-9</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>9-10</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>10-11</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="2"><small><b>NA SEM</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 3.06</small></td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="2"><small><b>NM 1 (F)</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 2.04</small></td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>11-12</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>12-13</td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="2"><small><b>INMLA+IPN</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 3.07</small></td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="2"><small><b>UNM</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 2.04</small></td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="3"><small><b>SEJA (PLESTENJAK)</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> (2.02)</small></td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td nowrap="" align="CENTER" bgcolor="#00FFFF" colspan="1" rowspan="1"><small><b>UNM</b><br><i><a href="http://www.fmf.uni-lj.si/si/imenik/3357/">Plestenjak</a></i> 2.05</small></td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>13-14</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>14-15</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>15-16</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>16-17</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>17-18</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>18-19</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
<tr><td bgcolor="AQUA" align="CENTER"><br>19-20</td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> <td bgcolor="Silver" colspan="1"><pre> </pre> </td> </tr>
</tbody></table>
<p></p><hr><h4>Bor Plestenjak, IMFM 1997</h4><p></p>
<h4><a href="mailto:grega.cigler@fmf.uni-lj.si">Komentarji</a></h4>
</font></font></body></html>
from lxml.html import parse
from collections import defaultdict
from pprint import pprint
def table_to_list(table):
    """Return the table as a list of rows, each row a list of cell values.

    The table is first expanded into a row/column grid by
    ``table_to_2d_dict`` and then flattened in index order by
    ``iter_2d_dict``.
    """
    return list(iter_2d_dict(table_to_2d_dict(table)))
def table_to_2d_dict(table):
    """Expand an HTML table into a ``{row_index: {col_index: text}}`` grid.

    Cells are placed left to right. A cell carrying ``rowspan`` and/or
    ``colspan`` is copied into every grid position it covers, so later rows
    see those positions as already occupied and shift their own cells right.

    Args:
        table: An element offering ``xpath()`` whose rows/cells offer
            ``xpath()``, ``get()`` and ``text_content()`` (as lxml HTML
            elements do).

    Returns:
        A ``defaultdict`` of ``defaultdict`` keyed by zero-based row index
        (counted after the skipped header row) and then by column index.
        Each stored value is the cell's stripped text, or ``None`` when the
        cell has no text.

    Raises:
        ValueError: If a ``colspan`` or ``rowspan`` attribute is not an
            integer literal.
    """
    # ``unicode`` only exists on Python 2. ``type(u'')`` resolves to the
    # native text type on both Python 2 and Python 3, so building the grid
    # no longer raises NameError on Python 3 while the default value for a
    # missing cell stays an empty text string.
    text_type = type(u'')
    result = defaultdict(lambda: defaultdict(text_type))

    # ``position()>1`` drops the first <tr> of each parent element, i.e. the
    # header row of a simple table.
    for row_i, row in enumerate(table.xpath('.//tr[position()>1]')):
        for col_i, col in enumerate(row.xpath('.//td | .//th')):
            colspan = int(col.get('colspan', 1))
            rowspan = int(col.get('rowspan', 1))
            col_data = col.text_content().strip() or None

            # Move right past positions already claimed, either by a cell
            # spanning down from an earlier row or by an earlier cell of
            # this row. The membership test on ``result`` comes first so the
            # lookup does not create an empty row as a side effect.
            while row_i in result and col_i in result[row_i]:
                col_i += 1

            # Stamp the value into every position the cell spans.
            for i in range(row_i, row_i + rowspan):
                for j in range(col_i, col_i + colspan):
                    result[i][j] = col_data
    return result
def iter_2d_dict(dct):
    """Yield each row of a two-level mapping as a list of its values.

    Rows are visited in ascending order of the outer key, and within a row
    the values are emitted in ascending order of the inner key.
    """
    for row_key in sorted(dct):
        cells = dct[row_key]
        yield [cells[col_key] for col_key in sorted(cells)]
if __name__ == '__main__':
    # Parse the sample page and pretty-print every table it contains as a
    # list of rows.
    tables = parse('./test.html').xpath('//table')
    for tbl in tables:
        rows = table_to_list(tbl)
        pprint(rows)
"""
OUTPUT:
[['7-8', None, None, None, None, None],
['8-9', None, None, None, None, None],
['9-10', None, None, None, None, None],
['10-11',
None,
None,
'NA SEMPlestenjak 3.06',
None,
'NM 1 (F)Plestenjak 2.04'],
['11-12',
None,
None,
'NA SEMPlestenjak 3.06',
None,
'NM 1 (F)Plestenjak 2.04'],
['12-13',
'INMLA+IPNPlestenjak 3.07',
'UNMPlestenjak 2.04',
'SEJA (PLESTENJAK)Plestenjak (2.02)',
None,
'UNMPlestenjak 2.05'],
['13-14',
'INMLA+IPNPlestenjak 3.07',
'UNMPlestenjak 2.04',
'SEJA (PLESTENJAK)Plestenjak (2.02)',
None,
None],
['14-15', None, None, 'SEJA (PLESTENJAK)Plestenjak (2.02)', None, None],
['15-16', None, None, None, None, None],
['16-17', None, None, None, None, None],
['17-18', None, None, None, None, None],
['18-19', None, None, None, None, None],
['19-20', None, None, None, None, None]]
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment