Skip to content

Instantly share code, notes, and snippets.

@luxiao
Last active February 9, 2018 08:26
Show Gist options
  • Save luxiao/fb754ece73bb5e6a95a939cc79b1b6c9 to your computer and use it in GitHub Desktop.
Save luxiao/fb754ece73bb5e6a95a939cc79b1b6c9 to your computer and use it in GitHub Desktop.
format html table with rowspan or colspan
from bs4 import BeautifulSoup
'''
Format an HTML table that uses rowspan or colspan into a two-dimensional list ([[...]]) in which every tr has the same number of td cells.
html表格格式化,有合并单元格的表格拆分出来,组成td元素个数一致的规整表格。
'''
def _span_of(cell, attr):
    """Return the integer span stored in attribute *attr* of *cell*.

    A missing, non-numeric or zero value counts as 1, i.e. the cell
    occupies a single row/column in that direction.
    """
    raw = cell.attrs.get(attr, '')
    if raw.isdigit():
        return max(int(raw), 1)
    return 1


def _expand_spans(rows):
    """Lay spanned cells out on a rectangular grid.

    rows is a list of table rows; each row is a list of
    (text, rowspan, colspan) tuples in document order.

    Returns a list of lists of text with one inner list per input row,
    all of the same length.  The text of a spanning cell is repeated in
    every grid position it covers; positions no cell covers hold ''.
    """
    occupied = {}  # (row, column) -> text of the cell covering that position
    row_count = len(rows)
    for row_num, cells in enumerate(rows):
        col = 0
        for text, rowspan, colspan in cells:
            # Columns already claimed by a rowspan from an earlier row push
            # this cell to the right.
            while (row_num, col) in occupied:
                col += 1
            # Clip the rowspan at the last row so an oversized value cannot
            # address rows that do not exist.
            last_row = min(row_num + rowspan, row_count)
            for r in range(row_num, last_row):
                for c in range(col, col + colspan):
                    # Positions are tracked on the grid, not by list index,
                    # so expanding one cell never shifts its neighbours.
                    occupied.setdefault((r, c), text)
            col += colspan
    width = max(c for _, c in occupied) + 1 if occupied else 0
    return [[occupied.get((r, c), '') for c in range(width)]
            for r in range(row_count)]


def test_rowspan():
    """Demonstrate flattening an HTML table that has merged cells.

    Parses a sample table, prints the raw per-row cell texts, then prints
    the same table with every rowspan/colspan cell repeated so that each
    row has the same number of entries.
    """
    # Alternative fixture exercising rowspan; assign it to html_data below
    # to try it.
    html_data_row = '''<table border="1">
<tr> <td>Montd</td> <td>Savings</td> <td>Savings for holiday!</td> </tr>
<tr> <td>January</td> <td>$100</td> <td rowspan="2">$50</td> </tr>
<tr> <td>February</td> <td>$80</td> </tr> </table>'''
    html_data_col = '''
<table width="100%" border="1">
<tr>
<td>Montd</td>
<td>Savings</td>
</tr>
<tr>
<td colspan="2">January</td>
</tr>
<tr>
<td colspan="2">February</td>
</tr>
</table>
'''
    html_data = html_data_col

    # Parse once, naming the parser explicitly so the result does not depend
    # on which optional parsers happen to be installed.
    soup = BeautifulSoup(html_data, "html.parser")
    rows = soup("tr")

    # Raw view: one entry per <td>, spans not yet expanded.
    table_data = [[cell.text for cell in row("td")] for row in rows]
    print(table_data)

    spans = [[(td.text, _span_of(td, 'rowspan'), _span_of(td, 'colspan'))
              for td in row("td")]
             for row in rows]
    table_data = _expand_spans(spans)
    print(table_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment