Last active
February 9, 2018 08:26
-
-
Save luxiao/fb754ece73bb5e6a95a939cc79b1b6c9 to your computer and use it in GitHub Desktop.
format html table with rowspan or colspan
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
''' | |
Format an HTML table that uses rowspan or colspan into a two-dimensional list ([[...]]) in which every tr has the same number of td cells.
html表格格式化,有合并单元格的表格拆分出来,组成td元素个数一致的规整表格。 | |
''' | |
def _span(td, name):
    """Return the integer value of span attribute *name* on *td*.

    A missing, non-numeric or zero value counts as 1, i.e. the cell is
    treated as an ordinary, unmerged cell.
    """
    value = td.attrs.get(name, "")
    return max(int(value), 1) if value.isdigit() else 1


def _expand_spans(rows):
    """Expand merged cells into a regular two-dimensional list.

    *rows* is a list of table rows, each a list of
    ``(text, rowspan, colspan)`` tuples in document order.  The result
    is a list of rows of plain text in which every merged cell has been
    copied into each grid position it covers.
    """
    grid = []
    # (row index, column index) -> text carried down by a rowspan that
    # started in an earlier row.
    pending = {}
    for row_num, cells in enumerate(rows):
        out = []
        col = 0
        for text, rowspan, colspan in cells:
            # Columns already occupied by a cell spanning down from
            # above come before this cell in the real grid.
            while (row_num, col) in pending:
                out.append(pending.pop((row_num, col)))
                col += 1
            for _ in range(colspan):
                out.append(text)
                # Reserve the same column in the rows below.  Rows past
                # the end of the table are simply never visited, so an
                # overlong rowspan cannot raise IndexError.
                for below in range(1, rowspan):
                    pending[(row_num + below, col)] = text
                col += 1
        # Spanned cells that sit to the right of the row's last own cell.
        while (row_num, col) in pending:
            out.append(pending.pop((row_num, col)))
            col += 1
        grid.append(out)
    return grid


def test_rowspan():
    """Print a sample HTML table before and after expanding merged cells.

    The sample markup is parsed once with BeautifulSoup.  The raw cell
    texts are printed first, then the table in which every cell with a
    ``rowspan`` or ``colspan`` has been duplicated into all the grid
    positions it covers.  Nothing is returned.
    """
    html_data_row = '''<table border="1">
<tr> <td>Montd</td> <td>Savings</td> <td>Savings for holiday!</td> </tr>
<tr> <td>January</td> <td>$100</td> <td rowspan="2">$50</td> </tr>
<tr> <td>February</td> <td>$80</td> </tr> </table>'''
    html_data_col = '''
<table width="100%" border="1">
<tr>
<td>Montd</td>
<td>Savings</td>
</tr>
<tr>
<td colspan="2">January</td>
</tr>
<tr>
<td colspan="2">February</td>
</tr>
</table>
'''
    # Assign html_data_row here instead to exercise the rowspan sample.
    html_data = html_data_col

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html_data, "html.parser")
    rows = [row("td") for row in soup("tr")]

    table_data = [[cell.text for cell in row] for row in rows]
    print(table_data)

    # Inserting copies at a cell's index within its own <tr> misplaces
    # them as soon as an earlier cell in the row was widened, so the
    # expansion tracks real grid columns instead.
    table_data = _expand_spans(
        [[(td.text, _span(td, 'rowspan'), _span(td, 'colspan')) for td in row]
         for row in rows]
    )
    print(table_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment