Skip to content

Instantly share code, notes, and snippets.

@mila
Last active March 29, 2018 10:50
Show Gist options
  • Save mila/4fb5bcaaa6a1486e277447f91bf7c9cd to your computer and use it in GitHub Desktop.
Save mila/4fb5bcaaa6a1486e277447f91bf7c9cd to your computer and use it in GitHub Desktop.
Extracts CSV from the largest table in an HTML page.
#!/usr/bin/env python3
import argparse
import csv
import bs4
def read_stream(stream, features=None):
    """Parse HTML from *stream* and return the rows of its largest table.

    Args:
        stream: Markup passed straight to ``bs4.BeautifulSoup``.
        features: Forwarded to ``bs4.BeautifulSoup`` as its ``features``
            keyword; defaults to ``None``.

    Returns:
        A list of rows, each a list of cell texts. The list is empty when
        the document contains no ``table`` element.
    """
    soup = bs4.BeautifulSoup(stream, features=features)
    table = find_largest_table(soup)
    if table is None:
        # find_largest_table() returns None for a page without tables;
        # report "no rows" instead of failing with AttributeError inside
        # read_table().
        return []
    return read_table(table)
def read_table(table):
    """Return the cell texts of every ``tr`` found under *table*.

    Each ``tr`` is converted with ``read_row``; the result is a list with
    one entry per row, in the order ``find_all`` reports them.
    """
    extracted = []
    for tr in table.find_all('tr'):
        extracted.append(read_row(tr))
    return extracted
def read_row(row):
    """Return the text of each ``th`` or ``td`` cell found under *row*.

    The text comes from each cell's ``get_text()`` and is returned as-is,
    one list entry per cell.
    """
    texts = []
    for cell in row.find_all(['th', 'td']):
        texts.append(cell.get_text())
    return texts
def find_largest_table(soup):
    """Return the ``table`` in *soup* whose string form is the longest.

    Size is measured as ``len(str(table))``. When several tables share the
    maximum size, the last one found wins. Returns ``None`` when *soup*
    contains no tables.
    """
    candidates = list(soup.find_all('table'))
    # max() keeps the first maximal item it sees, so scan back to front to
    # let the last of several equally large tables win.
    return max(reversed(candidates), key=lambda t: len(str(t)), default=None)
def main():
    """Command-line entry point: write the largest HTML table out as CSV.

    Reads the file named by the ``source`` argument, extracts table rows
    with ``read_stream`` using the parser given by ``--bs4-parser``, and
    writes those rows as CSV to the file named by ``target``.
    """
    parser = argparse.ArgumentParser(
        description='Extracts CSV from the largest table in a HTML page.',
    )
    parser.add_argument(
        # Default is the NAME of whatever builder the registry lookup
        # returns when called without arguments.
        '--bs4-parser', default=bs4.builder_registry.lookup().NAME,
        help='Beautiful Soup parser.',
    )
    parser.add_argument('source')
    parser.add_argument('target')
    args = parser.parse_args()

    with open(args.source) as input_stream:
        data = read_stream(input_stream, args.bs4_parser)

    # csv.writer emits its own line terminators, so the file must be opened
    # with newline=''; otherwise platforms that translate '\n' on write
    # would produce '\r\r\n' row endings.
    with open(args.target, 'w', newline='') as output_stream:
        csv.writer(output_stream).writerows(data)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment