Skip to content

Instantly share code, notes, and snippets.

@xrogaan
Created August 7, 2019 12:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xrogaan/f5e1449724c6f6de1e38c63c40c8da21 to your computer and use it in GitHub Desktop.
Save xrogaan/f5e1449724c6f6de1e38c63c40c8da21 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import os
import codecs
import csv
import re
from collections import namedtuple
from bs4 import BeautifulSoup
# Input directory: it is walked recursively and every .html file found is processed.
in_dir = 'html/'
# Output directory: the generated .csv files are written here.
out_dir = 'csv/'
#######################
# ~ HERE BE DRAGONS ~ #
#######################
# Bound ``match`` method of a compiled pattern. Group 1 captures a phrase of
# the form "Week Ending <day><st|nd|rd|th> <3-letter month> <4-digit year>",
# e.g. "Week Ending 3rd Aug 2019".
reMatchTitle = re.compile(r'.*(Week Ending [0-9]{1,2}(?:st|nd|rd|th)\s[a-zA-Z]{3}\s[0-9]{4}).*').match
# Result of parsing one HTML file: target CSV filename, header cells, data rows.
CsvData = namedtuple('CsvData', 'filename headers data')
def parse_html(htmlfile):
    """Extract the chart table of one saved HTML page as CSV-ready data.

    The file is read as ISO-8859-1 and parsed with BeautifulSoup's
    ``html.parser``. The page ``<title>`` must contain a "Week Ending ..."
    phrase accepted by ``reMatchTitle``; that phrase becomes part of the
    output filename.

    Args:
        htmlfile: Path of the HTML file to parse.

    Returns:
        A ``CsvData`` tuple holding the CSV filename (without directory),
        the text of every ``<th>`` inside the chart table, and one list of
        stripped cell texts per retained ``<tr>``. A row's list is empty
        when the row has no direct ``<td>`` children.

    Raises:
        SystemExit: With status 1, after printing a warning, when the title
            does not match or the page has no ``<table class="chart">``.
    """
    # newline='' leaves line endings untranslated, exactly as codecs.open did.
    with open(htmlfile, 'r', encoding='iso-8859-1', newline='') as hfile:
        soup = BeautifulSoup(hfile.read(), 'html.parser')

    # A page without <title> is handled like a title that does not match.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else ''
    tm = reMatchTitle(title)
    if not tm:
        print(f"WARNING: regex couldn't match string: {title}")
        # Built-in SystemExit instead of the site-provided exit(); non-zero
        # status because the run is being aborted.
        raise SystemExit(1)

    # assuming the filename is 12345.COUNTRY.html
    # we want 12345_COUNTRY_{title}.csv
    # Built from the stem directly (no str.format) so that braces or an
    # extra "html" inside the filename cannot break the result.
    stem = os.path.splitext(os.path.basename(htmlfile))[0]
    week_label = '_'.join(tm.group(1).split())
    csv_fn = f"{stem.replace('.', '_')}_{week_label}.csv"

    table = soup.find('table', class_='chart')
    if table is None:
        print(f"WARNING: no table with class 'chart' found in: {htmlfile}")
        raise SystemExit(1)

    headers = [header.text for header in table.find_all('th')]
    rows = []
    for row in table.find_all('tr'):
        # Ignore embedded tables tr entries (rows with two <table> ancestors)
        if len(row.find_parents("table")) == 2:
            continue
        rows.append([col.text.strip() for col in row.find_all('td', recursive=False)])
    return CsvData(csv_fn, headers, rows)
def write_csv(csv_data):
    """Write one parsed table into ``out_dir`` as a CSV file.

    The header row is written first, followed by every non-empty row of
    ``csv_data.data``; empty rows are skipped.

    Args:
        csv_data: Object exposing ``filename``, ``headers`` and ``data``
            attributes (such as a ``CsvData`` tuple).
    """
    fn = os.path.abspath(os.path.join(out_dir, csv_data.filename))
    # Create the output directory on demand so a first run does not fail.
    os.makedirs(os.path.dirname(fn), exist_ok=True)
    # newline='' lets the csv writer control line endings itself; without it
    # platforms that translate '\n' would emit '\r\r\n'.
    with open(fn, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_data.headers)
        writer.writerows(row for row in csv_data.data if row)
if __name__ == '__main__':
    # Walk in_dir recursively and convert every *.html file that is found.
    for root, _dirs, files in os.walk(os.path.abspath(in_dir)):
        print(root)
        for file in files:
            if os.path.splitext(file)[1] != '.html':
                continue
            # flush=True so the progress text appears before the work starts,
            # not only once the line is completed by "DONE".
            print(f"Processing {file}...", end='', flush=True)
            write_csv(parse_html(os.path.join(root, file)))
            print("\t\tDONE")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment