xrogaan/main.py

## main.py
#!/usr/bin/env python
import os
import codecs
import csv
import re
from collections import namedtuple
from bs4 import BeautifulSoup


# Directory that is walked recursively to collect every .html file.
in_dir = 'html/'
# Directory that receives the generated .csv files.
out_dir = 'csv/'

# ---------------------------------------------------------------------------
# Internals: nothing below this line is meant to be configured.
# ---------------------------------------------------------------------------
# Captures the "Week Ending <day><suffix> <Mon> <year>" phrase of a page title.
_title_regex = re.compile(
    r'.*(Week Ending [0-9]{1,2}(?:st|nd|rd|th)\s[a-zA-Z]{3}\s[0-9]{4}).*'
)
# Bound ``match`` method: call it with a title, read the phrase from group(1).
reMatchTitle = _title_regex.match
# Result of parsing one HTML file: target filename, header cells, data rows.
CsvData = namedtuple('CsvData', ['filename', 'headers', 'data'])


def parse_html(htmlfile):
    """Extract the chart table of one HTML page as CSV-ready data.

    The file is decoded as ISO-8859-1 and parsed with BeautifulSoup.  The
    "Week Ending ..." phrase matched in the page ``<title>`` becomes part of
    the output filename, so ``12345.COUNTRY.html`` yields
    ``12345_COUNTRY_Week_Ending_1st_Jan_2020.csv``.

    Args:
        htmlfile: Path of the HTML file to read.

    Returns:
        CsvData: the CSV filename, the text of every ``<th>`` found inside
        the ``table`` element of class ``chart``, and one (possibly empty)
        list of stripped ``<td>`` texts per row owned by that table.

    Raises:
        SystemExit: with status 1, after printing a warning, when the title
            does not match ``reMatchTitle`` or no chart table is present.
    """
    # newline='' disables newline translation so the decoded text is kept
    # exactly as stored in the file.
    with open(htmlfile, 'r', encoding='iso-8859-1', newline='') as hfile:
        html = hfile.read()

    soup = BeautifulSoup(html, 'html.parser')
    title_tag = soup.find('title')
    # A page without <title> is handled like a title that does not match.
    title = title_tag.text if title_tag is not None else ''
    tm = reMatchTitle(title)
    if not tm:
        print(f"WARNING: regex couldn't match string: {title}")
        # ``exit()`` is only injected by the ``site`` module and reports
        # success; raise SystemExit directly with a failure status instead.
        raise SystemExit(1)

    # Drop only the final extension; a blanket replace of 'html' would also
    # rewrite that substring anywhere else in the name, and braces in the
    # name would break str.format.
    stem = os.path.splitext(os.path.basename(htmlfile))[0]
    csv_fn = '{}_{}.csv'.format(
        stem.replace('.', '_'),
        tm.group(1).replace(' ', '_'),
    )

    table = soup.find('table', class_='chart')
    if table is None:
        print(f"WARNING: no chart table found in: {htmlfile}")
        raise SystemExit(1)
    headers = [header.text for header in table.find_all('th')]

    rows = []
    for row in table.find_all('tr'):
        # Ignore rows that belong to a table embedded in the chart table,
        # however deeply it is nested.
        if row.find_parent('table') is not table:
            continue
        rows.append([col.text.strip() for col in row.find_all('td', recursive=False)])

    return CsvData(csv_fn, headers, rows)


def write_csv(csv_data):
    """Write one parsed chart into ``out_dir`` as a CSV file.

    The header row is written first, followed by every non-empty data row.

    Args:
        csv_data: Object exposing ``filename``, ``headers`` and ``data``
            (an iterable of row lists), such as a ``CsvData`` tuple.
    """
    fn = os.path.abspath(os.path.join(out_dir, csv_data.filename))
    # Create the output directory on demand instead of failing on first use.
    os.makedirs(os.path.dirname(fn), exist_ok=True)
    # The csv module emits its own line terminators; newline='' stops the
    # text layer from translating them again (blank lines on Windows).
    with open(fn, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_data.headers)
        # Rows without any <td> (e.g. header-only rows) arrive as empty lists.
        writer.writerows(row for row in csv_data.data if row)


if __name__ == '__main__':
    # Walk the whole input tree and convert every file ending in .html.
    for folder, _subdirs, names in os.walk(os.path.abspath(in_dir)):
        print(folder)
        for name in names:
            _, extension = os.path.splitext(name)
            if extension != '.html':
                continue
            print(f"Processing {name}...", end='')
            parsed = parse_html(os.path.join(folder, name))
            write_csv(parsed)
            print("\t\tDONE")
	#!/usr/bin/env python
	import os
	import codecs
	import csv
	import re
	from collections import namedtuple
	from bs4 import BeautifulSoup


	# Directory that is walked recursively to collect every .html file.
	in_dir = 'html/'
	# Directory that receives the generated .csv files.
	out_dir = 'csv/'

	#######################
	# ~ HERE BE DRAGONS ~ #
	#######################
	# Captures the "Week Ending <day><suffix> <Mon> <year>" phrase of a title.
	# The pattern needs '.*' on both sides and unescaped '|' alternation; with
	# a bare '.' and '\|' it could never match a real page title.
	reMatchTitle = re.compile(r'.*(Week Ending [0-9]{1,2}(?:st|nd|rd|th)\s[a-zA-Z]{3}\s[0-9]{4}).*').match
	# Result of parsing one HTML file: target filename, header cells, data rows.
	CsvData = namedtuple('CsvData', 'filename headers data')


	def parse_html(htmlfile):
	    """Extract the chart table of one HTML page as CSV-ready data.

	    The file is decoded as ISO-8859-1 and parsed with BeautifulSoup.  The
	    "Week Ending ..." phrase matched in the page ``<title>`` becomes part
	    of the output filename, so ``12345.COUNTRY.html`` yields
	    ``12345_COUNTRY_Week_Ending_1st_Jan_2020.csv``.

	    Args:
	        htmlfile: Path of the HTML file to read.

	    Returns:
	        CsvData: the CSV filename, the text of every ``<th>`` found inside
	        the ``table`` element of class ``chart``, and one (possibly empty)
	        list of stripped ``<td>`` texts per row owned by that table.

	    Raises:
	        SystemExit: with status 1, after printing a warning, when the
	            title does not match ``reMatchTitle`` or no chart table is
	            present.
	    """
	    # newline='' disables newline translation so the decoded text is kept
	    # exactly as stored in the file.
	    with open(htmlfile, 'r', encoding='iso-8859-1', newline='') as hfile:
	        html = hfile.read()

	    soup = BeautifulSoup(html, 'html.parser')
	    title_tag = soup.find('title')
	    # A page without <title> is handled like a title that does not match.
	    title = title_tag.text if title_tag is not None else ''
	    tm = reMatchTitle(title)
	    if not tm:
	        print(f"WARNING: regex couldn't match string: {title}")
	        # ``exit()`` is only injected by the ``site`` module and reports
	        # success; raise SystemExit directly with a failure status.
	        raise SystemExit(1)

	    # Drop only the final extension; a blanket replace of 'html' would
	    # also rewrite that substring anywhere else in the name, and braces
	    # in the name would break str.format.
	    stem = os.path.splitext(os.path.basename(htmlfile))[0]
	    csv_fn = '{}_{}.csv'.format(
	        stem.replace('.', '_'),
	        tm.group(1).replace(' ', '_'),
	    )

	    table = soup.find('table', class_='chart')
	    if table is None:
	        print(f"WARNING: no chart table found in: {htmlfile}")
	        raise SystemExit(1)
	    headers = [header.text for header in table.find_all('th')]

	    rows = []
	    for row in table.find_all('tr'):
	        # Ignore rows that belong to a table embedded in the chart table,
	        # however deeply it is nested.
	        if row.find_parent('table') is not table:
	            continue
	        rows.append([col.text.strip() for col in row.find_all('td', recursive=False)])

	    return CsvData(csv_fn, headers, rows)


	def write_csv(csv_data):
	    """Write one parsed chart into ``out_dir`` as a CSV file.

	    The header row is written first, followed by every non-empty data row.

	    Args:
	        csv_data: Object exposing ``filename``, ``headers`` and ``data``
	            (an iterable of row lists), such as a ``CsvData`` tuple.
	    """
	    fn = os.path.abspath(os.path.join(out_dir, csv_data.filename))
	    # Create the output directory on demand instead of failing on first use.
	    os.makedirs(os.path.dirname(fn), exist_ok=True)
	    # The csv module emits its own line terminators; newline='' stops the
	    # text layer from translating them again (blank lines on Windows).
	    with open(fn, 'w', newline='') as f:
	        writer = csv.writer(f)
	        writer.writerow(csv_data.headers)
	        # Rows without any <td> (e.g. header-only rows) are empty lists.
	        writer.writerows(row for row in csv_data.data if row)


	if __name__ == '__main__':
	    # Walk the whole input tree and convert every file ending in .html.
	    for root, dirs, files in os.walk(os.path.abspath(in_dir)):
	        print(root)
	        for file in files:
	            if os.path.splitext(file)[1] == '.html':
	                print(f"Processing {file}...", end='')
	                write_csv(parse_html(os.path.join(root, file)))
	                print("\t\tDONE")