# kizernis/anuga_csv.py

## anuga_csv.py
# Extract data from the anuga.com downloaded pages and create a sophisticated CSV table

from bs4 import BeautifulSoup
import csv
from tqdm import trange
import re

# Inclusive range of page numbers to process; page N is read from
# html_pages/NNNN.html (number zero-padded to four digits).
page_first = 1
page_last = 7789

# A line break together with all whitespace that directly follows it.
# Compiled once because it is applied to several fields of every page.
_LINE_BREAK_RUN = re.compile(r'\n[\s\n]*')

# CSV columns filled, in order, from the 'texts grey' blocks that come after
# the first one (the first block is written to the 'Address' column).
_CONTACT_FIELDS = ('Contact 1', 'Contact 2', 'Email', 'Website')

# (caption looked up among the page's <b> texts, CSV column it fills).
# The order matters: the matching 'ultree' lists are consumed in this order.
_OPTIONAL_SECTIONS = (
    ('Product sector', 'Sector'),
    ('Distribution Channel', 'Channel'),
    ('Target and sales markets', 'Target Market'),
)


def _join_lines(soup_node, separator):
    """Return the stripped text of *soup_node* with every line-break run replaced by *separator*."""
    return _LINE_BREAK_RUN.sub(separator, soup_node.get_text().strip())


def _read_product_tree(soup_products):
    """Collect the category / sub-category / product nesting of a product list.

    Returns a dict mapping each category name to a dict that maps each
    sub-category name to the list of its product names.  Names are the first
    non-blank string found inside the corresponding <li>.

    NOTE(review): a category or sub-category name that occurs twice replaces
    the earlier entry, exactly as the dict-based collection always did here.
    """
    tree = {}
    for soup_category in soup_products.find_all('li', recursive=False):
        # Sometimes there is an empty <li> before categories (encountered in page 1138)
        category = next(soup_category.stripped_strings, None)
        if category is None:
            continue

        # Strangely there is a <ul> for every subcategory and product, not just <li>
        subcategories = tree[category] = {}
        for soup_sub_list in soup_category.find_all('ul', recursive=False):
            soup_subcategory = soup_sub_list.find('li')
            subcategory = next(soup_subcategory.stripped_strings)
            products = subcategories[subcategory] = []
            for soup_product_list in soup_subcategory.find_all('ul', recursive=False):
                soup_product = soup_product_list.find('li')
                products.append(next(soup_product.stripped_strings))
    return tree


def _expand_rows(page_row, tree):
    """Yield the CSV rows for one page.

    The first row carries every field of *page_row* plus the first category,
    sub-category and product.  Each later row holds only the values that
    start at that row: a further product yields a row with just
    'Product Name', a further sub-category starts a row with just
    'Sub-Category', and a further category starts a row with just 'Category'.
    A page without categories still yields *page_row* once.
    """
    row = dict(page_row)
    if not tree:
        yield row
        return
    for category_no, (category, subcategories) in enumerate(tree.items()):
        if category_no:
            row = {}
        row['Category'] = category
        if not subcategories:
            yield row
            continue
        for subcategory_no, (subcategory, products) in enumerate(subcategories.items()):
            if subcategory_no:
                row = {}
            row['Sub-Category'] = subcategory
            if not products:
                yield row
                continue
            for product_no, product in enumerate(products):
                if product_no:
                    row = {}
                row['Product Name'] = product
                yield row


with open('anuga_data.csv', 'w', newline='', encoding='utf-8') as f_out:
    field_names = ['Company Name', 'Address', 'Contact 1', 'Contact 2', 'Email', 'Website', 'Category', 'Sub-Category', 'Product Name', 'Sector', 'Channel', 'Target Market']
    writer = csv.DictWriter(f_out, fieldnames=field_names)
    writer.writeheader()
    for i in trange(page_first, page_last + 1):
        with open('html_pages/%04d.html' % i, encoding='utf-8') as f_in:
            soup = BeautifulSoup(f_in, 'lxml')
        row = {'Company Name': soup.find('h1', class_='h1down').get_text().strip()}

        # Explicit checks instead of ``assert``: assertions are removed under
        # ``python -O`` and a malformed page must never go unnoticed.
        soup_contacts = soup.find_all('div', class_='texts grey')
        if not soup_contacts:
            raise ValueError("page %04d: no 'texts grey' block found" % i)
        row['Address'] = _join_lines(soup_contacts[0], '\n')
        # zip() stops at the shorter input, so missing trailing blocks simply
        # leave their columns unset and surplus blocks are ignored.
        for field, soup_contact in zip(_CONTACT_FIELDS, soup_contacts[1:]):
            row[field] = soup_contact.get_text().strip()

        soup_div = soup.find('div', class_='searchcontent')
        captions = [text for text in (b.get_text().strip() for b in soup_div.find_all('b')) if text]
        soup_lists = soup.find_all('ul', class_='ultree')

        # Products fields can be actually empty but their caption is always there.
        if len(soup_lists) != len(captions):
            raise ValueError(
                "page %04d: %d 'ultree' lists but %d captions"
                % (i, len(soup_lists), len(captions))
            )

        # soup_lists[0] is the product tree; the optional sections follow it
        # in the fixed order of _OPTIONAL_SECTIONS.
        index = 1
        for caption, field in _OPTIONAL_SECTIONS:
            if caption in captions:
                row[field] = _join_lines(soup_lists[index], ', ')
                index += 1

        for csv_row in _expand_rows(row, _read_product_tree(soup_lists[0])):
            writer.writerow(csv_row)
	# Extract data from the anuga.com downloaded pages and create a sophisticated CSV table
#
# NOTE(review): these statements repeat the extraction script that appears
# earlier in this file.  While both copies stay in place, running the file
# performs the whole extraction twice and rewrites anuga_data.csv -- confirm
# whether this second copy is intended.  Its block structure had been
# flattened to a single indentation level; the nesting below is restored from
# the statement order.

from bs4 import BeautifulSoup
import csv
from tqdm import trange
import re

# Inclusive range of page numbers; page N is read from html_pages/NNNN.html.
page_first = 1
page_last = 7789

with open('anuga_data.csv', 'w', newline='', encoding='utf-8') as f_out:
    field_names = ['Company Name', 'Address', 'Contact 1', 'Contact 2', 'Email', 'Website', 'Category', 'Sub-Category', 'Product Name', 'Sector', 'Channel', 'Target Market']
    writer = csv.DictWriter(f_out, fieldnames=field_names)
    writer.writeheader()
    for i in trange(page_first, page_last + 1):
        with open('html_pages/%04d.html' % i, encoding='utf-8') as f_in:
            soup = BeautifulSoup(f_in, 'lxml')
        # Fields shared by the whole page; they end up on the page's first CSV row.
        row = {}
        row['Company Name'] = soup.find('h1', class_='h1down').get_text().strip()

        # The 'texts grey' blocks are mapped by position: address first, then
        # up to four further blocks in the column order used below.
        soup_contacts = soup.find_all('div', class_='texts grey')
        contacts_len = len(soup_contacts)
        assert contacts_len
        # Collapse each line break plus the whitespace after it into one '\n'.
        row['Address'] = re.sub(r'\n[\s\n]*', '\n', soup_contacts[0].get_text().strip())
        if 1 < contacts_len:
            row['Contact 1'] = soup_contacts[1].get_text().strip()
            if 2 < contacts_len:
                row['Contact 2'] = soup_contacts[2].get_text().strip()
                if 3 < contacts_len:
                    row['Email'] = soup_contacts[3].get_text().strip()
                    if 4 < contacts_len:
                        row['Website'] = soup_contacts[4].get_text().strip()

        # I could probably simplify the following with stripped_strings generator of bs4.
        soup_div = soup.find('div', class_='searchcontent')
        non_empty_fields = [x2 for x2 in (x1.get_text().strip() for x1 in soup_div.find_all('b')) if x2 != '']
        # From here on ``soup`` is the list of 'ultree' <ul> tags, not the document.
        soup = soup.find_all('ul', class_='ultree')

        # Products fields can be actually empty but their caption is always there.
        assert len(soup) == len(non_empty_fields)

        # soup[0] is the product tree; the optional sections follow it in this
        # fixed order, so ``index`` only advances for sections that are present.
        index = 1
        if 'Product sector' in non_empty_fields:
            row['Sector'] = re.sub(r'\n[\s\n]*', ', ', soup[index].get_text().strip())
            index += 1
        if 'Distribution Channel' in non_empty_fields:
            row['Channel'] = re.sub(r'\n[\s\n]*', ', ', soup[index].get_text().strip())
            index += 1
        if 'Target and sales markets' in non_empty_fields:
            row['Target Market'] = re.sub(r'\n[\s\n]*', ', ', soup[index].get_text().strip())

        # category name -> {sub-category name -> [product names]}
        data = {}
        for soup_category in soup[0].find_all('li', recursive=False):
            # Sometimes there is an empty <li> before categories (encountered in page 1138)
            try:
                category = next(soup_category.stripped_strings)
            except StopIteration:
                continue

            # Strangely there is a <ul> for every subcategory and product, not just <li>
            data[category] = {}
            for soup_subcategory in (x.find('li') for x in soup_category.find_all('ul', recursive=False)):
                subcategory = next(soup_subcategory.stripped_strings)
                data[category][subcategory] = []
                for soup_product in (x.find('li') for x in soup_subcategory.find_all('ul', recursive=False)):
                    product = next(soup_product.stripped_strings)
                    data[category][subcategory].append(product)

        # Write one CSV row per product.  Only the first row of the page keeps
        # the page-level fields; every later row starts empty and holds just
        # the values that begin on it.  Each counter stays 0 when its loop
        # never ran, in which case the pending row is written once so that
        # nothing collected so far is lost.
        i1 = 0
        for i1, category in enumerate(data, start=1):
            if i1 > 1:
                row = {}
            row['Category'] = category
            i2 = 0
            for i2, subcategory in enumerate(data[category], start=1):
                if i2 > 1:
                    row = {}
                row['Sub-Category'] = subcategory
                i3 = 0
                for i3, product in enumerate(data[category][subcategory], start=1):
                    if i3 > 1:
                        row = {}
                    row['Product Name'] = product
                    writer.writerow(row)
                if i3 == 0:
                    writer.writerow(row)
            if i2 == 0:
                writer.writerow(row)
        if i1 == 0:
            writer.writerow(row)