Skip to content

Instantly share code, notes, and snippets.

@kizernis
Last active May 22, 2019 00:03
Show Gist options
  • Save kizernis/7a80d29cde4bc9b4b727194ddd97e17e to your computer and use it in GitHub Desktop.
Save kizernis/7a80d29cde4bc9b4b727194ddd97e17e to your computer and use it in GitHub Desktop.
# Extract data from the downloaded anuga.com pages and create a sophisticated CSV table
from bs4 import BeautifulSoup
import csv
from tqdm import trange
import re
page_first = 1
page_last = 7789

# Column order of the generated CSV file.
field_names = ['Company Name', 'Address', 'Contact 1', 'Contact 2', 'Email', 'Website', 'Category', 'Sub-Category', 'Product Name', 'Sector', 'Channel', 'Target Market']

# CSV columns filled from the 2nd, 3rd, 4th and 5th "texts grey" blocks, in that order.
_CONTACT_COLUMNS = ('Contact 1', 'Contact 2', 'Email', 'Website')

# (caption text, CSV column) for the optional sections.  The order matters:
# the trees after the products tree are consumed in this order.
_OPTIONAL_SECTIONS = (
    ('Product sector', 'Sector'),
    ('Distribution Channel', 'Channel'),
    ('Target and sales markets', 'Target Market'),
)

# A line break plus any whitespace that follows it; compiled once for all pages.
_LINE_BREAKS = re.compile(r'\n[\s\n]*')


def _parse_product_tree(tree):
    """Return ``{category: {subcategory: [product, ...]}}`` for the products <ul>.

    Repeated category or sub-category names are merged rather than letting a
    later occurrence discard the entries collected for an earlier one.
    """
    data = {}
    for li_category in tree.find_all('li', recursive=False):
        # Sometimes there is an empty <li> before categories (encountered in page 1138)
        category = next(li_category.stripped_strings, None)
        if category is None:
            continue
        subcategories = data.setdefault(category, {})
        # Strangely there is a <ul> for every subcategory and product, not just <li>
        for ul_subcategory in li_category.find_all('ul', recursive=False):
            li_subcategory = ul_subcategory.find('li')
            subcategory = next(li_subcategory.stripped_strings)
            products = subcategories.setdefault(subcategory, [])
            for ul_product in li_subcategory.find_all('ul', recursive=False):
                products.append(next(ul_product.find('li').stripped_strings))
    return data


def _parse_page(soup, page_number):
    """Extract the company-level row and the product hierarchy from one page.

    Returns a ``(row, data)`` tuple: ``row`` maps CSV column names to text for
    the company-level columns, ``data`` is the nested dict built by
    ``_parse_product_tree``.

    Raises ValueError (naming the page) when an expected element is missing.
    Explicit raises are used instead of ``assert`` so the checks still run
    under ``python -O``.
    """
    heading = soup.find('h1', class_='h1down')
    if heading is None:
        raise ValueError('page %04d: company name heading not found' % page_number)
    row = {'Company Name': heading.get_text().strip()}

    contacts = soup.find_all('div', class_='texts grey')
    if not contacts:
        raise ValueError('page %04d: no contact blocks found' % page_number)
    row['Address'] = _LINE_BREAKS.sub('\n', contacts[0].get_text().strip())
    # zip() stops at the shorter sequence, so missing trailing blocks are skipped.
    for column, block in zip(_CONTACT_COLUMNS, contacts[1:]):
        row[column] = block.get_text().strip()

    # NOTE: this could probably be simplified with bs4's stripped_strings generator.
    content = soup.find('div', class_='searchcontent')
    if content is None:
        raise ValueError('page %04d: search content block not found' % page_number)
    captions = [text for text in (b.get_text().strip() for b in content.find_all('b')) if text]

    trees = soup.find_all('ul', class_='ultree')
    # Products fields can be actually empty but their caption is always there,
    # so there must be exactly one tree per caption; otherwise the positional
    # lookup below would pair sections with the wrong trees.
    if len(trees) != len(captions):
        raise ValueError('page %04d: found %d trees for %d captions'
                         % (page_number, len(trees), len(captions)))

    # trees[0] is the products tree; the optional sections follow it.
    tree_index = 1
    for caption, column in _OPTIONAL_SECTIONS:
        if caption in captions:
            row[column] = _LINE_BREAKS.sub(', ', trees[tree_index].get_text().strip())
            tree_index += 1

    return row, _parse_product_tree(trees[0])


def _sparse_rows(first_row, data):
    """Yield the CSV rows for one company, one row per product.

    ``first_row`` (mutated in place) carries the company-level columns.  A
    category or sub-category name is written only on the first row that
    belongs to it; every later row starts empty.  A category without
    sub-categories, a sub-category without products, or a company without
    any categories still produces one row.
    """
    row = first_row
    if not data:
        yield row
        return
    for category, subcategories in data.items():
        row['Category'] = category
        if not subcategories:
            yield row
            row = {}
            continue
        for subcategory, products in subcategories.items():
            row['Sub-Category'] = subcategory
            if not products:
                yield row
                row = {}
                continue
            for product in products:
                row['Product Name'] = product
                yield row
                row = {}


with open('anuga_data.csv', 'w', newline='', encoding='utf-8') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=field_names)
    writer.writeheader()
    for i in trange(page_first, page_last + 1):
        with open('html_pages/%04d.html' % i, encoding='utf-8') as f_in:
            soup = BeautifulSoup(f_in, 'lxml')
        company_row, product_data = _parse_page(soup, i)
        for csv_row in _sparse_rows(company_row, product_data):
            writer.writerow(csv_row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment