Instantly share code, notes, and snippets.

Embed
What would you like to do?
#!/usr/bin/python3
"""A script that downloads the contents of an online directory into a .csv file."""
import atexit
import csv
import random
import time

import requests
from bs4 import BeautifulSoup
# Pool of browser User-Agent strings; each request picks one at random.
USER_AGENTS = [
    "Opera/9.80 (Windows NT 6.2; Win64; x64) Presto/2.12 Version/12.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0",
]

# Directory landing page; the total page count is read from it.
INITIAL_URL = "https://www.mundoherbolario.com/directorio-empresas"

# Listing-page URL template; "{0}" is replaced with the page number.
BASE_URL = (
    "https://www.mundoherbolario.com/directorio-empresas/?p={0}&category=0&zoom=15&is_mile=0&directory_radius=0&view=list&sort=title"
)
# Output file for the scraped rows. A named handle is kept (instead of an
# anonymous open() passed straight to csv.writer) so the file can be closed
# explicitly; atexit guarantees buffered rows are flushed when the script ends.
# newline="" leaves line-ending handling to the csv module, and "utf-8-sig"
# writes a UTF-8 byte-order mark at the start of the file.
RESULTS_FILE = open("results.csv", "w", newline="", encoding="utf-8-sig")
atexit.register(RESULTS_FILE.close)

# Shared CSV writer used by load_directory_page(); the header row is written
# once, at import time.
WRITER = csv.writer(RESULTS_FILE)
WRITER.writerow(["Name", "Category", "Address", "Phone",
                 "Email", "Website", "Description"])
def init():
    """Start the spider: find the page count, then scrape every page.

    Fetches INITIAL_URL, reads the total number of pages from the pagination
    buttons, and calls load_directory_page() for each page number from 1 up
    to that total, sleeping one second after each page.

    Raises:
        requests.RequestException: If the initial request fails, times out,
            or comes back with an HTTP error status.
        IndexError, ValueError: If the page count cannot be read from the
            pagination buttons.
    """
    headers = {"User-Agent": random.choice(USER_AGENTS)}

    # Without a timeout a stalled connection would block the script forever.
    with requests.get(INITIAL_URL, headers=headers, timeout=30) as response:
        # Stop on 4xx/5xx here instead of failing later while parsing an
        # error page that has no pagination buttons.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

    pager_buttons = soup.find_all(
        "a", class_="sabai-btn sabai-btn-default sabai-btn-sm")
    # The second-to-last button's text is taken as the last page number.
    # NOTE(review): presumably the final button is a "next" link -- confirm
    # against the site's markup.
    total_pages = int(pager_buttons[-2].text)

    # This directory starts at page 1
    for current_index in range(1, total_pages + 1):
        print("Downloading: {0} of {1}".format(current_index, total_pages))
        load_directory_page(current_index)
        print("Finished: {0} of {1}".format(current_index, total_pages))
        # Pause between pages to avoid hammering the server.
        time.sleep(1)
def load_directory_page(current_index):
    """Scrape one listing page and write a CSV row for each company on it.

    Args:
        current_index: Page number substituted into BASE_URL.

    Raises:
        requests.RequestException: If the request fails, times out, or comes
            back with an HTTP error status.
    """
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    page_url = BASE_URL.format(current_index)

    # Without a timeout a stalled connection would block the script forever.
    with requests.get(page_url, headers=headers, timeout=30) as response:
        # Stop on 4xx/5xx rather than silently writing nothing for this page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

    # The whole multi-class string is passed as-is, so only elements whose
    # class attribute matches this exact string are selected.
    companies_list = soup.find_all(
        "div", class_="sabai-entity sabai-entity-type-content sabai-entity-bundle-name-directory-listing sabai-entity-bundle-type-directory-listing sabai-entity-mode-summary sabai-clearfix")

    # CSS classes of the per-company fields, in the same order as the CSV
    # header: Name, Category, Address, Phone, Email, Website, Description.
    field_classes = (
        "sabai-directory-title",
        "sabai-directory-category",
        "sabai-directory-location",
        "sabai-directory-contact-tel",
        "sabai-directory-contact-email",
        "sabai-directory-contact-website",
        "sabai-directory-body",
    )

    for item in companies_list:
        WRITER.writerow(
            [get_element(item, field_class) for field_class in field_classes])
def get_element(soup, class_name):
    """Return the stripped text of the first <div> with the given class.

    Falls back to the string "N/A" when the lookup raises AttributeError,
    e.g. because no matching <div> exists and find() returned None.
    """
    try:
        node = soup.find("div", class_=class_name)
        value = node.text.strip()
    except AttributeError:
        value = "N/A"
    return value
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    init()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment