#!/usr/bin/python3
"""Scrapes the Mundo Herbolario company directory into a CSV file."""
import csv
import random
import time

import requests
from bs4 import BeautifulSoup
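
# A pool of desktop user agents; one is picked at random for each request so
# the traffic looks less obviously like a single automated client.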
USER_AGENTS = [
    "Opera/9.80 (Windows NT 6.2; Win64; x64) Presto/2.12 Version/12.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0"
]
INITIAL_URL = "https://www.mundoherbolario.com/directorio-empresas"
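# In BASE_URL, {0} is the page number; the remaining query parameters appear
# to pin the directory to its default list view so the markup stays stable.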
BASE_URL = "https://www.mundoherbolario.com/directorio-empresas/?p={0}&category=0&zoom=15&is_mile=0&directory_radius=0&view=list&sort=title"

# The CSV file is opened once at import time; the handle is kept so it can be
# closed explicitly when the scrape finishes.
OUTPUT_FILE = open("results.csv", "w", newline="", encoding="utf-8-sig")
WRITER = csv.writer(OUTPUT_FILE)
WRITER.writerow(["Name", "Category", "Address", "Phone",
                 "Email", "Website", "Description"])


def init():
    """Starts the scrape: reads the total page count, then walks every page."""
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    with requests.get(INITIAL_URL, headers=headers) as response:
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
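        # The last pagination button appears to be a "next" arrow, so the
        # second-to-last button carries the number of the final page.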
        total_pages = int(soup.find_all(
            "a", class_="sabai-btn sabai-btn-default sabai-btn-sm")[-2].text)
        # This directory starts at page 1.
        for current_index in range(1, total_pages + 1):
            print("Downloading: {0} of {1}".format(current_index, total_pages))
            load_directory_page(current_index)
            print("Finished: {0} of {1}".format(current_index, total_pages))
            # A short pause between pages keeps the request rate polite.
            time.sleep(1)


def load_directory_page(current_index):
    """Loads a directory page, using the current index as the page number."""
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    with requests.get(BASE_URL.format(current_index), headers=headers) as response:
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
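        # The long class string below matches the Sabai Directory plugin's
        # listing cards; matching it in full avoids picking up unrelated divs.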
        companies_list = soup.find_all(
            "div", class_="sabai-entity sabai-entity-type-content sabai-entity-bundle-name-directory-listing sabai-entity-bundle-type-directory-listing sabai-entity-mode-summary sabai-clearfix")
        for item in companies_list:
            item_name = get_element(item, "sabai-directory-title")
            item_cat = get_element(item, "sabai-directory-category")
            item_addr = get_element(item, "sabai-directory-location")
            item_phone = get_element(item, "sabai-directory-contact-tel")
            item_email = get_element(item, "sabai-directory-contact-email")
            item_website = get_element(item, "sabai-directory-contact-website")
            item_summary = get_element(item, "sabai-directory-body")
            WRITER.writerow([item_name, item_cat, item_addr,
                             item_phone, item_email, item_website, item_summary])


def get_element(soup, class_name):
    """Helper function that returns the element's text if available."""
    try:
        return soup.find("div", class_=class_name).text.strip()
    except AttributeError:
        # find() returned None, i.e. the listing lacks this field.
        return "N/A"


if __name__ == "__main__":
    init()
    OUTPUT_FILE.close()