Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
"""Script that interacts with a website using Selenium to obtain hidden email addresses."""
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
# We hardcode the category name and the number of pages.
# CATEGORY fills the first placeholder of BASE_URL and also names the output
# csv file; NUMBER_OF_PAGES is how many page indexes (starting at 0) are visited.
CATEGORY = "it-services"
NUMBER_OF_PAGES = 41
# This url will be the same for all categories.
# {0} is replaced by the category and {1} by the page index.
BASE_URL = "https://clutch.co/{0}?page={1}"
def get_page(current_index):
    """Open one listing page, reveal its email addresses and save them.

    The page URL is built from the module-level ``BASE_URL`` and ``CATEGORY``.
    The browser session is the module-level ``DRIVER`` and every address found
    is written as a single-column row through the module-level ``CSV_WRITER``.

    Args:
        current_index: Page index substituted into the ``page`` query
            parameter of ``BASE_URL``.
    """
    DRIVER.get(BASE_URL.format(CATEGORY, current_index))

    # This clicks all the buttons that reveal the email addresses.
    for contact_button in DRIVER.find_elements_by_class_name("contact"):
        contact_button.click()

    # The clicks changed the DOM, so we fetch the updated HTML and parse it
    # with BeautifulSoup.
    body = DRIVER.find_element_by_tag_name("body")
    soup = BeautifulSoup(body.get_attribute("innerHTML"), "html.parser")

    # Finally, we keep every 'a' element whose href contains a 'mailto:' value.
    # Anchors without an href fall back to "" and are skipped.
    for anchor in soup.find_all("a"):
        if "mailto:" in anchor.get("href", ""):
            CSV_WRITER.writerow([anchor.text])

    print("Finished page: {0}".format(current_index))
if __name__ == "__main__":
    # For ease of use we have geckodriver.exe in the same folder as this script.
    DRIVER = webdriver.Firefox(executable_path="geckodriver.exe")
    try:
        # Our csv file will be named as the category name. It is opened in
        # append mode, and the with-block guarantees the rows are flushed and
        # the file is closed even if a page fails half-way through.
        with open("{0}.csv".format(CATEGORY), "a", newline="",
                  encoding="utf-8") as csv_file:
            CSV_WRITER = csv.writer(csv_file)
            for i in range(0, NUMBER_OF_PAGES):
                print("Downloading page: {0}".format(i))
                get_page(i)
    finally:
        # quit() ends the whole WebDriver session (browser and driver
        # process), and runs on errors too so no browser is left behind.
        DRIVER.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment