"""Script that interacts with a website using Selenium to obtain hidden email addresses.""" | |
import csv | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
# We hardcode the category name and the number of pages. | |
CATEGORY = "it-services" | |
NUMBER_OF_PAGES = 41 | |
# This url will be the same for all categories. | |
BASE_URL = "https://clutch.co/{0}?page={1}" | |


def get_page(current_index):
    """Navigates through the pages and saves the email addresses."""
    DRIVER.get(BASE_URL.format(CATEGORY, current_index))
    contacts = DRIVER.find_elements_by_class_name("contact")

    # This clicks all the buttons that reveal the email addresses.
    for contact in contacts:
        contact.click()
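
    # The loop above assumes the "contact" elements are already present when the
    # page is fetched. A minimal sketch of waiting for them explicitly (an
    # addition, not part of the original flow) would be:
    #
    #     from selenium.webdriver.common.by import By
    #     from selenium.webdriver.support.ui import WebDriverWait
    #     from selenium.webdriver.support import expected_conditions as EC
    #
    #     contacts = WebDriverWait(DRIVER, 10).until(
    #         EC.presence_of_all_elements_located((By.CLASS_NAME, "contact")))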

    # After that we get the new HTML and parse it with BeautifulSoup.
    html_body = DRIVER.find_element_by_tag_name("body").get_attribute("innerHTML")
    soup = BeautifulSoup(html_body, "html.parser")

    # Finally, we iterate through all the 'a' elements that contain a 'mailto:' value.
    for item in soup.find_all("a"):
        try:
            if "mailto:" in item["href"]:
                CSV_WRITER.writerow([item.text])
        except KeyError:
            # Anchors without an 'href' attribute raise KeyError; skip them.
            pass
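
    # Note: the loop above saves the visible link text. A sketch of pulling the
    # address out of the 'href' attribute instead (assuming a plain
    # "mailto:user@example.com" value) would be:
    #
    #     CSV_WRITER.writerow([item["href"].replace("mailto:", "", 1)])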
print("Finished page: {0}".format(current_index)) | |
if __name__ == "__main__": | |
# For ease of use we have geckodriver.exe in the same folder as this script. | |
DRIVER = webdriver.Firefox(executable_path="geckodriver.exe") | |
# Our csv file will be named as the category name. | |
CSV_WRITER = csv.writer(open("{0}.csv".format( | |
CATEGORY), "a", newline="", encoding="utf-8")) | |
for i in range(0, NUMBER_OF_PAGES): | |
print("Downloading page: {0}".format(i)) | |
get_page(i) | |
DRIVER.close() |
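
# Note: the `find_element(s)_by_*` helpers and the `executable_path` argument used
# above are deprecated in Selenium 4 and removed in recent releases. A minimal
# sketch of the equivalent Selenium 4+ calls (assuming geckodriver still sits in
# this folder) would be:
#
#     from selenium.webdriver.common.by import By
#     from selenium.webdriver.firefox.service import Service
#
#     DRIVER = webdriver.Firefox(service=Service("geckodriver.exe"))
#     contacts = DRIVER.find_elements(By.CLASS_NAME, "contact")
#     html_body = DRIVER.find_element(By.TAG_NAME, "body").get_attribute("innerHTML")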