-
-
Save agentphantom/77eb3f8a9177c4c0b35c6b1e289d4efc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Script that interacts with a website using Selenium to obtain hidden email addresses.""" | |
import csv | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
# The category slug and the number of result pages to visit are hardcoded.
CATEGORY = "it-services"
NUMBER_OF_PAGES = 41

# URL template shared by all categories: {0} is the category slug and
# {1} is the page index.
BASE_URL = "https://clutch.co/{0}?page={1}"
def get_page(current_index):
    """Visit one listing page and save the email addresses it reveals.

    Loads page ``current_index`` of ``CATEGORY`` in the module-level
    ``DRIVER`` browser, clicks every element with the ``contact`` class,
    then writes the text of each ``mailto:`` link to the module-level
    ``CSV_WRITER``, one address per row.

    Args:
        current_index: Page number substituted into ``BASE_URL``.
    """
    DRIVER.get(BASE_URL.format(CATEGORY, current_index))

    # Clicking every contact button reveals the hidden email addresses.
    for contact in DRIVER.find_elements_by_class_name("contact"):
        contact.click()

    # Re-read the DOM after the clicks so the revealed links are included,
    # then parse it with BeautifulSoup.
    html_body = DRIVER.find_element_by_tag_name("body").get_attribute(
        "innerHTML")
    soup = BeautifulSoup(html_body, "html.parser")

    # Anchors without an href default to "" and are skipped, which matches
    # the previous behaviour of ignoring the KeyError.
    for anchor in soup.find_all("a"):
        if "mailto:" in anchor.get("href", ""):
            CSV_WRITER.writerow([anchor.text])

    print("Finished page: {0}".format(current_index))
if __name__ == "__main__":
    # For ease of use geckodriver.exe lives in the same folder as this script.
    DRIVER = webdriver.Firefox(executable_path="geckodriver.exe")
    try:
        # The csv file is named after the category. It is opened in a
        # ``with`` block so buffered rows are flushed and the handle is
        # closed even if a page fails part-way through the run.
        with open("{0}.csv".format(CATEGORY), "a", newline="",
                  encoding="utf-8") as csv_file:
            # get_page() reads CSV_WRITER as a module-level global.
            CSV_WRITER = csv.writer(csv_file)
            for i in range(NUMBER_OF_PAGES):
                print("Downloading page: {0}".format(i))
                get_page(i)
    finally:
        # quit() ends the whole WebDriver session (browser window and the
        # geckodriver process); it runs even when scraping raises.
        DRIVER.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment