"""Script that interacts with a website using Selenium to obtain hidden email addresses.""" | |
import csv | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
# We hardcode the category name and the number of pages. | |
CATEGORY = "it-services" | |
NUMBER_OF_PAGES = 41 | |
# This url will be the same for all categories. | |
BASE_URL = "https://clutch.co/{0}?page={1}" | |


def get_page(current_index):
    """Navigates through the pages and saves the email addresses."""
    DRIVER.get(BASE_URL.format(CATEGORY, current_index))
    contacts = DRIVER.find_elements_by_class_name("contact")

    # This clicks all the buttons that reveal the email addresses.
    for contact in contacts:
        contact.click()
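
    # The loop above assumes the "contact" elements are already present when the
    # page is fetched. A minimal sketch of waiting for them explicitly (an
    # addition, not part of the original flow) would be:
    #
    #     from selenium.webdriver.common.by import By
    #     from selenium.webdriver.support.ui import WebDriverWait
    #     from selenium.webdriver.support import expected_conditions as EC
    #
    #     contacts = WebDriverWait(DRIVER, 10).until(
    #         EC.presence_of_all_elements_located((By.CLASS_NAME, "contact")))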

    # After that we get the new HTML and parse it with BeautifulSoup.
    html_body = DRIVER.find_element_by_tag_name("body").get_attribute("innerHTML")
    soup = BeautifulSoup(html_body, "html.parser")

    # Finally, we iterate through all the 'a' elements that contain a 'mailto:' value.
    for item in soup.find_all("a"):
        try:
            if "mailto:" in item["href"]:
                CSV_WRITER.writerow([item.text])
        except KeyError:
            # Anchors without an 'href' attribute raise KeyError; skip them.
            pass
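
    # Note: the loop above saves the visible link text. A sketch of pulling the
    # address out of the 'href' attribute instead (assuming a plain
    # "mailto:user@example.com" value) would be:
    #
    #     CSV_WRITER.writerow([item["href"].replace("mailto:", "", 1)])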
print("Finished page: {0}".format(current_index)) | |
if __name__ == "__main__": | |
# For ease of use we have geckodriver.exe in the same folder as this script. | |
DRIVER = webdriver.Firefox(executable_path="geckodriver.exe") | |
# Our csv file will be named as the category name. | |
CSV_WRITER = csv.writer(open("{0}.csv".format( | |
CATEGORY), "a", newline="", encoding="utf-8")) | |
for i in range(0, NUMBER_OF_PAGES): | |
print("Downloading page: {0}".format(i)) | |
get_page(i) | |
DRIVER.close() |
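
# Note: the `find_element(s)_by_*` helpers and the `executable_path` argument used
# above are deprecated in Selenium 4 and removed in recent releases. A minimal
# sketch of the equivalent Selenium 4+ calls (assuming geckodriver still sits in
# this folder) would be:
#
#     from selenium.webdriver.common.by import By
#     from selenium.webdriver.firefox.service import Service
#
#     DRIVER = webdriver.Firefox(service=Service("geckodriver.exe"))
#     contacts = DRIVER.find_elements(By.CLASS_NAME, "contact")
#     html_body = DRIVER.find_element(By.TAG_NAME, "body").get_attribute("innerHTML")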