# cheevahagadog/requestium_in_cloud_demo.py

## requestium_in_cloud_demo.py
# Note: this is designed to run on Python 3.6.

# -- Prerequisites:
#  1. Install Python (I like using Miniconda, version 3.6)
#  2. Install Git
#  3. Install a chromedriver
#  4. Install Chrome
#  5. Install Python dependencies

from pyvirtualdisplay import Display  # For headless browsing
from selenium import webdriver
import requestium
import time
import glob
import os

import logging
from logging.handlers import RotatingFileHandler

# Module-level logger: INFO and above are written to a size-rotated "log.txt"
# in the current working directory.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

file_handler = RotatingFileHandler(
    "log.txt",
    mode='a',
    maxBytes=1 * 1024 * 1024,  # roll over once the file reaches 1 MiB
    backupCount=10,  # keep at most ten rolled-over files
)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(
    logging.Formatter(
        '%(asctime)s %(levelname)s: %(message)s '
        '[in %(pathname)s:%(lineno)d]'
    )
)
logger.addHandler(file_handler)


class Browser(object):
    """Headless Chrome client with setup and teardown for a remote environment.

    Creating an instance starts a virtual display, builds a requestium session
    and loads ``link``. Call ``wrap_up`` (or run everything through ``main``)
    to shut the browser and the display down again.
    """

    def __init__(self, link, directory=None):
        """Start the virtual display and the browser session, then visit ``link``.

        Args:
            link: str, URL to the page you want to visit
            directory: str, full path to an existing directory where downloaded files will land.
                Any other value (``None``, a non-string, a path that is not a directory) leaves
                the download preference unset and makes the download check look in ``'.'``.
        """
        self.chromedriver = "/usr/local/bin/chromedriver"
        if isinstance(directory, str) and os.path.isdir(directory):
            self.webdriver_options = {'prefs': {'download.default_directory': directory}}
            self.directory = directory
        else:
            # NOTE(review): an empty dict is used instead of None because the
            # session is expected to run membership tests on this mapping --
            # confirm against the installed requestium version.
            self.webdriver_options = {}
            self.directory = '.'

        # Kept on the instance so that wrap_up() can stop it later.
        self.display = Display(visible=0, size=(1000, 1000))
        self.display.start()
        try:
            self.session = requestium.Session(
                webdriver_path=self.chromedriver,
                browser='chrome',
                default_timeout=15,
                webdriver_options=self.webdriver_options)
            self.session.driver.get(link)
        except Exception:
            # Do not leave the virtual display running when setup fails.
            self.display.stop()
            raise
        logger.info("(setup) Started display, session and visited %s.", link)

    def select_and_download_data(self, file_name='*.csv'):
        """Example for interacting with the NCES college website.

        Selects New York, ticks the grad/bachelor/public filters, runs the
        search, requests a CSV export and waits for the download to land in
        ``self.directory``.

        Args:
            file_name: str, name (or glob pattern) of the exported file to wait for.
                The default matches any ``.csv`` file in the download directory,
                including one that was already there -- pass the exact export
                name when it is known.

        Returns:
            bool: True if the file successfully downloaded, else False

        Raises:
            ValueError: a download was seen in progress and finished, but no
                file matching ``file_name`` exists afterwards.
        """
        driver = self.session.driver

        # Let's just select the schools in New York (because I love New York!)
        states = driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_ucMapMain_lstState')
        state_select = requestium.Select(states)
        state_select.deselect_all()
        state_select.select_by_value("NY")

        # The remaining steps are plain clicks, performed in this order.
        click_ids = (
            # Only selecting schools with undergrad and grad options
            "ctl00_cphCollegeNavBody_ucSearchMain_chkGrad",
            "ctl00_cphCollegeNavBody_ucSearchMain_chkBach",
            # Public schools
            "ctl00_cphCollegeNavBody_ucSearchMain_chkControlPublic",
            # Show Results
            "ctl00_cphCollegeNavBody_ucSearchMain_btnSearch",
            # Click on "Export Results"
            "ctl00_cphCollegeNavBody_ucFavoritesTop_divExport",
            # Click the CSV option output
            "ctl00_cphCollegeNavBody_ucFavoritesTop_rdbCSV",
            # Final export button --> this downloads the file to our specified directory
            "ctl00_cphCollegeNavBody_ucFavoritesTop_aExportData",
        )
        for element_id in click_ids:
            driver.ensure_element_by_id(element_id).click()

        return self.wait_for_download_to_complete(file_name=file_name)

    def wait_for_download_to_complete(self, file_name, delay=2, tries_max=10):
        """Waits for a file to download before continuing execution.

        Args:
            file_name: str, the name of the file to be downloaded including the extension
                (it is passed through ``glob``, so wildcards are honoured)
            delay: int, how many seconds to wait before checking the file again
            tries_max: int, how many attempts at checking a download is happening before quitting

        Returns:
            bool: True if the file successfully downloaded, else False

        Raises:
            ValueError: a partial download was observed and then disappeared
                without the expected file showing up.
        """
        downloading_file = os.path.join(self.directory, "Unconfirmed*.crdownload")
        finished_file = os.path.join(self.directory, file_name)
        n_tries = 0
        download_started = False
        # Pre-bound so the return below is defined even when tries_max <= 0.
        file_is_downloaded = []

        while n_tries < tries_max:

            currently_downloading = glob.glob(downloading_file)
            file_is_downloaded = glob.glob(finished_file)

            # A file is downloading, but our expected file isn't there yet.
            # This branch does not consume a try: we wait as long as the
            # partial file exists.
            if currently_downloading and not file_is_downloaded:
                download_started = True
                time.sleep(delay)

            elif not currently_downloading and download_started and not file_is_downloaded:
                raise ValueError(f"File downloaded but was perhaps misnamed. No {finished_file} file found!")

            elif file_is_downloaded:
                break
            # we wait for a file to show up as downloading
            else:
                n_tries += 1
                time.sleep(delay)

        # glob returns a list; the documented contract is a bool.
        return bool(file_is_downloaded)

    def wrap_up(self, session=None):
        """Close the browser session and the display.

        Args:
            session: the session whose driver should be quit; defaults to
                ``self.session`` when omitted.
        """
        if session is None:
            session = self.session
        try:
            session.driver.quit()  # Stops the Chrome session
        finally:
            # Stop the virtual display even if quitting Chrome raised.
            self.display.stop()
        logger.info('(wrap_up) Closed display and chrome browser')

    def main(self, file_name='*.csv'):
        """Run the download example and always tear the browser down afterwards.

        Args:
            file_name: str, forwarded to ``select_and_download_data``.

        Returns:
            bool: the result of ``select_and_download_data``.
        """
        try:
            success = self.select_and_download_data(file_name=file_name)
        finally:
            self.wrap_up()
        return success


if __name__ == '__main__':
    # Script entry point: open the NCES College Navigator page and run the
    # example download. The directory below is a placeholder path.
    download_dir = '/my/download/location/dir'
    butils = Browser(link="https://nces.ed.gov/collegenavigator/", directory=download_dir)
    butils.main()
	# Note: this is designed to run on Python 3.6.

	# -- Prerequisites:
	# 1. Install Python (I like using Miniconda, version 3.6)
	# 2. Install Git
	# 3. Install a chromedriver
	# 4. Install Chrome
	# 5. Install Python dependencies

	from pyvirtualdisplay import Display # For headless browsing
	from selenium import webdriver
	import requestium
	import time
	import glob
	import os

	import logging
	from logging.handlers import RotatingFileHandler

	# Module-level logger: INFO and above are written to a size-rotated "log.txt"
	# in the current working directory.
	logger = logging.getLogger(__name__)
	logger.setLevel(logging.INFO)

	file_handler = RotatingFileHandler(
	    "log.txt",
	    mode='a',
	    maxBytes=1 * 1024 * 1024,  # roll over once the file reaches 1 MiB
	    backupCount=10,  # keep at most ten rolled-over files
	)
	file_handler.setLevel(logging.INFO)
	file_handler.setFormatter(
	    logging.Formatter(
	        '%(asctime)s %(levelname)s: %(message)s '
	        '[in %(pathname)s:%(lineno)d]'
	    )
	)
	logger.addHandler(file_handler)


	class Browser(object):
	    """Headless Chrome client with setup and teardown for a remote environment.

	    Creating an instance starts a virtual display, builds a requestium session
	    and loads ``link``. Call ``wrap_up`` (or run everything through ``main``)
	    to shut the browser and the display down again.
	    """

	    def __init__(self, link, directory=None):
	        """Start the virtual display and the browser session, then visit ``link``.

	        Args:
	            link: str, URL to the page you want to visit
	            directory: str, full path to an existing directory where downloaded files will land.
	                Any other value (``None``, a non-string, a path that is not a directory) leaves
	                the download preference unset and makes the download check look in ``'.'``.
	        """
	        self.chromedriver = "/usr/local/bin/chromedriver"
	        if isinstance(directory, str) and os.path.isdir(directory):
	            self.webdriver_options = {'prefs': {'download.default_directory': directory}}
	            self.directory = directory
	        else:
	            # NOTE(review): an empty dict is used instead of None because the
	            # session is expected to run membership tests on this mapping --
	            # confirm against the installed requestium version.
	            self.webdriver_options = {}
	            self.directory = '.'

	        # Kept on the instance so that wrap_up() can stop it later.
	        self.display = Display(visible=0, size=(1000, 1000))
	        self.display.start()
	        try:
	            self.session = requestium.Session(
	                webdriver_path=self.chromedriver,
	                browser='chrome',
	                default_timeout=15,
	                webdriver_options=self.webdriver_options)
	            self.session.driver.get(link)
	        except Exception:
	            # Do not leave the virtual display running when setup fails.
	            self.display.stop()
	            raise
	        logger.info("(setup) Started display, session and visited %s.", link)

	    def select_and_download_data(self, file_name='*.csv'):
	        """Example for interacting with the NCES college website.

	        Selects New York, ticks the grad/bachelor/public filters, runs the
	        search, requests a CSV export and waits for the download to land in
	        ``self.directory``.

	        Args:
	            file_name: str, name (or glob pattern) of the exported file to wait for.
	                The default matches any ``.csv`` file in the download directory,
	                including one that was already there -- pass the exact export
	                name when it is known.

	        Returns:
	            bool: True if the file successfully downloaded, else False

	        Raises:
	            ValueError: a download was seen in progress and finished, but no
	                file matching ``file_name`` exists afterwards.
	        """
	        driver = self.session.driver

	        # Let's just select the schools in New York (because I love New York!)
	        states = driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_ucMapMain_lstState')
	        state_select = requestium.Select(states)
	        state_select.deselect_all()
	        state_select.select_by_value("NY")

	        # The remaining steps are plain clicks, performed in this order.
	        click_ids = (
	            # Only selecting schools with undergrad and grad options
	            "ctl00_cphCollegeNavBody_ucSearchMain_chkGrad",
	            "ctl00_cphCollegeNavBody_ucSearchMain_chkBach",
	            # Public schools
	            "ctl00_cphCollegeNavBody_ucSearchMain_chkControlPublic",
	            # Show Results
	            "ctl00_cphCollegeNavBody_ucSearchMain_btnSearch",
	            # Click on "Export Results"
	            "ctl00_cphCollegeNavBody_ucFavoritesTop_divExport",
	            # Click the CSV option output
	            "ctl00_cphCollegeNavBody_ucFavoritesTop_rdbCSV",
	            # Final export button --> this downloads the file to our specified directory
	            "ctl00_cphCollegeNavBody_ucFavoritesTop_aExportData",
	        )
	        for element_id in click_ids:
	            driver.ensure_element_by_id(element_id).click()

	        return self.wait_for_download_to_complete(file_name=file_name)

	    def wait_for_download_to_complete(self, file_name, delay=2, tries_max=10):
	        """Waits for a file to download before continuing execution.

	        Args:
	            file_name: str, the name of the file to be downloaded including the extension
	                (it is passed through ``glob``, so wildcards are honoured)
	            delay: int, how many seconds to wait before checking the file again
	            tries_max: int, how many attempts at checking a download is happening before quitting

	        Returns:
	            bool: True if the file successfully downloaded, else False

	        Raises:
	            ValueError: a partial download was observed and then disappeared
	                without the expected file showing up.
	        """
	        downloading_file = os.path.join(self.directory, "Unconfirmed*.crdownload")
	        finished_file = os.path.join(self.directory, file_name)
	        n_tries = 0
	        download_started = False
	        # Pre-bound so the return below is defined even when tries_max <= 0.
	        file_is_downloaded = []

	        while n_tries < tries_max:

	            currently_downloading = glob.glob(downloading_file)
	            file_is_downloaded = glob.glob(finished_file)

	            # A file is downloading, but our expected file isn't there yet.
	            # This branch does not consume a try: we wait as long as the
	            # partial file exists.
	            if currently_downloading and not file_is_downloaded:
	                download_started = True
	                time.sleep(delay)

	            elif not currently_downloading and download_started and not file_is_downloaded:
	                raise ValueError(f"File downloaded but was perhaps misnamed. No {finished_file} file found!")

	            elif file_is_downloaded:
	                break
	            # we wait for a file to show up as downloading
	            else:
	                n_tries += 1
	                time.sleep(delay)

	        # glob returns a list; the documented contract is a bool.
	        return bool(file_is_downloaded)

	    def wrap_up(self, session=None):
	        """Close the browser session and the display.

	        Args:
	            session: the session whose driver should be quit; defaults to
	                ``self.session`` when omitted.
	        """
	        if session is None:
	            session = self.session
	        try:
	            session.driver.quit()  # Stops the Chrome session
	        finally:
	            # Stop the virtual display even if quitting Chrome raised.
	            self.display.stop()
	        logger.info('(wrap_up) Closed display and chrome browser')

	    def main(self, file_name='*.csv'):
	        """Run the download example and always tear the browser down afterwards.

	        Args:
	            file_name: str, forwarded to ``select_and_download_data``.

	        Returns:
	            bool: the result of ``select_and_download_data``.
	        """
	        try:
	            success = self.select_and_download_data(file_name=file_name)
	        finally:
	            self.wrap_up()
	        return success


	if __name__ == '__main__':
	    # Script entry point: open the NCES College Navigator page and run the
	    # example download. The directory below is a placeholder path.
	    download_dir = '/my/download/location/dir'
	    butils = Browser(link="https://nces.ed.gov/collegenavigator/", directory=download_dir)
	    butils.main()