Skip to content

Instantly share code, notes, and snippets.

@cheevahagadog
Last active February 5, 2019 18:23
Show Gist options
  • Save cheevahagadog/bb60240edd16499d3e368879f2f7d07a to your computer and use it in GitHub Desktop.
Save cheevahagadog/bb60240edd16499d3e368879f2f7d07a to your computer and use it in GitHub Desktop.
A brief example of using Requestium in a cloud environment while setting the download directory and waiting for files to download.
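# Note: this is designed to run on Python 3.6+ (f-strings are used below).
# -- Prerequisites:
# 1. Install Python (I like using Miniconda, version 3.6)
# 2. Install Git
# 3. Install a chromedriver (this script expects it at /usr/local/bin/chromedriver)
# 4. Install Chrome
# 5. Install the Python dependencies: pyvirtualdisplay, selenium, requestium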
from pyvirtualdisplay import Display # For headless browsing
from selenium import webdriver
import requestium
import time
import glob
import os
import logging
from logging.handlers import RotatingFileHandler
# Module-level logger: INFO-and-above records are appended to "log.txt",
# which rotates at 1 MiB and keeps up to 10 backup files.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
file_handler = RotatingFileHandler("log.txt", mode='a', maxBytes=1 * 1024 * 1024, backupCount=10)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(
    logging.Formatter('%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]')
)
logger.addHandler(file_handler)
class Browser(object):
    """Chrome session (via Requestium) running inside a virtual display.

    Creating an instance starts a ``pyvirtualdisplay`` display, builds a
    ``requestium.Session`` and visits ``link``. Call :meth:`wrap_up` (or run
    the whole flow through :meth:`main`) to close the browser and the display.
    """

    def __init__(self, link, directory=None):
        """Class for handling proper setup and teardown of a webdriven client for a remote environment.

        Args:
            link: str, URL to the page you want to visit
            directory: str, full path to an existing directory where downloaded files will land.
                If it is not a string naming an existing directory, no download
                preference is set and ``'.'`` is the directory watched for downloads.
        """
        # Stored on the instance because it is started below and stopped in wrap_up().
        self.display = Display(visible=0, size=(1000, 1000))
        self.chromedriver = "/usr/local/bin/chromedriver"

        if isinstance(directory, str) and os.path.isdir(directory):
            self.webdriver_options = {'prefs': {'download.default_directory': directory}}
            self.directory = directory
        else:
            # Use an empty dict rather than None so the session receives the same
            # kind of object as its default (NOTE(review): confirm against the
            # installed Requestium version).
            self.webdriver_options = {}
            # NOTE(review): with no download preference set, Chrome presumably saves
            # to its own default folder, which may not be '.' -- confirm.
            self.directory = '.'

        self.session = requestium.Session(
            webdriver_path=self.chromedriver,
            browser='chrome',
            default_timeout=15,
            webdriver_options=self.webdriver_options)
        # The display is started before the first `.driver` access below.
        self.display.start()
        self.session.driver.get(link)
        logger.info(f"(setup) Started display, session and visited {link}.")

    def select_and_download_data(self, file_name="*.csv"):
        """Example for interacting with the NCES college website.

        Selects New York, the grad/bachelor and public-school checkboxes, runs the
        search, exports the results as CSV and waits for the file to arrive.

        Args:
            file_name: str, name (or glob pattern) of the exported file expected in the
                download directory. The default matches any ``.csv`` file, so a CSV that
                already exists in the directory also counts; pass the exact name to avoid that.

        Returns:
            bool: True if the file successfully downloaded, else False

        Raises:
            ValueError: propagated from ``wait_for_download_to_complete`` when a download
                was seen but no file matching ``file_name`` appeared.
        """
        driver = self.session.driver

        # Let's just select the schools in New York (because I love New York!)
        states = driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_ucMapMain_lstState')
        state_select = requestium.Select(states)
        state_select.deselect_all()
        state_select.select_by_value("NY")

        # Only selecting schools with undergrad and grad options
        driver.ensure_element_by_id("ctl00_cphCollegeNavBody_ucSearchMain_chkGrad").click()
        driver.ensure_element_by_id("ctl00_cphCollegeNavBody_ucSearchMain_chkBach").click()
        # Public schools
        driver.ensure_element_by_id("ctl00_cphCollegeNavBody_ucSearchMain_chkControlPublic").click()
        # Show Results
        driver.ensure_element_by_id("ctl00_cphCollegeNavBody_ucSearchMain_btnSearch").click()
        # Click on "Export Results"
        driver.ensure_element_by_id("ctl00_cphCollegeNavBody_ucFavoritesTop_divExport").click()
        # Click the CSV option output
        driver.ensure_element_by_id("ctl00_cphCollegeNavBody_ucFavoritesTop_rdbCSV").click()
        # click the final export button --> this will download the file to our specified directory
        driver.ensure_element_by_id("ctl00_cphCollegeNavBody_ucFavoritesTop_aExportData").click()

        return self.wait_for_download_to_complete(file_name=file_name)

    def wait_for_download_to_complete(self, file_name, delay=2, tries_max=10):
        """Waits for a file to download before continuing execution.

        Args:
            file_name: str, the name of the file to be downloaded including the extension
                (it is passed to ``glob``, so wildcard patterns also work)
            delay: int, how many seconds to wait before checking the file again
            tries_max: int, how many attempts at checking a download is happening before quitting

        Returns:
            bool: True if the file successfully downloaded, else False

        Raises:
            ValueError: if an in-progress download was seen and then disappeared
                without the expected file showing up.
        """
        downloading_file = os.path.join(self.directory, "Unconfirmed*.crdownload")
        finished_file = os.path.join(self.directory, file_name)
        n_tries = 0
        download_started = False
        # Pre-set so the return below is defined even when the loop never runs
        # (tries_max <= 0).
        file_is_downloaded = []

        while n_tries < tries_max:
            currently_downloading = glob.glob(downloading_file)
            file_is_downloaded = glob.glob(finished_file)

            if file_is_downloaded:
                break
            if currently_downloading:
                # A file is downloading, but our expected file isn't there yet.
                # Attempts are not counted while a download is in progress.
                download_started = True
            elif download_started:
                raise ValueError(f"File downloaded but was perhaps misnamed. No {finished_file} file found!")
            else:
                # we wait for a file to show up as downloading
                n_tries += 1
            time.sleep(delay)

        return bool(file_is_downloaded)

    def wrap_up(self, session=None):
        """Close the browser session and the display.

        Args:
            session: optional session whose driver should be quit; defaults to
                the session created in ``__init__``.
        """
        if session is None:
            session = self.session
        try:
            session.driver.quit()  # Stops the Chrome session
        finally:
            # Stop the virtual display even if quitting the browser raised.
            self.display.stop()
        logger.info('(wrap_up) Closed display and chrome browser')

    def main(self):
        """Run the example download and always clean up afterwards.

        Returns:
            bool: the result of ``select_and_download_data``.
        """
        try:
            return self.select_and_download_data()
        finally:
            self.wrap_up()
if __name__ == '__main__':
    # Script entry point: open the NCES College Navigator page with downloads
    # directed to the given directory, then run the example flow via main().
    butils = Browser(
        link="https://nces.ed.gov/collegenavigator/",
        directory='/my/download/location/dir',
    )
    butils.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment