headless_scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from time import sleep

# remember to pip install lxml
# boot up the headless chrome
def start_driver():
    # create an options object
    options = webdriver.ChromeOptions()
    # only point at a specific binary if I have to use weird features
    # options.binary_location = '/usr/bin/google-chrome-unstable'
    options.add_argument('headless')
    # fix the window size so pages render consistently between runs
    options.add_argument('window-size=1200x600')
    # initialize
    return webdriver.Chrome(options=options)
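
# Select (imported above) is only needed when a page has <select> dropdowns.
# A minimal sketch of how it could be used; the element id and option label
# here are hypothetical placeholders, not from any real page:
from selenium.webdriver.common.by import By

def choose_dropdown_option(driver, element_id, label):
    # wrap the <select> element and pick an option by its visible text
    dropdown = Select(driver.find_element(By.ID, element_id))
    dropdown.select_by_visible_text(label)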
def get_soup(driver, url):
    # time out after 30 seconds
    driver.set_page_load_timeout(30)
    # give element lookups up to 5 seconds while javascript renders
    driver.implicitly_wait(5)
    driver.get(url)
    # scroll down to trigger any lazy-loaded content
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # the implicit wait only covers element lookups, not scrolling, so
    # manually throw in a short wait for anything the scroll triggered
    sleep(2)
    # get html
    html = driver.page_source
    # soupify
    soup = BeautifulSoup(html, "lxml")
    return soup
# start driver
driver = start_driver()
URL = "URL THAT YOU WANT"
soup = get_soup(driver, URL)
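
# once you have soup, extraction is ordinary BeautifulSoup. A small sketch
# that pulls every link off the page; any class or id you filter on beyond
# the bare <a> tag would be specific to your target site:
def get_links(soup):
    # .get avoids a KeyError on <a> tags that have no href attribute
    return [a.get("href") for a in soup.find_all("a") if a.get("href")]

links = get_links(soup)
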
# Guide to selenium waits: https://selenium-python.readthedocs.io/waits.html
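
# per the guide above, an explicit wait is usually better than
# implicitly_wait plus sleep: it blocks only until a condition is true.
# A sketch; which element id signals "javascript is done" is site-specific:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_element(driver, element_id, timeout=10):
    # poll until the element is in the DOM, or raise TimeoutException
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, element_id))
    )

# quit the driver when finished so headless chrome processes don't pile up
driver.quit()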