headless_scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from time import sleep

# remember to pip install lxml
# boot up the headless chrome
def start_driver():
    # create an options object
    options = webdriver.ChromeOptions()
    # only point at a specific binary if I have to use weird features
    # options.binary_location = '/usr/bin/google-chrome-unstable'
    options.add_argument('headless')
    # fix the window size so pages render consistently between runs
    options.add_argument('window-size=1200x600')
    # initialize
    return webdriver.Chrome(options=options)
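
# Select (imported above) is only needed when a page has <select> dropdowns.
# A minimal sketch of how it could be used; the element id and option label
# here are hypothetical placeholders, not from any real page:
from selenium.webdriver.common.by import By

def choose_dropdown_option(driver, element_id, label):
    # wrap the <select> element and pick an option by its visible text
    dropdown = Select(driver.find_element(By.ID, element_id))
    dropdown.select_by_visible_text(label)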
def get_soup(driver, url):
    # time out after 30 seconds
    driver.set_page_load_timeout(30)
    # give element lookups up to 5 seconds while javascript renders
    driver.implicitly_wait(5)
    driver.get(url)
    # scroll down to trigger any lazy-loaded content
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # the implicit wait only covers element lookups, not scrolling, so
    # manually throw in a short wait for anything the scroll triggered
    sleep(2)
    # get html
    html = driver.page_source
    # soupify
    soup = BeautifulSoup(html, "lxml")
    return soup
# start driver
driver = start_driver()
URL = "URL THAT YOU WANT"
soup = get_soup(driver, URL)
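
# once you have soup, extraction is ordinary BeautifulSoup. A small sketch
# that pulls every link off the page; any class or id you filter on beyond
# the bare <a> tag would be specific to your target site:
def get_links(soup):
    # .get avoids a KeyError on <a> tags that have no href attribute
    return [a.get("href") for a in soup.find_all("a") if a.get("href")]

links = get_links(soup)
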
# Guide to selenium waits: https://selenium-python.readthedocs.io/waits.html
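
# per the guide above, an explicit wait is usually better than
# implicitly_wait plus sleep: it blocks only until a condition is true.
# A sketch; which element id signals "javascript is done" is site-specific:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_element(driver, element_id, timeout=10):
    # poll until the element is in the DOM, or raise TimeoutException
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, element_id))
    )

# quit the driver when finished so headless chrome processes don't pile up
driver.quit()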