Example of using selenium webdriver + headless chrome to scrape links from dynamic-content pages
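The full script follows; as a quick orientation, here is a minimal sketch of the pattern it relies on (the same Selenium 3-era API, with headless Chrome driven through ChromeOptions). The URL and CSS selector in the sketch are placeholders, not the real Airbnb selectors used in the script, and chromedriver is assumed to be on the PATH.

# minimal sketch: headless Chrome renders a JS-heavy page, then links are scraped from the DOM
# (placeholder URL/selector; assumes a chromedriver binary is on the PATH)
from selenium import webdriver

opts = webdriver.ChromeOptions()
opts.add_argument('--headless')
opts.add_argument('--disable-gpu')
opts.add_argument('--no-sandbox')

driver = webdriver.Chrome(chrome_options=opts)
driver.get('https://example.com/some-dynamic-page')  # JS runs inside the headless browser

# once the page has rendered, pull hrefs out of the DOM like any static page
for el in driver.find_elements_by_css_selector('div.listing a'):
    print(el.get_attribute('href'))

driver.quit()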
#!/usr/bin/python

from os import getpid
from re import search
from time import sleep
from random import randint

from selenium import webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
class LinkSpider():

    def __init__(self):  # can loop over by specifying ?adults= 1..16
        self.url_to_crawl = "https://www.airbnb.com/s/New-South-Wales--Australia/homes"
        self.all_items = []

    # Open headless chromedriver
    def start_driver(self):
        print('starting driver...')
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        options = webdriver.ChromeOptions()
        options.add_argument('--incognito')
        options.add_argument('--disable-extensions')
        options.add_argument('--no-sandbox')
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-setuid-sandbox')
        options.add_argument('--allow-running-insecure-content')
        options.add_argument('--ignore-certificate-errors')

        # don't load images // use cache
        prefs = {"profile.managed_default_content_settings.images": 2, 'disk-cache-size': 4096}
        options.add_experimental_option("prefs", prefs)
        # make sure you have a chromedriver executable installed, either on your PATH or at the path given below
        # exit status 127 probably means some shared libraries are missing; run the binary by hand to see which
        # sudo apt install libgconf2-4 might help
        try:
            print('attempting to start chrome...')
            self.driver = webdriver.Chrome(executable_path="/var/chromedriver/chromedriver", chrome_options=options)
            # self.driver.implicitly_wait(20)
            self.driver.set_page_load_timeout(10)
        except Exception as e:
            print("couldn't launch chrome")
            print(str(e))
            self.display.stop()
            if hasattr(self, 'driver'):
                self.driver.quit()
    # Close chromedriver
    def close_driver(self):
        print('closing driver...')
        self.display.stop()
        self.driver.quit()
        print('closed!')

    # Tell the browser to get a page
    def get_page(self, url):
        print('getting page: {}'.format(url))
        try:
            self.driver.get(url)
        except TimeoutException:
            print("Timed out on loading the page")
            self.close_driver()
            exit(1)
    def grab_list_items(self):
        print('grabbing list of items...')

        ###################
        extract_amount = 300
        ###################
        count = 0
        exception_count = 0
        pause_time = 1

        while count < extract_amount:
            try:
                print("\ntry loading page first ...")
                sleep(pause_time)

                # load the div elements and scrape the links
                for c, div in enumerate(self.driver.find_elements_by_class_name('_1szwzht')):
                    print(div.text)
                    data = None  # so a failed lookup below can't leave `data` undefined
                    try:
                        data = div.find_element_by_css_selector('._1szwzht a').get_attribute('href')
                    except Exception as e:
                        print(str(e))

                    # skip "Airbnb Plus" listings, keep everything else
                    if data and 'plus' not in data:
                        self.all_items.append(data)
                        count += 1

                # click to go to the next page
                # there are four elements with class "_1ip5u88"; we want the last one, the ">" button
                buttons = self.driver.find_elements_by_class_name('_1ip5u88')
                buttons[3].click()
            except Exception as e:
                exception_count += 1
                print(e)
                print(self.driver.page_source)
                if exception_count < 100:
                    continue
                print("killing because too many exceptions")
                self.close_driver()
                exit(1)
    def parse(self):
        self.start_driver()

        # loop through number of guests
        for guests in range(17):
            print("--- Crawling through ?guests={} ---".format(guests))
            sleep(2)
            self.get_page(self.url_to_crawl + "?adults=" + str(guests))
            self.grab_list_items()

            # dedupe, then keep only the numeric listing id from each link
            items_list = list(set(self.all_items))
            items_list = [search(r"/(\d+)\?", item).group(1) for item in items_list]

            # offload the items to file
            with open('/tmp/links_' + str(getpid()), 'a') as f:
                for i in items_list:
                    f.write(str(i) + '\n')

            # empty it back again
            self.all_items = []

        self.close_driver()

        if self.all_items:
            return self.all_items
        else:
            return False, False
# Create the empty file to store in
open('/tmp/links_' + str(getpid()), 'w').close()

# Run spider
Link = LinkSpider()
items_list = Link.parse()

#items_list = list(set(items_list))

# extract links
#items_list = [ search(r"/(\d+)\?", item).group(1) for item in items_list]
#print(len(items_list))

# Do something with the data touched
#with open("/tmp/links", 'w+') as f:
#    for item in items_list:
#        #print(item)
#        f.write(item + '\n')

print("================ DONE ===============")
Each set scrapes around 300 links, which works out to roughly 20 page loads per set. Airbnb starts blocking the scraper after the 8th set, so that's about 160+ page loads, which is pretty decent for a small amount of scraping :)
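Not part of the gist, but since the blocking kicks in around the 8th set, one thing worth trying is randomizing the pause between page loads; the script already imports randint without using it. A minimal sketch, with made-up pause bounds:

from time import sleep
from random import randint

def polite_pause(low=3, high=9):
    # sleep a random number of whole seconds between page loads;
    # the 3-9 second bounds are guesses, not values Airbnb is known to tolerate
    sleep(randint(low, high))

# e.g. inside grab_list_items(), the fixed `sleep(pause_time)` could become `polite_pause()`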