#!/usr/bin/env python
#########################
# airbnb-search.py
#########################
# by prehensile, 18/07/17
#########################
# Crawl airbnb search results (descriptions and reviews) for keywords.
# A quick, dirty and brittle set of hacks.
# Very likely to break the next time anything changes in airbnb's HTML.
# Essentially, if it works for you, be pleasantly surprised.
##########################
# Written for python 2.7
# Requires:
# requests
# http://docs.python-requests.org/en/master/
# beautifulsoup
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
##########################
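# Example invocation (keywords are positional; the URL below is just the
# script's built-in default, shown for illustration):
#
#   python airbnb-search.py -u "https://www.airbnb.com/s/New-York--NY--United-States" kids baby cot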
import sys
import json
import requests
import time
import argparse
import urlparse
from bs4 import BeautifulSoup

def request_url( url, retries=3 ):
    """
    GET a page from url with a number of retries and exponential backoff.
    """
    print "request_url: %s" % url
    r = None
    wait = 2
    for i in range( retries ):
        r = requests.get( url )
        if r.status_code >= 400:
            print_error( "HTTP error: %d %s" % (r.status_code, r.reason) )
            print_error( r.text )
            # exponential backoff
            time.sleep( wait )
            wait *= 2
        else:
            break
    return r

def print_error( message ):
    """
    Print an error to stderr.
    """
    print >> sys.stderr, message.encode('utf-8')

def room_url_for_id( room_id, airbnb_host="www.airbnb.com" ):
    """
    Construct an airbnb url for a listing with room_id.
    """
    return "https://%s/rooms/%d" % (airbnb_host, room_id)

def get_bootstrap_data_for_hypernova_key( body, hypernova_key ):
    """
    Extract bootstrap JSON data from a page body.
    No idea what hypernova is, but it sure requires some useful JSON.
    """
    soup = BeautifulSoup( body, "html.parser" )
    for tag in soup.find_all( "script", attrs={ "data-hypernova-key" : hypernova_key } ):
        s = tag.string
        if s and "bootstrapData" in s:
            # HACK! strip the HTML comment wrapper ("<!--" and "-->")
            # just by truncating the string.
            # Will need to change if the string in the HTML changes.
            return json.loads( s[4:-3] )
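# The JSON lives inside a script tag whose body is an HTML comment, roughly
# like this (a sketch from inspection, not a stable format):
#
#   <script data-hypernova-key="p3indexbundlejs"><!--{"bootstrapData": {...}}--></script>
#
# hence the s[4:-3] truncation above.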

def fetch_listing_data( room_id, airbnb_host="www.airbnb.com" ):
    """
    Fetch a listing page and extract embedded JSON representation.
    """
    room_url = room_url_for_id( room_id, airbnb_host=airbnb_host )
    r = request_url( room_url )
    return get_bootstrap_data_for_hypernova_key( r.text, "p3indexbundlejs" )

def get_context( text, term, n ):
    """
    Return the first occurrence of term in text with n characters either side.
    """
    idx = text.index( term )
    start = max( 0, idx-n )
    end = min( len(text), idx+len(term)+n )
    return text[start:end]
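# e.g. get_context( "the quick brown fox", "quick", 4 ) returns "the quick bro"
# (4 characters of context each side, clamped at the ends of the string).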

def search_text_for_terms( text, search_terms ):
    """
    Search some text for a set of search_terms.
    """
    has_hits = False
    for search_term in search_terms:
        if search_term in text:
            has_hits = True
            print ">> Found search term: %s" % search_term
            # print text
            context = get_context( text, search_term, 30 )
            print ">>> ...%s..." % context
    return has_hits
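# e.g. search_text_for_terms( u"a lovely cot in the corner", ["cot", "crib"] )
# prints the "cot" hit with 30 characters of context and returns True.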

def search_reviews( listing_data, search_terms ):
    """
    Search through reviews contained in listing_data for search_terms.
    """
    has_hits = False
    try:
        reviews = listing_data[ "bootstrapData" ][ "listing" ][ "sorted_reviews" ]
        for review in reviews:
            comments = review[ "comments" ]
            if comments is not None:
                # search first, then or: `has_hits or search_...` would
                # short-circuit after the first hit and skip later reviews
                has_hits = search_text_for_terms( comments, search_terms ) or has_hits
                # print comments.encode('utf-8')
    except Exception as e:
        print_error( repr(e) )
        print_error( repr( listing_data ).encode('utf-8') )
    return has_hits

def search_description( listing_data, search_terms ):
    """
    Search through description fields contained in listing_data for search_terms.
    """
    has_hits = False
    try:
        sectioned_description = listing_data[ "bootstrapData" ][ "listing" ][ "sectioned_description" ]
        for k in sectioned_description:
            section = sectioned_description[k]
            if section is not None:
                try:
                    # search first to avoid short-circuiting (see search_reviews)
                    has_hits = search_text_for_terms( section, search_terms ) or has_hits
                    # print section.encode('utf-8')
                except Exception as e:
                    print_error( repr(e) )
                    print_error( section.encode('utf-8') )
    except Exception as e:
        print_error( repr(e) )
        print_error( repr( listing_data ).encode('utf-8') )
    return has_hits

def parse_args():
    """
    Parse commandline args using argparse.
    """
    parser = argparse.ArgumentParser(
        description='Crawl airbnb search results (descriptions and reviews) for keywords.'
    )
    parser.add_argument(
        'keywords',
        # '*' rather than '+' so the default list applies when no keywords are given
        nargs = '*',
        help = 'Keywords to search for in listings.',
        default = [ "kids", "baby", "cot", "babies" ]
    )
    parser.add_argument(
        '--search-url',
        "-u",
        help = 'URL for an airbnb search. For example: https://www.airbnb.com/s/New-York--NY--United-States',
        default = 'https://www.airbnb.com/s/New-York--NY--United-States'
    )
    return parser.parse_args()

if __name__ == '__main__':

    # parse commandline args
    args = parse_args()

    # set up some working vars
    search_terms = args.keywords
    base_url = args.search_url
    page_url = base_url
    host = urlparse.urlparse( base_url ).hostname
    i = 0
    items_offset = 0

    print "Crawling search results at %s for keywords: %s" % ( base_url, ",".join(search_terms) )

    while True:

        print "> Fetch search page #%d: %s\n" % ( i, page_url )

        # fetch search page text
        r = request_url( page_url )
        buf = r.text

        # get page, section & listing data
        listings = None
        page_data = None
        try:
            # pull JSON data from page
            page_data = get_bootstrap_data_for_hypernova_key( buf, "spaspabundlejs" )
            section = page_data["bootstrapData"]["reduxData"]["exploreTab"]["response"]["explore_tabs"][0]["sections"][0]
            listings = section["listings"]
            items_offset = page_data["bootstrapData"]["reduxData"]["exploreTab"]["response"]["explore_tabs"][0]["pagination_metadata"]["items_offset"]
        except Exception as e:
            print_error( repr(e) )
            if page_data is not None:
                # page_data is a dict, so repr() it rather than calling encode() on it
                print_error( repr( page_data ).encode('utf-8') )
            else:
                print_error( "HTTP response: %d %s" % (r.status_code, r.reason) )
                print_error( buf )

        if listings is not None:
            # cycle through listings
            for listing in listings:
                room_id = listing["listing"][ "id" ]
                listing_data = fetch_listing_data( room_id, airbnb_host=host )
                if search_description( listing_data, search_terms ):
                    print ">>>> Found search terms in description for room: %s" % room_url_for_id( room_id, airbnb_host=host )
                    print "\n"
                if search_reviews( listing_data, search_terms ):
                    print ">>>> Found search terms in reviews for room: %s" % room_url_for_id( room_id, airbnb_host=host )
                    print "\n"

        # find link to next search page
        soup = BeautifulSoup( buf, "html.parser" )
        link_next = soup.find( "link", rel="next" )
        if link_next:
            page_url = link_next.get( "href" )
        elif i < items_offset:
            # HACK - if we fail to get a next link, use items_offset to construct it
            page_url = "%s?section_offset=%d" % ( base_url, i+1 )
            print_error( "Failed to get next_link from page, constructed it: %s" % page_url )
        else:
            print_error( buf )
            break

        i += 1

    print "Done!"