Skip to content

Instantly share code, notes, and snippets.

@tishmen
Created August 26, 2015 01:49
import json
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from urllib import quote
from scrapertools import do_request, dump_scrape, set_no_cache
from scrapertools.geocoding import GeocodeApi
from scrapertools.timestamphelpers import get_timestamp, format_timestamp
# Scraper-framework configuration constants (consumed by the runner, not read
# directly in this module -- presumed; verify against scrapertools docs).
url = 'airbnb.com'
# NOTE(review): 84600 seconds is ~23.5 hours -- not a week (604800 s) and not
# quite a day (86400 s). Either the value or the "# week" comment is wrong;
# confirm the intended refresh interval before relying on it.
refreshtime = 84600 # week
replaceall = True
# ISO 3166-1 alpha-2 country codes to geocode into search viewports.
# Fixed scrape-mangled entries: ';BO' -> 'BO', and the single-letter entries
# ('A', 'C', 'G', 'H', 'L', 'M', 'N', 'R', 'V') which by alphabetical position
# were the *U codes with the trailing 'U' dropped (AU, CU, GU, HU, LU, MU,
# NU, RU, VU).
country_codes = [
    'AF', 'AX', 'AL', 'DZ', 'AS', 'AD', 'AO', 'AI', 'AQ', 'AG', 'AR', 'AM',
    'AW', 'AU', 'AT', 'AZ', 'BS', 'BH', 'BD', 'BB', 'BY', 'BE', 'BZ', 'BJ',
    'BM', 'BT', 'BO', 'BA', 'BW', 'BV', 'BR', 'VG', 'IO', 'BN', 'BG', 'BF',
    'BI', 'KH', 'CM', 'CA', 'CV', 'KY', 'CF', 'TD', 'CL', 'CN', 'HK', 'MO',
    'CX', 'CC', 'CO', 'KM', 'CG', 'CD', 'CK', 'CR', 'CI', 'HR', 'CU', 'CY',
    'CZ', 'DK', 'DJ', 'DM', 'DO', 'EC', 'EG', 'SV', 'GQ', 'ER', 'EE', 'ET',
    'FK', 'FO', 'FJ', 'FI', 'FR', 'GF', 'PF', 'TF', 'GA', 'GM', 'GE', 'DE',
    'GH', 'GI', 'GR', 'GL', 'GD', 'GP', 'GU', 'GT', 'GG', 'GN', 'GW', 'GY',
    'HT', 'HM', 'VA', 'HN', 'HU', 'IS', 'IN', 'ID', 'IR', 'IQ', 'IE', 'IM',
    'IL', 'IT', 'JM', 'JP', 'JE', 'JO', 'KZ', 'KE', 'KI', 'KP', 'KR', 'KW',
    'KG', 'LA', 'LV', 'LB', 'LS', 'LR', 'LY', 'LI', 'LT', 'LU', 'MK', 'MG',
    'MW', 'MY', 'MV', 'ML', 'MT', 'MH', 'MQ', 'MR', 'MU', 'YT', 'MX', 'FM',
    'MD', 'MC', 'MN', 'ME', 'MS', 'MA', 'MZ', 'MM', 'NA', 'NR', 'NP', 'NL',
    'AN', 'NC', 'NZ', 'NI', 'NE', 'NG', 'NU', 'NF', 'MP', 'NO', 'OM', 'PK',
    'PW', 'PS', 'PA', 'PG', 'PY', 'PE', 'PH', 'PN', 'PL', 'PT', 'PR', 'QA',
    'RE', 'RO', 'RU', 'RW', 'BL', 'SH', 'KN', 'LC', 'MF', 'PM', 'VC', 'WS',
    'SM', 'ST', 'SA', 'SN', 'RS', 'SC', 'SL', 'SG', 'SK', 'SI', 'SB', 'SO',
    'ZA', 'GS', 'SS', 'ES', 'LK', 'SD', 'SR', 'SJ', 'SZ', 'SE', 'CH', 'SY',
    'TW', 'TJ', 'TZ', 'TH', 'TL', 'TG', 'TK', 'TO', 'TT', 'TN', 'TR', 'TM',
    'TC', 'TV', 'UG', 'UA', 'AE', 'GB', 'US', 'UM', 'UY', 'UZ', 'VU', 'VE',
    'VN', 'VI', 'WF', 'EH', 'YE', 'ZM', 'ZW'
]
def get_date_range():
    """Return the search window: (now, now + 7 days) as naive datetimes."""
    today = datetime.now()
    return today, today + timedelta(days=7)
def get_time(dt):
    """Convert a datetime into the framework's formatted timestamp string."""
    return format_timestamp(get_timestamp(dt))
def get_viewport(country_code):
    """Geocode a country code into its bounding viewport.

    Returns a dict with 'ne_lat', 'ne_lng', 'sw_lat', 'sw_lng' coordinates,
    or None when the geocoder yields no usable result.
    """
    try:
        data = GeocodeApi(country_code).json_response
        viewport = data['results'][0]['geometry']['viewport']
        return {
            'ne_lat': viewport['northeast']['lat'],
            'ne_lng': viewport['northeast']['lng'],
            'sw_lat': viewport['southwest']['lat'],
            'sw_lng': viewport['southwest']['lng'],
        }
    except (IndexError, KeyError):
        # Originally only IndexError was caught: a response without a
        # 'results' key (or with unexpected nesting) raised KeyError and
        # aborted the whole scrape. Treat both cases as "no viewport".
        return None
def uniquify_votes(votes):
    """De-duplicate votes by 'sourceid', keeping the last occurrence."""
    by_source = {}
    for vote in votes:
        by_source[vote['sourceid']] = vote
    return by_source.values()
def parse_votes(soup, starttime, stoptime):
    """Extract one vote dict per room listing from search-result markup.

    Reads the data-* attributes and child elements of each listing div and
    stamps every vote with the supplied start/stop times.
    """
    extracted = []
    for listing in soup.select('.row .col-sm-12 > div'):
        room_id = listing['data-id']
        extracted.append({
            'sourceid': room_id,
            'comment': listing['data-name'] + ' #rent',
            'address': listing.find(class_='address').string,
            'lat': float(listing['data-lat']),
            'lon': float(listing['data-lng']),
            'vote': 1,
            'url': 'https://www.airbnb.com/rooms/' + room_id,
            'image': listing.find(class_='img-responsive-height')['src'],
            'starttime': starttime,
            'stoptime': stoptime,
        })
    return extracted
def scrape_votes(url, start, stop):
    """Fetch every result page for one search URL and return all votes.

    Follows the rel="next" link in the pagination footer until no further
    page exists.
    """
    starttime = get_time(start)
    stoptime = get_time(stop)
    collected = []
    page_param = '&page=1'
    while True:
        payload = json.loads(do_request(url + page_param))
        result_soup = BeautifulSoup(payload['results'], 'html.parser')
        collected.extend(parse_votes(result_soup, starttime, stoptime))
        footer_soup = BeautifulSoup(
            payload['pagination_footer'], 'html.parser'
        )
        next_links = footer_soup.select('a[rel="next"]')
        if not next_links:
            # No "next" link: this was the last page.
            break
        page_param = '&page=' + next_links[0].string
    return collected
def scrape():
    """Scrape week-ahead Airbnb listings for every country and dump them.

    Builds a geocoded-viewport search URL per country code, collects all
    paginated results, de-duplicates by source id, and hands the votes to
    the framework's dump routine.
    """
    set_no_cache()
    start, stop = get_date_range()
    search_template = (
        'https://www.airbnb.com/search/search_results?checkin={}&chec'
        'kout={}&sw_lat={}&sw_lng={}&ne_lat={}&ne_lng={}'
    )
    all_votes = []
    for code in country_codes:
        viewport = get_viewport(code)
        if not viewport:
            # Geocoding failed for this country; skip it.
            continue
        search_url = search_template.format(
            quote(start.strftime('%m/%d/%Y')),
            quote(stop.strftime('%m/%d/%Y')),
            viewport['sw_lat'], viewport['sw_lng'],
            viewport['ne_lat'], viewport['ne_lng'],
        )
        all_votes.extend(scrape_votes(search_url, start, stop))
    return dump_scrape(uniquify_votes(all_votes))
# Allow running the scraper directly as a script (outside the framework).
if __name__ == '__main__':
    print(scrape())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment