tishmen/airbnb.py

## airbnb.py
import json
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from urllib import quote

from scrapertools import do_request, dump_scrape, set_no_cache
from scrapertools.geocoding import GeocodeApi
from scrapertools.timestamphelpers import get_timestamp, format_timestamp


url = 'airbnb.com'
refreshtime = 84600  # week
replaceall = True
country_codes = [
    'AF', 'AX', 'AL', 'DZ', 'AS', 'AD', 'AO', 'AI', 'AQ', 'AG', 'AR', 'AM',
    'AW', 'A', 'AT', 'AZ', 'BS', 'BH', 'BD', 'BB', 'BY', 'BE', 'BZ', 'BJ',
    'BM', 'BT', ';BO', 'BA', 'BW', 'BV', 'BR', 'VG', 'IO', 'BN', 'BG', 'BF',
    'BI', 'KH', 'CM', 'CA', 'CV', 'KY', 'CF', 'TD', 'CL', 'CN', 'HK', 'MO',
    'CX', 'CC', 'CO', 'KM', 'CG', 'CD', 'CK', 'CR', 'CI', 'HR', 'C', 'CY',
    'CZ', 'DK', 'DJ', 'DM', 'DO', 'EC', 'EG', 'SV', 'GQ', 'ER', 'EE', 'ET',
    'FK', 'FO', 'FJ', 'FI', 'FR', 'GF', 'PF', 'TF', 'GA', 'GM', 'GE', 'DE',
    'GH', 'GI', 'GR', 'GL', 'GD', 'GP', 'G', 'GT', 'GG', 'GN', 'GW', 'GY',
    'HT', 'HM', 'VA', 'HN', 'H', 'IS', 'IN', 'ID', 'IR', 'IQ', 'IE', 'IM',
    'IL', 'IT', 'JM', 'JP', 'JE', 'JO', 'KZ', 'KE', 'KI', 'KP', 'KR', 'KW',
    'KG', 'LA', 'LV', 'LB', 'LS', 'LR', 'LY', 'LI', 'LT', 'L', 'MK', 'MG',
    'MW', 'MY', 'MV', 'ML', 'MT', 'MH', 'MQ', 'MR', 'M', 'YT', 'MX', 'FM',
    'MD', 'MC', 'MN', 'ME', 'MS', 'MA', 'MZ', 'MM', 'NA', 'NR', 'NP', 'NL',
    'AN', 'NC', 'NZ', 'NI', 'NE', 'NG', 'N', 'NF', 'MP', 'NO', 'OM', 'PK',
    'PW', 'PS', 'PA', 'PG', 'PY', 'PE', 'PH', 'PN', 'PL', 'PT', 'PR', 'QA',
    'RE', 'RO', 'R', 'RW', 'BL', 'SH', 'KN', 'LC', 'MF', 'PM', 'VC', 'WS',
    'SM', 'ST', 'SA', 'SN', 'RS', 'SC', 'SL', 'SG', 'SK', 'SI', 'SB', 'SO',
    'ZA', 'GS', 'SS', 'ES', 'LK', 'SD', 'SR', 'SJ', 'SZ', 'SE', 'CH', 'SY',
    'TW', 'TJ', 'TZ', 'TH', 'TL', 'TG', 'TK', 'TO', 'TT', 'TN', 'TR', 'TM',
    'TC', 'TV', 'UG', 'UA', 'AE', 'GB', 'US', 'UM', 'UY', 'UZ', 'V', 'VE',
    'VN', 'VI', 'WF', 'EH', 'YE', 'ZM', 'ZW'
]


def get_date_range():
    now = datetime.now()
    next_week = now + timedelta(days=7)
    return now, next_week


def get_time(dt):
    timestamp = get_timestamp(dt)
    return format_timestamp(timestamp)


def get_viewport(country_code):
    try:
        data = GeocodeApi(country_code).json_response
        results = data['results'][0]
        ne_lat = results['geometry']['viewport']['northeast']['lat']
        ne_lng = results['geometry']['viewport']['northeast']['lng']
        sw_lat = results['geometry']['viewport']['southwest']['lat']
        sw_lng = results['geometry']['viewport']['southwest']['lng']
        return {
            'ne_lat': ne_lat, 'ne_lng': ne_lng,
            'sw_lat': sw_lat, 'sw_lng': sw_lng,
        }
    except IndexError:
        pass


def uniquify_votes(votes):
    return dict((vote['sourceid'], vote) for vote in votes).values()


def parse_votes(soup, starttime, stoptime):
    votes = []
    for entry in soup.select('.row .col-sm-12 > div'):
        sourceid = entry['data-id']
        comment = entry['data-name'] + ' #rent'
        address = entry.find(class_='address').string
        lat = float(entry['data-lat'])
        lon = float(entry['data-lng'])
        event_url = 'https://www.airbnb.com/rooms/' + sourceid
        image = entry.find(class_='img-responsive-height')['src']
        vote = {
            'sourceid': sourceid,
            'comment': comment,
            'address': address,
            'lat': lat,
            'lon': lon,
            'vote': 1,
            'url': event_url,
            'image': image,
            'starttime': starttime,
            'stoptime': stoptime,
        }
        votes.append(vote)
    return votes


def scrape_votes(url, start, stop):
    votes = []
    starttime = get_time(start)
    stoptime = get_time(stop)
    page = '&page=1'
    while True:
        response = do_request(url + page)
        data = json.loads(response)
        result_soup = BeautifulSoup(data['results'], 'html.parser')
        votes.extend(parse_votes(result_soup, starttime, stoptime))
        pagination_soup = BeautifulSoup(
            data['pagination_footer'], 'html.parser'
        )
        try:
            page = '&page=' + pagination_soup.select('a[rel="next"]')[0].string
        except IndexError:
            break
    return votes


def scrape():
    set_no_cache()
    votes = []
    start, stop = get_date_range()
    start_url = 'https://www.airbnb.com/search/search_results?checkin={}&chec'\
        'kout={}&sw_lat={}&sw_lng={}&ne_lat={}&ne_lng={}'
    for country_code in country_codes:
        vp = get_viewport(country_code)
        if vp:
            url = start_url.format(
                quote(start.strftime('%m/%d/%Y')),
                quote(stop.strftime('%m/%d/%Y')),
                vp['sw_lat'], vp['sw_lng'], vp['ne_lat'], vp['ne_lng']
            )
            votes.extend(scrape_votes(url, start, stop))
    votes = uniquify_votes(votes)
    return dump_scrape(votes)


if __name__ == '__main__':
    print(scrape())
	import json
	from bs4 import BeautifulSoup
	from datetime import datetime, timedelta
	from urllib import quote

	from scrapertools import do_request, dump_scrape, set_no_cache
	from scrapertools.geocoding import GeocodeApi
	from scrapertools.timestamphelpers import get_timestamp, format_timestamp


	url = 'airbnb.com'
	refreshtime = 84600 # week
	replaceall = True
	country_codes = [
	'AF', 'AX', 'AL', 'DZ', 'AS', 'AD', 'AO', 'AI', 'AQ', 'AG', 'AR', 'AM',
	'AW', 'A', 'AT', 'AZ', 'BS', 'BH', 'BD', 'BB', 'BY', 'BE', 'BZ', 'BJ',
	'BM', 'BT', ';BO', 'BA', 'BW', 'BV', 'BR', 'VG', 'IO', 'BN', 'BG', 'BF',
	'BI', 'KH', 'CM', 'CA', 'CV', 'KY', 'CF', 'TD', 'CL', 'CN', 'HK', 'MO',
	'CX', 'CC', 'CO', 'KM', 'CG', 'CD', 'CK', 'CR', 'CI', 'HR', 'C', 'CY',
	'CZ', 'DK', 'DJ', 'DM', 'DO', 'EC', 'EG', 'SV', 'GQ', 'ER', 'EE', 'ET',
	'FK', 'FO', 'FJ', 'FI', 'FR', 'GF', 'PF', 'TF', 'GA', 'GM', 'GE', 'DE',
	'GH', 'GI', 'GR', 'GL', 'GD', 'GP', 'G', 'GT', 'GG', 'GN', 'GW', 'GY',
	'HT', 'HM', 'VA', 'HN', 'H', 'IS', 'IN', 'ID', 'IR', 'IQ', 'IE', 'IM',
	'IL', 'IT', 'JM', 'JP', 'JE', 'JO', 'KZ', 'KE', 'KI', 'KP', 'KR', 'KW',
	'KG', 'LA', 'LV', 'LB', 'LS', 'LR', 'LY', 'LI', 'LT', 'L', 'MK', 'MG',
	'MW', 'MY', 'MV', 'ML', 'MT', 'MH', 'MQ', 'MR', 'M', 'YT', 'MX', 'FM',
	'MD', 'MC', 'MN', 'ME', 'MS', 'MA', 'MZ', 'MM', 'NA', 'NR', 'NP', 'NL',
	'AN', 'NC', 'NZ', 'NI', 'NE', 'NG', 'N', 'NF', 'MP', 'NO', 'OM', 'PK',
	'PW', 'PS', 'PA', 'PG', 'PY', 'PE', 'PH', 'PN', 'PL', 'PT', 'PR', 'QA',
	'RE', 'RO', 'R', 'RW', 'BL', 'SH', 'KN', 'LC', 'MF', 'PM', 'VC', 'WS',
	'SM', 'ST', 'SA', 'SN', 'RS', 'SC', 'SL', 'SG', 'SK', 'SI', 'SB', 'SO',
	'ZA', 'GS', 'SS', 'ES', 'LK', 'SD', 'SR', 'SJ', 'SZ', 'SE', 'CH', 'SY',
	'TW', 'TJ', 'TZ', 'TH', 'TL', 'TG', 'TK', 'TO', 'TT', 'TN', 'TR', 'TM',
	'TC', 'TV', 'UG', 'UA', 'AE', 'GB', 'US', 'UM', 'UY', 'UZ', 'V', 'VE',
	'VN', 'VI', 'WF', 'EH', 'YE', 'ZM', 'ZW'
	]


	def get_date_range():
	now = datetime.now()
	next_week = now + timedelta(days=7)
	return now, next_week


	def get_time(dt):
	timestamp = get_timestamp(dt)
	return format_timestamp(timestamp)


	def get_viewport(country_code):
	try:
	data = GeocodeApi(country_code).json_response
	results = data['results'][0]
	ne_lat = results['geometry']['viewport']['northeast']['lat']
	ne_lng = results['geometry']['viewport']['northeast']['lng']
	sw_lat = results['geometry']['viewport']['southwest']['lat']
	sw_lng = results['geometry']['viewport']['southwest']['lng']
	return {
	'ne_lat': ne_lat, 'ne_lng': ne_lng,
	'sw_lat': sw_lat, 'sw_lng': sw_lng,
	}
	except IndexError:
	pass


	def uniquify_votes(votes):
	return dict((vote['sourceid'], vote) for vote in votes).values()


	def parse_votes(soup, starttime, stoptime):
	votes = []
	for entry in soup.select('.row .col-sm-12 > div'):
	sourceid = entry['data-id']
	comment = entry['data-name'] + ' #rent'
	address = entry.find(class_='address').string
	lat = float(entry['data-lat'])
	lon = float(entry['data-lng'])
	event_url = 'https://www.airbnb.com/rooms/' + sourceid
	image = entry.find(class_='img-responsive-height')['src']
	vote = {
	'sourceid': sourceid,
	'comment': comment,
	'address': address,
	'lat': lat,
	'lon': lon,
	'vote': 1,
	'url': event_url,
	'image': image,
	'starttime': starttime,
	'stoptime': stoptime,
	}
	votes.append(vote)
	return votes


	def scrape_votes(url, start, stop):
	votes = []
	starttime = get_time(start)
	stoptime = get_time(stop)
	page = '&page=1'
	while True:
	response = do_request(url + page)
	data = json.loads(response)
	result_soup = BeautifulSoup(data['results'], 'html.parser')
	votes.extend(parse_votes(result_soup, starttime, stoptime))
	pagination_soup = BeautifulSoup(
	data['pagination_footer'], 'html.parser'
	)
	try:
	page = '&page=' + pagination_soup.select('a[rel="next"]')[0].string
	except IndexError:
	break
	return votes


	def scrape():
	set_no_cache()
	votes = []
	start, stop = get_date_range()
	start_url = 'https://www.airbnb.com/search/search_results?checkin={}&chec'\
	'kout={}&sw_lat={}&sw_lng={}&ne_lat={}&ne_lng={}'
	for country_code in country_codes:
	vp = get_viewport(country_code)
	if vp:
	url = start_url.format(
	quote(start.strftime('%m/%d/%Y')),
	quote(stop.strftime('%m/%d/%Y')),
	vp['sw_lat'], vp['sw_lng'], vp['ne_lat'], vp['ne_lng']
	)
	votes.extend(scrape_votes(url, start, stop))
	votes = uniquify_votes(votes)
	return dump_scrape(votes)


	if __name__ == '__main__':
	print(scrape())