import requests
import urllib
import json

ZIP_CODES = [
    '11211'
]

def get_rect_from_zip(session, zip_code):
    """
    Given a zipcode, retrieve a coordinate bounding box for use with Zillow Search.
    """
    url = "http://www.zillow.com/search/RealEstateSearch.htm?" + str(zip_code)
    r = session.get(url)
    cookie = r.headers['Set-Cookie'].split(';')[0]
    cookie_decoded = urllib.unquote(cookie).decode('utf8')
    rect = urllib.unquote(cookie_decoded.split('rect=')[-1]).split('&')[0]
    rect = [str(float(coord) * 1e6).split('.')[0] for coord in rect.split(',')]
    rect = [rect[3], rect[2], rect[1], rect[0]]
    return ",".join(rect)

def scrape_rect(session, page, rect):
    """
    Given a coordinate bounding box, get rental results.
    """
url = "http://www.zillow.com/search/GetResults.htm?spt=homes&status=000010<=000000&ht=111101&pr=,&mp=,&bd=0%2C&ba=0%2C&sf=,&lot=0%2C&yr=,&singlestory=0&hoa=0%2C&pho=0&pets=0&parking=0&laundry=0&income-restricted=0&pnd=0&red=0&zso=0&days=any&ds=all&pmf=0&pf=0&sch=100111&sort=days&search=maplist&disp=1&rid=62022&rt=7&listright=true&photoCardsEnabled=true&isMapSearch=true&zoom=15" | |
url += "&rect={0}&p={1}".format(rect, page) | |
r = session.get(url) | |
return r.json() | |

def get_num_pages_from_rect_data(data):
    """
    Determine the number of pages in the result set for a given coordinate bounding box.
    """
    return int(data['list']['numPages'])

def parse_rect_data(data):
    """
    Extract the list of buildings from the result set.
    """
    return data['map']['buildings']
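
# For reference, the two helpers above only assume this rough shape for the
# GetResults JSON; everything else in the payload is ignored and may differ:
#
#   {"list": {"numPages": ..., ...}, "map": {"buildings": [...], ...}}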

def scrape_zip_code(session, zip_code):
    """
    Steps:
    1. Fetch the bounding box for a zip code.
    2. Fetch the first page of results.
    3. Paginate through the remaining pages.
    4. Return the data as a list of buildings.
    """
    rect = get_rect_from_zip(session, zip_code)
    print "Fetching page 1 for zip code {}".format(zip_code)
    data = scrape_rect(session, 1, rect)
    num_pages = get_num_pages_from_rect_data(data)
    print "Found {} pages for zip code {}".format(num_pages, zip_code)
    # for testing purposes, set num_pages to 10
    num_pages = 10
    # parse the first page into the list we'll append to
    data = parse_rect_data(data)
    # fetch the remaining pages
    for page in range(2, num_pages + 1):
        print "Fetching page {} for zip code {}".format(page, zip_code)
        page_data = parse_rect_data(scrape_rect(session, page, rect))
        data.extend(page_data)
    return data

def scrape_zip_codes(zip_codes):
    """
    Scrape multiple zip codes.
    """
    session = requests.Session()
    data = []
    for zc in zip_codes:
        data.extend(scrape_zip_code(session, zc))
    return data

if __name__ == '__main__':
    # run the scraper and write the results to a json file.
    output = scrape_zip_codes(ZIP_CODES)
    print "Outputting {} records to output.json".format(len(output))
    with open('output.json', 'wb') as f:
        f.write(json.dumps(output))
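
# A quick way to inspect the scraped records afterwards (sketch only; the
# exact fields per building depend on Zillow's response and may vary):
#
#   import json
#   with open('output.json') as f:
#       buildings = json.load(f)
#   print len(buildings)
#   print buildings[0].keys()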