Skip to content

Instantly share code, notes, and snippets.

@abelsonlive
Created August 20, 2016 01:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save abelsonlive/d2ca349beb96d19c1269bcdead5ca6a5 to your computer and use it in GitHub Desktop.
Save abelsonlive/d2ca349beb96d19c1269bcdead5ca6a5 to your computer and use it in GitHub Desktop.
import requests
import urllib
import json
# Zip codes the script entry point hands to scrape_zip_codes().
ZIP_CODES = ['11211']
def get_rect_from_zip(session, zip_code):
    """
    Given a zipcode, retrieve a coordinate bounding box for use with Zillow Search.

    Requests the Zillow search page for ``zip_code`` and reads the ``rect=``
    value out of the first ``;``-separated segment of the response's
    ``Set-Cookie`` header. The comma-separated decimal values are scaled by
    1e6, rounded to integers, and the first four are returned in reverse
    order as a single comma-joined string.

    Args:
        session: object exposing ``get(url)`` whose result has a ``headers``
            mapping (the script passes a ``requests.Session``).
        zip_code: zip code to look up; converted with ``str()``.

    Returns:
        A string of four comma-separated integers.

    Raises:
        KeyError: the response carries no ``Set-Cookie`` header.
        ValueError: the cookie does not hold at least four numeric
            ``rect`` values.
    """
    # Function-scope import so the lookup works on both Python 2 and 3.
    try:
        from urllib.parse import unquote  # Python 3
    except ImportError:
        from urllib import unquote  # Python 2

    url = "http://www.zillow.com/search/RealEstateSearch.htm?" + str(zip_code)
    response = session.get(url)

    # NOTE(review): only the first cookie segment is inspected; presumably
    # Zillow puts the search cookie first -- confirm against a live response.
    cookie = response.headers['Set-Cookie'].split(';')[0]

    # The rect value is percent-encoded twice, hence the two unquote passes.
    decoded = unquote(cookie)
    if isinstance(decoded, bytes):
        # Python 2 returns a byte string here; Python 3 already returns text.
        decoded = decoded.decode('utf8')
    raw_rect = unquote(decoded.split('rect=')[-1]).split('&')[0]

    parts = raw_rect.split(',')
    if len(parts) < 4:
        raise ValueError(
            "Could not find a 4-value rect in cookie for zip code {0}: {1!r}"
            .format(zip_code, raw_rect)
        )

    # Round rather than truncate: the float product can land a hair below
    # the intended integer, and truncation would then be off by one.
    scaled = [str(int(round(float(part) * 1e6))) for part in parts]
    return ",".join([scaled[3], scaled[2], scaled[1], scaled[0]])
def scrape_rect(session, page, rect):
    """
    Given a coordinate bounding box, get rental results.

    Appends ``rect`` and ``page`` to the fixed Zillow ``GetResults.htm``
    query string, fetches it, and returns the decoded JSON body.

    Args:
        session: object exposing ``get(url)`` (the script passes a
            ``requests.Session``).
        page: page number, sent as the ``p`` query parameter.
        rect: bounding-box string, sent as the ``rect`` query parameter.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
    """
    url = "http://www.zillow.com/search/GetResults.htm?spt=homes&status=000010&lt=000000&ht=111101&pr=,&mp=,&bd=0%2C&ba=0%2C&sf=,&lot=0%2C&yr=,&singlestory=0&hoa=0%2C&pho=0&pets=0&parking=0&laundry=0&income-restricted=0&pnd=0&red=0&zso=0&days=any&ds=all&pmf=0&pf=0&sch=100111&sort=days&search=maplist&disp=1&rid=62022&rt=7&listright=true&photoCardsEnabled=true&isMapSearch=true&zoom=15"
    url += "&rect={0}&p={1}".format(rect, page)
    response = session.get(url)
    # Fail with the HTTP status instead of an unrelated JSON decode error
    # when the server returns an error page.
    response.raise_for_status()
    return response.json()
def get_num_pages_from_rect_data(data):
    """
    Read the page count out of a search-results payload.

    Looks up ``data['list']['numPages']`` and returns it converted with
    ``int()``.
    """
    listing_info = data['list']
    page_count = listing_info['numPages']
    return int(page_count)
def parse_rect_data(data):
    """
    Pull the buildings entry out of a search-results payload.

    Returns ``data['map']['buildings']`` itself (not a copy).
    """
    map_section = data['map']
    return map_section['buildings']
def scrape_zip_code(session, zip_code, max_pages=10):
    """
    Steps:
    1. Fetch bounding box for a zip code.
    2. Fetch first page of results
    3. Paginate through the remaining results, up to ``max_pages``.
    4. Return Data as a list of buildings.

    Args:
        session: session object passed through to ``get_rect_from_zip``
            and ``scrape_rect``.
        zip_code: zip code to scrape.
        max_pages: upper bound on the number of pages fetched. Defaults to
            10, the limit this function previously hard-coded for testing;
            pass ``None`` to fetch every page the first response reports.

    Returns:
        A new list holding the buildings from every fetched page.
    """
    rect = get_rect_from_zip(session, zip_code)

    print("Fetching page 1 for zip code {}".format(zip_code))
    first_page = scrape_rect(session, 1, rect)
    num_pages = get_num_pages_from_rect_data(first_page)
    print("Found {} pages for zip code {}".format(num_pages, zip_code))

    # Never request pages beyond what the server reported; the cap only
    # ever lowers the page count.
    if max_pages is None:
        last_page = num_pages
    else:
        last_page = min(num_pages, max_pages)

    # Copy so the first response's own list is not mutated while appending.
    buildings = list(parse_rect_data(first_page))

    # fetch the remaining pages
    for page in range(2, last_page + 1):
        print("Fetching page {} for zip code {}".format(page, zip_code))
        buildings.extend(parse_rect_data(scrape_rect(session, page, rect)))
    return buildings
def scrape_zip_codes(zip_codes):
    """
    Scrape multiple zip codes

    Opens one ``requests.Session`` shared by all lookups, scrapes each zip
    code in turn with ``scrape_zip_code``, and closes the session when done.

    Args:
        zip_codes: iterable of zip codes.

    Returns:
        A single list with the buildings of every zip code, in input order.
    """
    data = []
    # The context manager closes the session even if a scrape raises.
    with requests.Session() as session:
        for zc in zip_codes:
            data.extend(scrape_zip_code(session, zc))
    return data
if __name__ == '__main__':
    # run the scraper and write the results to a json file.
    output = scrape_zip_codes(ZIP_CODES)
    print("Outputting {} records to output.json".format(len(output)))
    # Text mode: the JSON encoder produces str, which a file opened in
    # binary mode rejects on Python 3.
    with open('output.json', 'w') as f:
        json.dump(output, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment