Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python 3 script to find real estate listings of properties up for sale on zillow.com
from lxml import html
import requests
import unicodecsv as csv
import argparse
import json
def clean(text):
    """Collapse a list of text fragments into one whitespace-normalized string.

    Returns None when *text* is empty/falsy (e.g. an xpath that matched nothing).
    """
    if not text:
        return None
    joined = ' '.join(text)
    return ' '.join(joined.split())
def get_headers():
    """Return browser-like HTTP request headers.

    Mimicking a desktop Chrome browser reduces the chance of the request
    being rejected as an obvious bot.
    """
    return {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    }
def create_url(zipcode, filter):
    """Build the Zillow search URL for *zipcode*, sorted per *filter*.

    Recognized filters are "newest" and "cheapest"; anything else falls
    back to the default "Homes For You" search page.
    """
    sort_templates = {
        "newest": "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort",
        "cheapest": "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/",
    }
    default_template = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy"
    url = sort_templates.get(filter, default_template).format(zipcode)
    print(url)
    return url
def save_to_file(response):
    """Persist the raw HTML of *response* to ``response.html`` for debugging.

    The encoding is pinned to UTF-8: Zillow pages contain non-ASCII
    characters (e.g. the middle-dot separator), and relying on the
    platform default encoding raises UnicodeEncodeError on Windows.
    """
    with open("response.html", 'w', encoding='utf-8') as fp:
        fp.write(response.text)
def write_data_to_csv(data, zipcode=None):
    """Write scraped property rows to ``properties-<zipcode>.csv``.

    Args:
        data: iterable of dicts keyed by the fieldnames below.
        zipcode: zipcode used in the output filename.  Defaults to the
            module-level ``zipcode`` global, which the original code read
            implicitly -- kept for backward compatibility with existing
            call sites; "output" is used if neither is available.
    """
    if zipcode is None:
        # Original implementation silently relied on the CLI global.
        zipcode = globals().get("zipcode", "output")
    # Python 3's stdlib csv handles unicode natively, so the third-party
    # unicodecsv dependency is unnecessary here.  Text mode with
    # newline='' is required by the csv docs to avoid blank rows on Windows.
    import csv
    fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price',
                  'facts and features', 'real estate provider', 'url']
    with open("properties-%s.csv" % (zipcode), 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
def get_response(url):
    """Fetch *url* from zillow.com, retrying up to five times.

    Every attempt's body is written to ``response.html`` (via save_to_file)
    so a blocked or captcha'd response can be inspected offline.  Returns
    the successful Response, or None after five non-200 attempts.
    """
    for _attempt in range(5):
        response = requests.get(url, headers=get_headers())
        print("status code received:", response.status_code)
        # Save unconditionally: useful both for debugging failures and as
        # a record of the page that was actually parsed.
        save_to_file(response)
        if response.status_code == 200:
            return response
    return None
def get_data_from_json(raw_json_data):
    """Parse listings from the JSON embedded in Zillow's "type 2" A/B page.

    Args:
        raw_json_data: list of text nodes extracted from the
            mobileSearchPageStore <script> tag; the JSON payload is wrapped
            in HTML comment markers that must be stripped first.

    Returns:
        list of per-property dicts, or None when no/invalid JSON was found.
    """
    cleaned_data = clean(raw_json_data)
    if not cleaned_data:
        # The xpath matched nothing; previously clean() returned None and
        # the .replace() call below raised AttributeError.
        print("Invalid json")
        return None
    cleaned_data = cleaned_data.replace('<!--', "").replace("-->", "")
    try:
        json_data = json.loads(cleaned_data)
    except ValueError:
        print("Invalid json")
        return None
    properties_list = []
    # Chained .get() with `or {}` fallbacks guard against missing keys in
    # Zillow's frequently changing payload instead of raising
    # AttributeError mid-scrape when 'searchResults' or 'hdpData' is absent.
    search_results = (json_data.get('searchResults') or {}).get('listResults', [])
    for properties in search_results:
        property_info = (properties.get('hdpData') or {}).get('homeInfo') or {}
        bedrooms = properties.get('beds')
        bathrooms = properties.get('baths')
        area = properties.get('area')
        info = f'{bedrooms} bds, {bathrooms} ba ,{area} sqft'
        data = {'address': properties.get('addressWithZip'),
                'city': property_info.get('city'),
                'state': property_info.get('state'),
                'postal_code': property_info.get('zipcode'),
                'price': properties.get('price'),
                'facts and features': info,
                'real estate provider': properties.get('brokerName'),
                'url': properties.get('detailUrl'),
                'title': properties.get('statusText')}
        properties_list.append(data)
    return properties_list
def parse(zipcode, filter=None):
    """Scrape the first page of for-sale listings for *zipcode*.

    Zillow A/B tests two page layouts: a classic HTML card list and a page
    whose results live in an embedded JSON <script> tag; both are handled.

    Args:
        zipcode: zipcode (or free-form location string) to search.
        filter: optional sort order ("newest" or "cheapest").

    Returns:
        list of property dicts, or None when the page could not be fetched
        (a saved copy is left in ``response.html`` for inspection).
    """
    url = create_url(zipcode, filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    if not search_results:
        # identified as type 2 page
        print("parsing from json data")
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)
    print("parsing from html page")
    properties_list = []
    for result in search_results:
        raw_address = result.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = result.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = result.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = result.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = result.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = result.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = result.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        # renamed from `url` in the original, which shadowed the page URL
        url_parts = result.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = result.xpath(".//h4//text()")
        # clean() returns None when the xpath matched nothing, so guard
        # before .replace() -- previously an info-less card crashed the run.
        info = clean(raw_info)
        if info:
            info = info.replace(u"\xb7", ',')
        property_url = "https://www.zillow.com" + url_parts[0] if url_parts else None
        # keep only cards explicitly flagged "for sale" (skips sold/ad cards)
        is_forsale = result.xpath('.//span[@class="zsg-icon-for-sale"]')
        if is_forsale:
            properties_list.append({'address': clean(raw_address),
                                    'city': clean(raw_city),
                                    'state': clean(raw_state),
                                    'postal_code': clean(raw_postal_code),
                                    'price': clean(raw_price),
                                    'facts and features': info,
                                    'real estate provider': clean(raw_broker_name),
                                    'url': property_url,
                                    'title': clean(raw_title)})
    return properties_list
if __name__ == "__main__":
    # Command-line entry point: required zipcode, optional sort order.
    cli = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    cli.add_argument('zipcode', help='')
    sortorder_help = """
available sort orders are :
newest : Latest property details,
cheapest : Properties with cheapest price
"""
    cli.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You')
    args = cli.parse_args()
    zipcode = args.zipcode
    sort = args.sort
    print("Fetching data for %s" % (zipcode))
    scraped_data = parse(zipcode, sort)
    if scraped_data:
        print("Writing data to output file")
        write_data_to_csv(scraped_data)
@edwardrusch3

This comment has been minimized.

Copy link

commented Jun 28, 2018

I continue to get blank data returned in the csv even though the script is giving a 200? Any idea?

@salimoha

This comment has been minimized.

Copy link

commented Aug 11, 2018

Thanks for your code. I have two questions: 1) How can I get the Zestimate price? 2) What is "url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
"
I want to get the list of houses from loopnet. What should I put instead of the above url?
Thanks

@NovTangoPapa

This comment has been minimized.

Copy link

commented Sep 27, 2018

I continue to get blank data returned in the csv even though the script is giving a 200? Any idea?

I am having this same issue.

Edit:

It looks like it has to do with when you try to use a parsing argument (newest/cheapest). I am guessing that the associated links are bad.

Another Edit:

Doesn't look like the links.

@gumdropsteve

This comment has been minimized.

Copy link

commented Oct 2, 2018

Hi, newer to Python, active Realtor® in SF Bay Area. End goal is to have this running on my site, but starting with it as is to better understand.

What would I need to change to have this execute in PyCharm (Windows 10)? Thanks.

@andresmillang

This comment has been minimized.

Copy link

commented Nov 15, 2018

Hi, reason for the blank data is recaptcha

@keddisa

This comment has been minimized.

Copy link

commented Nov 16, 2018

I keep getting this error message, can someone help?

usage: ipykernel_launcher.py [-h] zipcode [sort]
ipykernel_launcher.py: error: unrecognized arguments: -f

An exception has occurred, use %tb to see the full traceback.

SystemExit: 2

C:\Users\keddi\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2969: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.
warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)

@guyamir

This comment has been minimized.

Copy link

commented Dec 23, 2018

I keep getting this error message, can someone help?

usage: ipykernel_launcher.py [-h] zipcode [sort]
ipykernel_launcher.py: error: unrecognized arguments: -f

An exception has occurred, use %tb to see the full traceback.

SystemExit: 2

C:\Users\keddi\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2969: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.
warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)

save the file and run it from the terminal with a zipcode. For instance:

~$ python3 zillow.py 10118

@wwolfgang11

This comment has been minimized.

Copy link

commented Jan 2, 2019

I get the error "ZillowError: HTTPSConnectionPool(host='www.zillow.com', port=443): Max retries exceeded with url: /webservice/GetDeepSearchResults.htm?zws-id=X1-ZWz1gtmiat11xn_7ew1d&address=3400+Pacific+Ave.%2C+Marina+Del+Rey%2C+CA&citystatezip=90292 (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))"
any idea how to fix this?

@philxiao

This comment has been minimized.

Copy link

commented Jan 4, 2019

This would generate empty result as-is due to Zillow's implementation of recaptcha. For people who would like to use this script, I suggest you use the crawl proxy to bypass recaptcha. https://proxycrawl.com/ Once you have that, simply prefix the zillow url with the proxycrawl endpoint provided in your dashboard.

@corpulent

This comment has been minimized.

Copy link

commented Jan 14, 2019

Hi, newer to Python, active Realtor® in SF Bay Area. End goal is to have this running on my site, but starting with it as is to better understand.

What would I need to change to have this execute in PyCharm (Windows 10)? Thanks.

@gumdropsteve are you a realtor? I might be able to help.

@sarojrout

This comment has been minimized.

Copy link

commented Feb 12, 2019

I continue to get blank data returned in the csv even though the script is giving a 200? Any idea?

I am also getting blank. did you get any solution?

@sebasslash

This comment has been minimized.

Copy link

commented Mar 14, 2019

It works for me if you don't pass the second argument. So:

python main.py zipcode

Seems like the second argument is causing the scraper to yield no results...

@JCP281

This comment has been minimized.

Copy link

commented Mar 28, 2019

Can someone clue me in as to why the results always cap at 200? Is Zillow limiting the return? Seems no matter what zipcode I try, only 200 results ever come back. No errors or anything.

@ghost

This comment has been minimized.

Copy link
Owner Author

commented May 16, 2019

Can someone clue me in as to why the results always cap at 200? Is Zillow limiting the return? Seems no matter what zipcode I try, only 200 results ever come back. No errors or anything.

200 is not the result count. It's the HTTP status code received from zillow.com. The script fetches only the first page of results.

@rahb3rt

This comment has been minimized.

Copy link

commented Jun 10, 2019

Can someone clue me in as to why the results always cap at 200? Is Zillow limiting the return? Seems no matter what zipcode I try, only 200 results ever come back. No errors or anything.

200 is not the result count. It's the HTTP status code received from zillow.com. The script fetches only the first page of results.

any way of getting all the pages for the given result?

@benyamin-persia

This comment has been minimized.

Copy link

commented Jun 15, 2019

Awesome code!
Please let me know how to fetch "contactPhone",
or what is the contactPhone's XPath?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.