Last active December 13, 2023 16:05
Python 3 script to find real estate listings of properties up for sale on Zillow.
from lxml import html
import requests
import unicodecsv as csv
import argparse
import json
def clean(text):
    """Collapse a list of text fragments into one whitespace-normalised string.

    Args:
        text: Iterable of strings, such as the list returned by an XPath
            ``//text()`` query.

    Returns:
        The fragments joined with single spaces, with every run of
        whitespace squeezed to one space and the ends trimmed. ``None`` when
        ``text`` is empty or ``None``.
    """
    if not text:
        return None
    # Join first, then split and re-join, so whitespace both inside and
    # between fragments is collapsed in a single pass.
    return ' '.join(' '.join(text).split())
def get_headers():
    """Return the browser-like HTTP request headers sent with every request.

    Returns:
        A new dict of header name -> value on each call, so callers may
        mutate it freely.
    """
    # Creating headers.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    }
    return headers
def create_url(zipcode, filter, base_url="https://www.zillow.com/homes/for_sale/"):
    """Build the search URL for ``zipcode`` according to the requested sort.

    Args:
        zipcode: Zip code to search, as a string.
        filter: ``"newest"`` or ``"cheapest"`` selects the matching sort
            order; any other value (including ``None``) yields the default
            listing URL.
        base_url: Prefix placed in front of the zip-code path. Pass ``""`` to
            get the bare relative path.
            NOTE(review): the path literals in this file carry no scheme or
            host, so a prefix is required for the result to be fetchable. The
            Zillow for-sale prefix used as the default is an assumption;
            confirm it.

    Returns:
        The URL as a string.
    """
    # Creating Zillow URL based on the filter.
    if filter == "newest":
        path = "{0}/0_singlestory/days_sort".format(zipcode)
    elif filter == "cheapest":
        path = "{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        # Fallback for every other sort value. Without this ``else`` the
        # default path would overwrite the two filtered paths above.
        path = "{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
    return base_url + path
def save_to_file(response):
    """Dump the text of ``response`` to ``response.html`` for debugging.

    Args:
        response: Object exposing the page body as a ``text`` string
            attribute (the callers in this file pass the result of
            ``requests.get``).
    """
    # saving response to `response.html`
    # An explicit UTF-8 encoding keeps the dump from failing with
    # UnicodeEncodeError on platforms whose default codec cannot represent
    # every character of the page.
    with open("response.html", 'w', encoding="utf-8") as fp:
        fp.write(response.text)
def write_data_to_csv(data):
    """Write the scraped property rows to ``properties-<zipcode>.csv``.

    Args:
        data: Iterable of dicts whose keys are the column names listed in
            ``fieldnames`` below.

    Note:
        The file name is built from the module-level ``zipcode`` variable,
        which the ``__main__`` section assigns before calling this function.
    """
    # saving scraped data to csv.
    fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
    # Binary mode is intentional: ``csv`` is ``unicodecsv`` in this file,
    # which expects a byte stream.
    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
def get_response(url):
    """Fetch ``url``, retrying until a 200 response arrives (five tries max).

    Args:
        url: Address to request; the headers from ``get_headers()`` are sent
            with every attempt.

    Returns:
        The first response whose status code is 200, or ``None`` when all
        five attempts return another status. Every non-200 response is
        written to ``response.html`` via ``save_to_file`` so the last failure
        can be inspected.
    """
    for _ in range(5):
        response = requests.get(url, headers=get_headers())
        print("status code received:", response.status_code)
        if response.status_code == 200:
            return response
        # saving response to file for debugging purpose.
        save_to_file(response)
    return None
def get_data_from_json(raw_json_data):
    """Extract property rows from the JSON embedded in the search page.

    Args:
        raw_json_data: List of text fragments taken from the page's script
            tag. The JSON may be wrapped in an HTML comment (``<!-- -->``).

    Returns:
        A list with one dict per listing, keyed by ``address``, ``city``,
        ``state``, ``postal_code``, ``price``, ``facts and features``,
        ``real estate provider``, ``url`` and ``title``. ``None`` when the
        input is empty or is not a valid JSON object (``"Invalid json"`` is
        printed in that case).
    """
    # getting data from json (type 2 of their A/B testing page)
    cleaned_data = clean(raw_json_data)
    if not cleaned_data:
        # ``clean`` returns None for an empty fragment list; there is
        # nothing to decode.
        print("Invalid json")
        return None
    cleaned_data = cleaned_data.replace('<!--', "").replace("-->", "")

    try:
        json_data = json.loads(cleaned_data)
    except ValueError:
        print("Invalid json")
        return None
    if not isinstance(json_data, dict):
        print("Invalid json")
        return None

    # The listings sit under 'searchResults', either at the top level or
    # nested one level down under 'cat1'; accept both layouts.
    container = json_data.get('cat1') or json_data
    search_results = (container.get('searchResults') or {}).get('listResults') or []

    properties_list = []
    for listing in search_results:
        # 'hdpData' / 'homeInfo' may be absent for a listing; fall back to an
        # empty mapping so the location fields simply come out as None.
        home_info = (listing.get('hdpData') or {}).get('homeInfo') or {}
        bedrooms = listing.get('beds')
        bathrooms = listing.get('baths')
        area = listing.get('area')
        info = f'{bedrooms} bds, {bathrooms} ba ,{area} sqft'
        properties_list.append({
            'address': listing.get('addressWithZip'),
            'city': home_info.get('city'),
            'state': home_info.get('state'),
            'postal_code': home_info.get('zipcode'),
            'price': listing.get('price'),
            'facts and features': info,
            'real estate provider': listing.get('brokerName'),
            'url': listing.get('detailUrl'),
            'title': listing.get('statusText'),
        })
    return properties_list
def parse(zipcode, filter=None):
    """Fetch the search page for ``zipcode`` and return its for-sale listings.

    Args:
        zipcode: Zip code to search.
        filter: Sort order forwarded to ``create_url``.

    Returns:
        A list of dicts (one per listing) keyed by ``address``, ``city``,
        ``state``, ``postal_code``, ``price``, ``facts and features``,
        ``real estate provider``, ``url`` and ``title``. When the page has no
        HTML result cards, the result of ``get_data_from_json`` is returned
        instead. ``None`` when the page could not be fetched.
    """
    url = create_url(zipcode, filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from")
        return None

    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    if not search_results:
        print("parsing from json data")
        # identified as type 2 page
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)

    print("parsing from html page")
    properties_list = []
    for card in search_results:
        raw_address = card.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = card.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = card.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = card.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = card.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = card.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = card.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        link = card.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = card.xpath(".//h4//text()")
        is_forsale = card.xpath('.//span[@class="zsg-icon-for-sale"]')

        # Cards without the for-sale icon are skipped.
        if not is_forsale:
            continue

        info = clean(raw_info)
        if info is not None:
            # ``clean`` returns None when the info span is missing, so only
            # swap the middle-dot separators when there is text to edit.
            info = info.replace(u"\xb7", ',')

        # NOTE(review): the href is used as-is; if the site emits relative
        # links a host prefix is needed here - confirm against a live page.
        property_url = link[0] if link else None

        properties_list.append({
            'address': clean(raw_address),
            'city': clean(raw_city),
            'state': clean(raw_state),
            'postal_code': clean(raw_postal_code),
            'price': clean(raw_price),
            'facts and features': info,
            'real estate provider': clean(raw_broker_name),
            'url': property_url,
            'title': clean(raw_title),
        })
    return properties_list
if __name__ == "__main__":
    # Command-line entry point: read the zip code and optional sort order,
    # scrape the listings, and write them to a CSV file.
    # Reading arguments
    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='')
    sortorder_help = """
available sort orders are :
newest : Latest property details,
cheapest : Properties with cheapest price
"""
    argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You')
    args = argparser.parse_args()
    # ``zipcode`` stays a module-level name because write_data_to_csv reads
    # it to build the output file name.
    zipcode = args.zipcode
    sort = args.sort
    print("Fetching data for %s" % (zipcode))
    scraped_data = parse(zipcode, sort)
    if scraped_data:
        print("Writing data to output file")
        write_data_to_csv(scraped_data)
JCP281 commented Mar 28, 2019

rahb3rt commented Jun 10, 2019

Awesome code!
Please let me know how to fetch "contactPhone",
or what the XPath for contactPhone is.

Copy link

zhuchangzhan commented Nov 6, 2019

Here is how to fix the code:

Add this line to the beginning import section:

from urllib.request import Request, urlopen

Then rewrite part of the "parse" function with the below code:

def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from")
        return None

    # These two new lines are added
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()

    #replace the parser to take input added above
    #parser = html.fromstring(response.text)
    parser = html.fromstring(webpage)

Hi there - how can I make this iterate through more pages?

Copy link

Xiyu-1 commented Mar 7, 2020

Here is how to fix the code:

Add this line to the beginning import section:

from urllib.request import Request, urlopen

Then rewrite part of the "parse" function with the below code:

def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from")
        return None

    # These two new lines are added
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()

    #replace the parser to take input added above
    #parser = html.fromstring(response.text)
    parser = html.fromstring(webpage)

Thank you! It works out!!!

mcadhoc commented May 27, 2020

I want to make two modifications: scrape all results and adding additional filters. I already made the corrections above and have code working. If anyone wants to help send me a message.

chinmaykhole commented Aug 7, 2020

nkpenta commented Oct 23, 2020

thom1879 commented Jan 3, 2021

cbonsig commented Jan 3, 2021

Copy link

xzmerry commented Feb 25, 2021

Thanks for all the comments above! I fixed the code based on your comments and it works!

I modified two parts:

First, replace the row 81:
the wrong version: search_results = json_data.get('searchResults').get('listResults', [])
the correct version: search_results = json_data.get('cat1').get('searchResults').get('listResults', [])
Second, import io, and then
replace "with open("response.html", 'w') as fp:" with:
with io.open("response.html", 'w', encoding="utf-8") as fp:

Note: to run the code, run "python <script_name>.py 60615" in the terminal, where 60615 is an example zip code.

And if you want to scrape Zillow data for multiple zip codes, use the following:

import os

run_cmd = 'python <script_name>.py '

for zc in zip_code_lst:
    z_run_cmd = run_cmd + zc
    os.system(z_run_cmd)  # run the scraper once per zip code

where zip_code_lst is a list of zip codes.

jogcruz commented Mar 19, 2021

I was able to get past the captcha by changing the headers:
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.8',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

