
@scrapehero
Last active December 13, 2023 16:05
Python 3 script to find real estate listings of properties up for sale on zillow.com
from lxml import html
import requests
import unicodecsv as csv
import argparse
import json


def clean(text):
    if text:
        return ' '.join(' '.join(text).split())
    return None


def get_headers():
    # Creating headers.
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'accept-encoding': 'gzip, deflate, sdch, br',
               'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    return headers


def create_url(zipcode, filter):
    # Creating Zillow URL based on the filter.
    if filter == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    elif filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
    print(url)
    return url


def save_to_file(response):
    # Saving response to `response.html`.
    with open("response.html", 'w') as fp:
        fp.write(response.text)


def write_data_to_csv(data):
    # Saving scraped data to csv.
    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)


def get_response(url):
    # Getting response from zillow.com.
    for i in range(5):
        response = requests.get(url, headers=get_headers())
        print("status code received:", response.status_code)
        if response.status_code != 200:
            # Saving response to file for debugging purposes.
            save_to_file(response)
            continue
        else:
            save_to_file(response)
            return response
    return None


def get_data_from_json(raw_json_data):
    # Getting data from json (type 2 of their A/B testing page).
    cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
    properties_list = []
    try:
        json_data = json.loads(cleaned_data)
        search_results = json_data.get('searchResults').get('listResults', [])
        for properties in search_results:
            address = properties.get('addressWithZip')
            property_info = properties.get('hdpData', {}).get('homeInfo')
            city = property_info.get('city')
            state = property_info.get('state')
            postal_code = property_info.get('zipcode')
            price = properties.get('price')
            bedrooms = properties.get('beds')
            bathrooms = properties.get('baths')
            area = properties.get('area')
            info = f'{bedrooms} bds, {bathrooms} ba ,{area} sqft'
            broker = properties.get('brokerName')
            property_url = properties.get('detailUrl')
            title = properties.get('statusText')
            data = {'address': address,
                    'city': city,
                    'state': state,
                    'postal_code': postal_code,
                    'price': price,
                    'facts and features': info,
                    'real estate provider': broker,
                    'url': property_url,
                    'title': title}
            properties_list.append(data)
        return properties_list
    except ValueError:
        print("Invalid json")
        return None


def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    if not search_results:
        print("parsing from json data")
        # Identified as type 2 page.
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)
    print("parsing from html page")
    properties_list = []
    for properties in search_results:
        raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = properties.xpath(".//h4//text()")
        address = clean(raw_address)
        city = clean(raw_city)
        state = clean(raw_state)
        postal_code = clean(raw_postal_code)
        price = clean(raw_price)
        info = clean(raw_info).replace(u"\xb7", ',')
        broker = clean(raw_broker_name)
        title = clean(raw_title)
        property_url = "https://www.zillow.com" + url[0] if url else None
        is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
        properties = {'address': address,
                      'city': city,
                      'state': state,
                      'postal_code': postal_code,
                      'price': price,
                      'facts and features': info,
                      'real estate provider': broker,
                      'url': property_url,
                      'title': title}
        if is_forsale:
            properties_list.append(properties)
    return properties_list


if __name__ == "__main__":
    # Reading arguments.
    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='')
    sortorder_help = """
    available sort orders are :
    newest : Latest property details,
    cheapest : Properties with cheapest price
    """
    argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You')
    args = argparser.parse_args()
    zipcode = args.zipcode
    sort = args.sort
    print("Fetching data for %s" % (zipcode))
    scraped_data = parse(zipcode, sort)
    if scraped_data:
        print("Writing data to output file")
        write_data_to_csv(scraped_data)
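For reference, a typical invocation (per the argparse setup above, zipcode is a required positional argument and sort is optional, defaulting to 'Homes For You'):

python zillow.py 02115 newest
python zillow.py 02115 cheapest
python zillow.py 02115

Scraped listings are written to properties-<zipcode>.csv in the working directory.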
@mcadhoc commented May 27, 2020

I want to make two modifications: scrape all results and add additional filters. I already made the corrections above and have the code working. If anyone wants to help, send me a message.

@fuji1024

Hi there - how can I make this iterate through more pages?

> I want to make two modifications: scrape all results and add additional filters. I already made the corrections above and have the code working. If anyone wants to help, send me a message.

Has anyone been able to figure out how to iterate this through all pages?
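One possible starting point, sketched below: Zillow search URLs have historically accepted a page segment such as /2_p/ after the search path. Treating that URL pattern as an assumption to verify in a browser first, the gist's get_response() and parsing logic could be reused page by page:

# Pagination sketch. ASSUMPTION: Zillow accepts an `{n}_p` page segment
# in search URLs (e.g. .../92115_rb/2_p/); verify against a live URL first.
def create_page_url(zipcode, page):
    base = "https://www.zillow.com/homes/for_sale/{0}_rb/".format(zipcode)
    # Page 1 is the plain search URL; later pages append the `{n}_p` segment.
    return base if page == 1 else base + "{0}_p/".format(page)

all_properties = []
for page in range(1, 6):  # first five result pages
    response = get_response(create_page_url("92115", page))  # reuses the gist's get_response()
    if response is None:
        break
    # ...feed response.text through the same lxml/json parsing as in parse()...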

@SamsunNaher

[screenshot of the error omitted]
Hi, I am a complete novice at Python and web scraping, and I badly need to extract data from Zillow. Could anyone please help me fix this problem? Thanks in advance!

@chinmaykhole commented Aug 7, 2020

Can someone help me with this error?
%run scraped.py 92115 newest
Fetching data for 92115
https://www.zillow.com/homes/for_sale/92115/0_singlestory/days_sort
status code received: 200
Traceback (most recent call last):
  File "C:\Users\user\Desktop\scraped.py", line 185, in <module>
    scraped_data = parse(zipcode, sort)
  File "C:\Users\user\Desktop\scraped.py", line 116, in parse
    response = get_response(url)
  File "C:\Users\user\Desktop\scraped.py", line 67, in get_response
    save_to_file(response)
  File "C:\Users\user\Desktop\scraped.py", line 42, in save_to_file
    fp.write(response.text)
  File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to <undefined>

@johnragsdell

> I want to make two modifications: scrape all results and add additional filters. I already made the corrections above and have the code working. If anyone wants to help, send me a message.

Hi friends. Has anyone helped you with this?

@datageneralist

File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to

I think adding a UTF-8 parameter will fix this. Take a look at this stack overflow link. It worked for me.
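Concretely, the "UTF-8 parameter" goes into save_to_file() (a minimal sketch of that fix):

def save_to_file(response):
    # An explicit encoding keeps Windows from falling back to the
    # default cp1252 codec when writing the page source.
    with open("response.html", 'w', encoding="utf-8") as fp:
        fp.write(response.text)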

@nkpenta commented Oct 23, 2020

can anyone help me with this error?

https://www.zillow.com/homes/for_sale/98075/0_singlestory/days_sort
status code received: 200
parsing from json data
Traceback (most recent call last):
  File "C:/Users/Downloads/zillow.py", line 191, in <module>
    scraped_data = parse(zipcode, sort)
  File "C:/Users/Downloads/zillow.py", line 135, in parse
    return get_data_from_json(raw_json_data)
  File "C:/Users/Downloads/zillow.py", line 80, in get_data_from_json
    search_results = json_data.get('searchResults').get('listResults', [])
AttributeError: 'NoneType' object has no attribute 'get'

@seffignoz

Replace this:
search_results = json_data.get('searchResults').get('listResults', [])

with this:
search_results = json_data.get('cat1').get('searchResults').get('listResults', [])
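If you want the script to degrade gracefully the next time Zillow changes the JSON shape, a slightly more defensive variant of the same line (a sketch using empty-dict fallbacks, so a missing key yields an empty result instead of an AttributeError):

search_results = (json_data.get('cat1', {})
                  .get('searchResults', {})
                  .get('listResults', []))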

@thom1879 commented Jan 3, 2021

I can't get past the captcha. Anyone have any ideas?

@cbonsig commented Jan 3, 2021

I gave up on this, and instead used the Zillow to Excel plugin with Chromium. It is not completely automated, but was sufficient to get the data that I wanted with a few minutes of effort (for sale and sold for several zip code regions). https://chrome.google.com/webstore/detail/zillow-to-excel/aecdekdgjlncaadbdiciepplaobhcjgi?hl=en

@skurjekar

Looks like Zillow has been blocking almost all kinds of automated requests. Have tried multiple user agents, and it lets me create a soup once, but not iteratively. If someone finds a solution, please feel free to share it here!

@BenCratsley

> I want to make two modifications: scrape all results and add additional filters. I already made the corrections above and have the code working. If anyone wants to help, send me a message.

> can anyone help me with this error?
>
> https://www.zillow.com/homes/for_sale/98075/0_singlestory/days_sort
> status code received: 200
> parsing from json data
> Traceback (most recent call last):
>   File "C:/Users/Downloads/zillow.py", line 191, in <module>
>     scraped_data = parse(zipcode, sort)
>   File "C:/Users/Downloads/zillow.py", line 135, in parse
>     return get_data_from_json(raw_json_data)
>   File "C:/Users/Downloads/zillow.py", line 80, in get_data_from_json
>     search_results = json_data.get('searchResults').get('listResults', [])
> AttributeError: 'NoneType' object has no attribute 'get'

Any luck? I'm getting the same error.

@xzmerry commented Feb 25, 2021

Thanks for all the comments above! I fixed the code based on your comments and it works!

I modified two parts:

First, replace row 81:
the wrong version: search_results = json_data.get('searchResults').get('listResults', [])
the correct version: search_results = json_data.get('cat1').get('searchResults').get('listResults', [])

Second, import io, and then replace "with open("response.html", 'w') as fp:" with:
with io.open("response.html", 'w', encoding="utf-8") as fp:

Note: to run the code, run "python zillow.py 60615" in the terminal, where 60615 is an example zip code.

And if you want to scrape Zillow data for multiple zip codes, use the following:

import os

run_cmd = 'python zillow.py '
for zc in zip_code_lst:
    z_run_cmd = run_cmd + zc
    os.system(z_run_cmd)

where zip_code_lst is a list of zip codes.
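The same loop with subprocess instead of os.system (a sketch, assuming a zip_code_lst like the one described; subprocess.run waits for each run to finish and is the usual modern replacement):

import subprocess

zip_code_lst = ['60615', '02115', '92115']  # example list of zip codes
for zc in zip_code_lst:
    # Run the gist script once per zip code.
    subprocess.run(['python', 'zillow.py', zc])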

@jogcruz commented Mar 19, 2021

I was able to get past the captcha by changing the headers:
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

@FlyingTerran

> Can someone help me with this error?
> %run scraped.py 92115 newest
> Fetching data for 92115
> https://www.zillow.com/homes/for_sale/92115/0_singlestory/days_sort
> status code received: 200
> Traceback (most recent call last):
>   File "C:\Users\user\Desktop\scraped.py", line 185, in <module>
>     scraped_data = parse(zipcode, sort)
>   File "C:\Users\user\Desktop\scraped.py", line 116, in parse
>     response = get_response(url)
>   File "C:\Users\user\Desktop\scraped.py", line 67, in get_response
>     save_to_file(response)
>   File "C:\Users\user\Desktop\scraped.py", line 42, in save_to_file
>     fp.write(response.text)
>   File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
>     return codecs.charmap_encode(input,self.errors,encoding_table)[0]
> UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to <undefined>

You need to write in binary mode, i.e.
with open(YOURFILE, "wb") as f:
    f.write(SOMETHING.encode("utf-8"))
(binary mode takes bytes, hence the explicit encode)

@grole01 commented Dec 23, 2022

> Looks like Zillow has been blocking almost all kinds of automated requests. Have tried multiple user agents, and it lets me create a soup once, but not iteratively. If someone finds a solution, please feel free to share it here!

???

@Chulisnapp

Hi, can someone help me? I keep getting this error:

usage: zillow.py [-h] zipcode [sort]
zillow.py: error: the following arguments are required: zipcode
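That usage error means argparse did not receive the required zipcode positional argument; the script has to be started with one, e.g. python zillow.py 92115 or python zillow.py 92115 newest.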
