Python 3 script to find real estate listings of properties up for sale on zillow.com
from lxml import html
import requests
import unicodecsv as csv
import argparse
import json


def clean(text):
    if text:
        return ' '.join(' '.join(text).split())
    return None


def get_headers():
    # Creating headers.
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'accept-encoding': 'gzip, deflate, sdch, br',
               'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    return headers


def create_url(zipcode, filter):
    # Creating the Zillow URL based on the sort filter.
    if filter == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    elif filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
    print(url)
    return url


def save_to_file(response):
    # Saving the response to `response.html` for debugging.
    with open("response.html", 'w') as fp:
        fp.write(response.text)


def write_data_to_csv(data, zipcode):
    # Saving the scraped data to a CSV file named after the zip code.
    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)


def get_response(url):
    # Getting the response from zillow.com, retrying up to five times.
    for i in range(5):
        response = requests.get(url, headers=get_headers())
        print("status code received:", response.status_code)
        # Saving the response to a file for debugging purposes.
        save_to_file(response)
        if response.status_code == 200:
            return response
    return None


def get_data_from_json(raw_json_data):
    # Getting data from JSON (type 2 of their A/B testing page).
    if not raw_json_data:
        # Nothing matched the script tag, usually because a captcha page
        # was returned instead of search results.
        print("No JSON data found, check `response.html`")
        return None
    cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
    properties_list = []
    try:
        json_data = json.loads(cleaned_data)
        search_results = json_data.get('searchResults').get('listResults', [])
        for properties in search_results:
            address = properties.get('addressWithZip')
            property_info = properties.get('hdpData', {}).get('homeInfo')
            city = property_info.get('city')
            state = property_info.get('state')
            postal_code = property_info.get('zipcode')
            price = properties.get('price')
            bedrooms = properties.get('beds')
            bathrooms = properties.get('baths')
            area = properties.get('area')
            info = f'{bedrooms} bds, {bathrooms} ba, {area} sqft'
            broker = properties.get('brokerName')
            property_url = properties.get('detailUrl')
            title = properties.get('statusText')
            data = {'address': address,
                    'city': city,
                    'state': state,
                    'postal_code': postal_code,
                    'price': price,
                    'facts and features': info,
                    'real estate provider': broker,
                    'url': property_url,
                    'title': title}
            properties_list.append(data)
        return properties_list
    except ValueError:
        print("Invalid json")
        return None


def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    if not search_results:
        # Identified as a type 2 page; the listings are embedded as JSON.
        print("parsing from json data")
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)
    print("parsing from html page")
    properties_list = []
    for properties in search_results:
        raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = properties.xpath(".//h4//text()")
        address = clean(raw_address)
        city = clean(raw_city)
        state = clean(raw_state)
        postal_code = clean(raw_postal_code)
        price = clean(raw_price)
        info = clean(raw_info).replace(u"\xb7", ',')
        broker = clean(raw_broker_name)
        title = clean(raw_title)
        property_url = "https://www.zillow.com" + url[0] if url else None
        is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
        properties = {'address': address,
                      'city': city,
                      'state': state,
                      'postal_code': postal_code,
                      'price': price,
                      'facts and features': info,
                      'real estate provider': broker,
                      'url': property_url,
                      'title': title}
        if is_forsale:
            properties_list.append(properties)
    return properties_list


if __name__ == "__main__":
    # Reading command-line arguments.
    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='zip code to search')
    sortorder_help = """
    available sort orders are :
    newest : Latest property details,
    cheapest : Properties with cheapest price
    """
    argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You')
    args = argparser.parse_args()
    zipcode = args.zipcode
    sort = args.sort
    print("Fetching data for %s" % (zipcode))
    scraped_data = parse(zipcode, sort)
    if scraped_data:
        print("Writing data to output file")
        write_data_to_csv(scraped_data, zipcode)
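
For example, save the script as zillow.py and run it from a terminal with a ZIP code and an optional sort order:

    python3 zillow.py 10118 newest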
@edwardrusch3 commented Jun 28, 2018

I continue to get blank data in the CSV even though the script is returning a 200. Any ideas?

@salimoha commented Aug 11, 2018

Thanks for your code. I have two questions: 1) How can I get the Zestimate price? 2) What does this URL do:

    https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy

I want to get the list of houses from LoopNet. What should I put instead of the above URL? Thanks.

@NovTangoPapa commented Sep 27, 2018

> I continue to get blank data in the CSV even though the script is returning a 200. Any ideas?

I am having this same issue.

Edit:

It looks like it has to do with using a sort argument (newest/cheapest). I am guessing that the associated links are bad.

Another Edit:

Doesn't look like it's the links.

@gumdropsteve commented Oct 2, 2018

Hi, newer to Python, active Realtor® in SF Bay Area. End goal is to have this running on my site, but starting with it as is to better understand.

What would I need to change to have this execute in PyCharm (Windows 10)? Thanks.

@andresmillang commented Nov 15, 2018

Hi, the reason for the blank data is reCAPTCHA.

@keddisa commented Nov 16, 2018

I keep getting this error message, can someone help?

usage: ipykernel_launcher.py [-h] zipcode [sort]
ipykernel_launcher.py: error: unrecognized arguments: -f

An exception has occurred, use %tb to see the full traceback.

SystemExit: 2

C:\Users\keddi\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2969: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.
warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)

@guyamir commented Dec 23, 2018

> I keep getting this error message, can someone help?
>
> usage: ipykernel_launcher.py [-h] zipcode [sort]
> ipykernel_launcher.py: error: unrecognized arguments: -f
> …

Save the file and run it from the terminal with a zip code. For instance:

~$ python3 zillow.py 10118
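
If you would rather stay inside a notebook, a minimal workaround is to hand argparse the argument list explicitly, which sidesteps the -f kernel-file flag Jupyter injects into sys.argv:

    # Parse an explicit argument list instead of Jupyter's sys.argv.
    args = argparser.parse_args(['10118', 'newest'])  # [zipcode, sort]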

@MorganWeiss commented Jan 2, 2019

I get the error "ZillowError: HTTPSConnectionPool(host='www.zillow.com', port=443): Max retries exceeded with url: /webservice/GetDeepSearchResults.htm?zws-id=X1-ZWz1gtmiat11xn_7ew1d&address=3400+Pacific+Ave.%2C+Marina+Del+Rey%2C+CA&citystatezip=90292 (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))"
Any idea how to fix this?

@philxiao commented Jan 4, 2019

This generates an empty result as-is due to Zillow's implementation of reCAPTCHA. For people who would like to use this script, I suggest a crawling proxy to bypass reCAPTCHA: https://proxycrawl.com/ Once you have that, simply prefix the Zillow URL with the ProxyCrawl endpoint provided in your dashboard.
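
A rough sketch of that prefixing, assuming the dashboard endpoint takes the form https://api.proxycrawl.com/?token=...&url=... (the exact format comes from your own dashboard, so treat this as hypothetical):

    from urllib.parse import quote_plus

    # Hypothetical endpoint; substitute the one from your ProxyCrawl dashboard.
    PROXY_ENDPOINT = "https://api.proxycrawl.com/?token=YOUR_TOKEN&url="

    def get_proxied_response(url):
        # Route the Zillow request through the crawling proxy so the
        # captcha challenge is handled on the proxy's side.
        return requests.get(PROXY_ENDPOINT + quote_plus(url), headers=get_headers())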

@corpulent commented Jan 14, 2019

> Hi, newer to Python, active Realtor® in SF Bay Area. End goal is to have this running on my site, but starting with it as is to better understand.
>
> What would I need to change to have this execute in PyCharm (Windows 10)? Thanks.

@gumdropsteve are you a realtor? I might be able to help.

@sarojrout commented Feb 12, 2019

> I continue to get blank data in the CSV even though the script is returning a 200. Any ideas?

I am also getting blank data. Did you find a solution?

@sebasslash commented Mar 14, 2019

It works for me if you don't pass the second argument. So:

python main.py zipcode

Seems like the second argument is causing the scraper to yield no results...

@JCP281 commented Mar 28, 2019

Can someone clue me in as to why the results always cap at 200? Is Zillow limiting the return? Seems no matter what zipcode I try, only 200 results ever come back. No errors or anything.

@ghost (owner) commented May 16, 2019

> Can someone clue me in as to why the results always cap at 200? Is Zillow limiting the return? Seems no matter what zipcode I try, only 200 results ever come back. No errors or anything.

200 is not the result count; it's the HTTP status code received from zillow.com. The script only fetches first-page results.

@rahb3rt commented Jun 10, 2019

> Can someone clue me in as to why the results always cap at 200? Is Zillow limiting the return? Seems no matter what zipcode I try, only 200 results ever come back. No errors or anything.
>
> 200 is not the result count; it's the HTTP status code received from zillow.com. The script only fetches first-page results.

Any way of getting all the pages for a given search?
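
A minimal sketch of page iteration, under the assumption that Zillow selects result pages with a trailing "{n}_p" URL segment (worth verifying against the live site); it reuses get_response and get_data_from_json from the gist above:

    def parse_page(url):
        # Fetch one results page and parse it via the embedded-JSON branch.
        response = get_response(url)
        if not response:
            return None
        parser = html.fromstring(response.text)
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)

    def parse_all_pages(zipcode, max_pages=20):
        # Assumed pagination scheme: /homes/for_sale/{zip}_rb/{n}_p/ is page n.
        results = []
        for page in range(1, max_pages + 1):
            url = "https://www.zillow.com/homes/for_sale/{0}_rb/{1}_p/".format(zipcode, page)
            page_results = parse_page(url)
            if not page_results:
                break  # no more pages, or the request was blocked
            results.extend(page_results)
        return results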

@benyamin-persia commented Jun 15, 2019

Awesome code! Please let me know how to fetch "contactPhone", or what is contactPhone's XPath?

@robstryker commented Jun 27, 2019

[rob@rawbdorable zillow] (master)$ python3 zillow.py 02126
Fetching data for 02126
https://www.zillow.com/homes/for_sale/02126_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy
status code received: 200
parsing from json data
Traceback (most recent call last):
  File "zillow.py", line 185, in <module>
    scraped_data = parse(zipcode, sort)
  File "zillow.py", line 129, in parse
    return get_data_from_json(raw_json_data)
  File "zillow.py", line 74, in get_data_from_json
    cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
AttributeError: 'NoneType' object has no attribute 'replace'

@kyle-moeller commented Jul 7, 2019

I'm having the same problem as robstryker. Been a Node.js dev for about a year, but first time doing scraping.

My issues are on lines 123/128:
search_results = parser.xpath("//div[@id='search-results']//article")
raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')

Both of these are returning empty arrays. I can see from viewing the html that there isn't a 'search-results' div, but there is a 'grid-search-results'. Editing this has not changed the result though; it's always an empty array. Similarly, I can find 'mobileSearchPageStore' in the html. In both places, I see the text we are trying to filter down to (address, price, etc.).

I'm not looking at the generated 'response.html' when you run the command though, this is me manually saving the webpage as html.

I added a few print statements, for response.text, search_results, and raw_json_data below. My next thought is the response mentions recaptcha, and has a header with text saying to verify that I'm a human to continue. I am more likely to think I'm doing something wrong, rather than the method is suddenly blocked though.

Did some more research while I was composing this, I noticed that in their robots.txt there is a disallow line for /homes/*_rb, which seems to be exactly what we are doing. Would that be a sufficient reason or am I reading too much into this?

PS E:\Projects\Zillow> python zillow.py 02126 newest
Fetching data for 02126
https://www.zillow.com/homes/for_sale/02126/0_singlestory/days_sort
status code received: 200
parser <html><head><meta name="robots" content="noindex, nofollow"/><script src="https://www.google.com/recaptcha/api.js"></script><link href="https://www.zillowstatic.com/vstatic/80d5e73/static/css/z-pages/captcha.css" type="text/css" rel="stylesheet" media="screen"/><script>

            function handleCaptcha(response) {
                var vid = getQueryString("vid"); // getQueryString is implemented below
                var uuid = getQueryString("uuid");
                var name = '_pxCaptcha';
                var cookieValue =  btoa(JSON.stringify({r:response,v:vid,u:uuid}));
                var cookieParts = [name, '=', cookieValue, '; path=/'];
                cookieParts.push('; domain=' + window.location.hostname);
                cookieParts.push('; max-age=10');//expire after 10 seconds
                document.cookie = cookieParts.join('');
                var originalURL = getOriginalUrl("url");
                var originalHost = window.location.host;
                var newHref = window.location.protocol + "//" + originalHost;
                originalURL = originalURL || '/';
                newHref = newHref + originalURL;
                window.location.href = newHref;
            }

            function getQueryString(name, url) {
                if (!url) url = window.location.href;
                name = name.replace(/[\[\]]/g, "\\$&");
                var regex = new RegExp("[?&]" + name + "(=([^&#]*)|&|#|$)"),
                results = regex.exec(url);
                if (!results) return null;
                if (!results[2]) return '';
                return decodeURIComponent(results[2].replace(/\+/g, " "));
            }

            function getOriginalUrl(name) {
                var url = getQueryString(name);
                if (!url) return null;
                var regExMatcher = new RegExp("(([^&#@]*)|&|#|$)");
                var matches = regExMatcher.exec(url);
                if (!matches) return null;
                return matches[0];
            }

            document.addEventListener("DOMContentLoaded", function (e) {
                var uuidVerifyRegExp = /^\{?[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\}?$/i;
                document.getElementById("uuid").innerText = "UUID: " + uuidVerifyRegExp.exec(getQueryString("uuid"));
            });
        </script></head><body><main class="zsg-layout-content"><div class="error-content-block"><div class="error-text-content"><!-- <h1>Captcha</h1> --><h5>Please verify you're a human to continue.</h5><div id="content" class="captcha-container"><div class="g-recaptcha" data-theme="white" data-callback="handleCaptcha" data-sitekey="6Lcj-R8TAAAAABs3FrRPuQhLMbp5QrHsHufzLf7b"></div></div><img src="https://www.zillowstatic.com/static/logos/logo-65x14.png" width="65" alt="Zillow" height="14"></img></div></div></main><h4 id="uuid" class="uuid-string zsg-fineprint"></h4></body></html><!-- H:028  T:17ms  S:2686  R:Sun Jul 07 16:21:49 PDT 2019  B:5.0.61033-master.e804620~delivery_ready.cd00c91 -->
search_results []
parsing from json data
raw json data []
Traceback (most recent call last):
  File "zillow.py", line 188, in <module>
    scraped_data = parse(zipcode, sort)
  File "zillow.py", line 132, in parse
    return get_data_from_json(raw_json_data)
  File "zillow.py", line 74, in get_data_from_json
    cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
AttributeError: 'NoneType' object has no attribute 'replace'
@sakibbuddy commented Aug 7, 2019

@kyle-moeller, any update?

@zhuchangzhan commented Nov 6, 2019

Here is how to fix the code:

Add this line to the beginning import section:

from urllib.request import Request, urlopen

Then rewrite part of the "parse" function with the below code:

def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)

    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None

    # These two new lines are added
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()

    # Replace the parser to take the input added above:
    # parser = html.fromstring(response.text)
    parser = html.fromstring(webpage)
@john9josi commented Feb 3, 2020

Hi there - how can I make this iterate through more pages?

@Xiyu-1 commented Mar 7, 2020

> Here is how to fix the code: … (quoting @zhuchangzhan's fix above in full)

Thank you! It works!!!

@mcadhoc commented May 27, 2020

I want to make two modifications: scraping all results and adding additional filters. I have already made the corrections above and have the code working. If anyone wants to help, send me a message.

@fuji1024 commented May 31, 2020

> Hi there - how can I make this iterate through more pages?
>
> I want to make two modifications: scraping all results and adding additional filters. I have already made the corrections above and have the code working. If anyone wants to help, send me a message.

Has anyone been able to figure out how to iterate this through all pages?

@SamsunNaher commented Jul 14, 2020

[screenshot]
Hi, I am a complete novice at Python and web scraping, and I badly need to extract data from Zillow. Could anyone please help me fix this problem? Thanks in advance!

@chinmaykhole commented Aug 7, 2020

Can someone help me with this error?
%run scraped.py 92115 newest
Fetching data for 92115
https://www.zillow.com/homes/for_sale/92115/0_singlestory/days_sort
status code received: 200
Traceback (most recent call last):
  File "C:\Users\user\Desktop\scraped.py", line 185, in <module>
    scraped_data = parse(zipcode, sort)
  File "C:\Users\user\Desktop\scraped.py", line 116, in parse
    response = get_response(url)
  File "C:\Users\user\Desktop\scraped.py", line 67, in get_response
    save_to_file(response)
  File "C:\Users\user\Desktop\scraped.py", line 42, in save_to_file
    fp.write(response.text)
  File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to <undefined>

@thefirstrea1 commented Sep 2, 2020

> I want to make two modifications: scraping all results and adding additional filters. I have already made the corrections above and have the code working. If anyone wants to help, send me a message.

Hi friends. Has anyone helped you with this?

@datageneralist commented Sep 6, 2020

File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to

I think adding a UTF-8 parameter will fix this. Take a look at this stack overflow link. It worked for me.
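
Concretely, the fix is to pass an explicit encoding when opening the debug file, so Windows' default cp1252 codec is never used:

    def save_to_file(response):
        # Explicit UTF-8 avoids UnicodeEncodeError on Windows, where
        # open() otherwise defaults to the cp1252 codec.
        with open("response.html", 'w', encoding="utf-8") as fp:
            fp.write(response.text)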

@nkpenta commented Oct 23, 2020

can anyone help me with this error?

https://www.zillow.com/homes/for_sale/98075/0_singlestory/days_sort
status code received: 200
parsing from json data
Traceback (most recent call last):
  File "C:/Users/Downloads/zillow.py", line 191, in <module>
    scraped_data = parse(zipcode, sort)
  File "C:/Users/Downloads/zillow.py", line 135, in parse
    return get_data_from_json(raw_json_data)
  File "C:/Users/Downloads/zillow.py", line 80, in get_data_from_json
    search_results = json_data.get('searchResults').get('listResults', [])
AttributeError: 'NoneType' object has no attribute 'get'

@seffignoz commented Nov 6, 2020

Replace this:

    search_results = json_data.get('searchResults').get('listResults', [])

with this:

    search_results = json_data.get('cat1').get('searchResults').get('listResults', [])
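
A slightly more defensive variant of the same fix ('cat1' being the wrapper key in the newer page JSON), using empty-dict defaults so a captcha page missing these keys yields an empty list instead of raising AttributeError:

    search_results = (json_data.get('cat1', {})
                               .get('searchResults', {})
                               .get('listResults', []))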

@thom1879 commented Jan 3, 2021

I can't get past the captcha, anyone have any ideas?

@cbonsig commented Jan 3, 2021

I gave up on this, and instead used the Zillow to Excel plugin with Chromium. It is not completely automated, but was sufficient to get the data that I wanted with a few minutes of effort (for sale and sold for several zip code regions). https://chrome.google.com/webstore/detail/zillow-to-excel/aecdekdgjlncaadbdiciepplaobhcjgi?hl=en

@skurjekar commented Jan 11, 2021

Looks like Zillow has been blocking almost all kinds of automated requests. Have tried multiple user agents, and it lets me create a soup once, but not iteratively. If someone finds a solution, please feel free to share it here!

@BenCratsley commented Feb 1, 2021

> can anyone help me with this error?
> …
> AttributeError: 'NoneType' object has no attribute 'get'

Any luck? I'm getting the same error.

@xzmerry commented Feb 25, 2021

Thanks for all the comments above! I fixed the code based on your comments and it works!

I modified two parts:

First, replace line 81:
the wrong version: search_results = json_data.get('searchResults').get('listResults', [])
the correct version: search_results = json_data.get('cat1').get('searchResults').get('listResults', [])

Second, import io, and then replace "with open("response.html", 'w') as fp:" with:

    with io.open("response.html", 'w', encoding="utf-8") as fp:

Note: to run the code, run "python zillow.py 60615" in the terminal, where 60615 is an example ZIP code.

And if you want to scrape Zillow data for multiple ZIP codes, use the following, where zip_code_lst is a list of ZIP code strings:

    import os

    run_cmd = 'python zillow.py '
    for zc in zip_code_lst:
        z_run_cmd = run_cmd + zc
        os.system(z_run_cmd)
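
An equivalent loop using subprocess.run, which fails loudly if any run exits non-zero (zip_code_lst is again assumed to be a list of ZIP code strings):

    import subprocess

    for zc in zip_code_lst:
        # check=True raises CalledProcessError if zillow.py exits non-zero.
        subprocess.run(["python", "zillow.py", zc], check=True)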

@jogcruz commented Mar 19, 2021

I was able to get past the captcha by changing the headers:

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.8',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
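
These would replace the dict returned by get_headers() in the script above.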
