from lxml import html
import requests
import unicodecsv as csv
import argparse
import json


def clean(text):
    # Join the list of text fragments returned by xpath and collapse whitespace.
    if text:
        return ' '.join(' '.join(text).split())
    return None

def get_headers():
    # Browser-like request headers to reduce the chance of being blocked.
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'accept-encoding': 'gzip, deflate, sdch, br',
               'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    return headers

def create_url(zipcode, sort_filter):
    # Build the Zillow search URL for the given zip code and sort order.
    if sort_filter == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    elif sort_filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
    print(url)
    return url

def save_to_file(response):
    # Save the response to `response.html` for debugging; explicit utf-8
    # avoids UnicodeEncodeError on Windows (cp1252) systems.
    with open("response.html", 'w', encoding='utf-8') as fp:
        fp.write(response.text)

def write_data_to_csv(data, zipcode):
    # Save the scraped data to a zip-code-specific CSV file.
    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

def get_response(url):
    # Try up to five times to get a 200 response from zillow.com.
    for i in range(5):
        response = requests.get(url, headers=get_headers())
        print("status code received:", response.status_code)
        # Save every response to a file for debugging purposes.
        save_to_file(response)
        if response.status_code == 200:
            return response
    return None

def get_data_from_json(raw_json_data):
    # Parse results from the embedded JSON (type 2 of Zillow's A/B-tested pages).
    cleaned_data = clean(raw_json_data)
    if not cleaned_data:
        print("No json data found")
        return None
    cleaned_data = cleaned_data.replace('<!--', "").replace("-->", "")
    properties_list = []
    try:
        json_data = json.loads(cleaned_data)
        search_results = json_data.get('searchResults', {}).get('listResults', [])
        for properties in search_results:
            address = properties.get('addressWithZip')
            property_info = properties.get('hdpData', {}).get('homeInfo', {})
            city = property_info.get('city')
            state = property_info.get('state')
            postal_code = property_info.get('zipcode')
            price = properties.get('price')
            bedrooms = properties.get('beds')
            bathrooms = properties.get('baths')
            area = properties.get('area')
            info = f'{bedrooms} bds, {bathrooms} ba, {area} sqft'
            broker = properties.get('brokerName')
            property_url = properties.get('detailUrl')
            title = properties.get('statusText')
            data = {'address': address,
                    'city': city,
                    'state': state,
                    'postal_code': postal_code,
                    'price': price,
                    'facts and features': info,
                    'real estate provider': broker,
                    'url': property_url,
                    'title': title}
            properties_list.append(data)
        return properties_list
    except ValueError:
        print("Invalid json")
        return None

def parse(zipcode, sort_filter=None):
    url = create_url(zipcode, sort_filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    if not search_results:
        # Identified as a type 2 page: the listings live in embedded JSON.
        print("parsing from json data")
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)
    print("parsing from html page")
    properties_list = []
    for properties in search_results:
        raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = properties.xpath(".//h4//text()")
        address = clean(raw_address)
        city = clean(raw_city)
        state = clean(raw_state)
        postal_code = clean(raw_postal_code)
        price = clean(raw_price)
        info = clean(raw_info)
        # The card info uses "·" as a separator; normalize it to commas.
        info = info.replace(u"\xb7", ',') if info else None
        broker = clean(raw_broker_name)
        title = clean(raw_title)
        property_url = "https://www.zillow.com" + url[0] if url else None
        is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
        property_data = {'address': address,
                         'city': city,
                         'state': state,
                         'postal_code': postal_code,
                         'price': price,
                         'facts and features': info,
                         'real estate provider': broker,
                         'url': property_url,
                         'title': title}
        if is_forsale:
            properties_list.append(property_data)
    return properties_list

if __name__ == "__main__":
    # Read command-line arguments.
    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='zip code to search')
    sortorder_help = """
    available sort orders are :
    newest : Latest property details
    cheapest : Properties with the cheapest price
    """
    argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You')
    args = argparser.parse_args()
    zipcode = args.zipcode
    sort = args.sort
    print("Fetching data for %s" % (zipcode))
    scraped_data = parse(zipcode, sort)
    if scraped_data:
        print("Writing data to output file")
        write_data_to_csv(scraped_data, zipcode)
Thanks for your code. I have two questions: 1) How can I get the Zestimate price? 2) What is `url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)` doing?
I am having this same issue. Edit: It looks like it has to do with when you try to use a parsing argument (newest/cheapest). I am guessing that the associated links are bad. Another edit: it doesn't look like it's the links.
Hi, I'm newer to Python and an active Realtor® in the SF Bay Area. The end goal is to have this running on my site, but I'm starting with it as-is to better understand it. What would I need to change to have this execute in PyCharm (Windows 10)? Thanks.
Hi, the reason for the blank data is recaptcha.
I keep getting this error message, can someone help?

usage: ipykernel_launcher.py [-h] zipcode [sort]
An exception has occurred, use %tb to see the full traceback.
SystemExit: 2
C:\Users\keddi\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2969: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.
Save the file and run it from the terminal with a zip code. For instance: `~$ python3 zillow.py 10118`
I get the error "ZillowError: HTTPSConnectionPool(host='www.zillow.com', port=443): Max retries exceeded with url: /webservice/GetDeepSearchResults.htm?zws-id=X1-ZWz1gtmiat11xn_7ew1d&address=3400+Pacific+Ave.%2C+Marina+Del+Rey%2C+CA&citystatezip=90292 (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))"
This would generate an empty result as-is due to Zillow's implementation of recaptcha. For people who would like to use this script, I suggest you use a crawling proxy to bypass the recaptcha: https://proxycrawl.com/ Once you have that, simply prefix the Zillow URL with the ProxyCrawl endpoint provided in your dashboard.
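A minimal sketch of what that prefixing might look like, assuming ProxyCrawl's generic API endpoint; `YOUR_TOKEN` is a placeholder for the token from your own dashboard:

```python
import urllib.parse

import requests

PROXYCRAWL_TOKEN = "YOUR_TOKEN"  # placeholder: use the token from your dashboard

def proxied(url):
    # Route the request through ProxyCrawl so the proxy fetches the page
    # (and deals with recaptcha) on our behalf.
    return "https://api.proxycrawl.com/?token={0}&url={1}".format(
        PROXYCRAWL_TOKEN, urllib.parse.quote_plus(url))

response = requests.get(proxied("https://www.zillow.com/homes/for_sale/10118_rb/"))
```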
@gumdropsteve are you a realtor? I might be able to help.
I am also getting blank results. Did you get any solution?
It works for me if you don't pass the second argument, so: `python main.py zipcode`. It seems like the second argument is causing the scraper to yield no results...
Can someone clue me in as to why the results always cap at 200? Is Zillow limiting the return? No matter what zip code I try, only 200 results ever come back, with no errors or anything.
200 is not the result count. It's the status code received from zillow.com. The script will fetch only first-page results.
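If you need more than the first page, here is a hedged sketch of one approach. It assumes Zillow's `{n}_p` URL path segment for pagination (an assumption worth verifying in your browser) and reuses the script's `get_response` and `get_data_from_json` helpers:

```python
def parse_all_pages(zipcode, max_pages=20):
    # Hypothetical helper: walk pages 1..max_pages until a page yields
    # no listings, collecting the results from each page as we go.
    all_results = []
    for page in range(1, max_pages + 1):
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/{1}_p/".format(zipcode, page)
        response = get_response(url)
        if not response:
            break
        parser = html.fromstring(response.text)
        raw_json_data = parser.xpath(
            '//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        page_results = get_data_from_json(raw_json_data)
        if not page_results:
            break
        all_results.extend(page_results)
    return all_results
```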
Any way of getting all the pages for the given result?
Awesome code!
[rob@rawbdorable zillow] (master)$ python3 zillow.py 02126
I'm having the same problem as robstryker. I've been a Node.js dev for about a year, but this is my first time doing scraping. My issues are on lines 123/128: both of these return empty arrays. I can see from viewing the HTML that there isn't a 'search-results' div, but there is a 'grid-search-results'. Editing this has not changed the result though; it's always an empty array. Similarly, I can find 'mobileSearchPageStore' in the HTML. In both places, I see the text we are trying to filter down to (address, price, etc.). I'm not looking at the 'response.html' generated when you run the command though; this is me manually saving the webpage as HTML. I added a few print statements for response.text, search_results, and raw_json_data. My next thought is that the response mentions recaptcha and has a header with text saying to verify that I'm a human to continue, but I am more likely to think I'm doing something wrong than that the method is suddenly blocked. I did some more research while composing this and noticed that their robots.txt has a disallow line for /homes/*_rb, which seems to be exactly what we are requesting. Would that be a sufficient reason, or am I reading too much into this?
@kyle-moeller, any update?
Here is how to fix the code. Add this line to the beginning import section, then rewrite part of the `parse` function with the code below.
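The commenter's actual snippet did not survive here, but based on the observation above that the results container was renamed to 'grid-search-results', the fix was presumably along these lines; treat the selector as an assumption to verify against the live markup:

```python
# In parse(), try the renamed results container first, then fall back
# to the embedded JSON as before.
search_results = parser.xpath("//div[@id='grid-search-results']//article")
if not search_results:
    raw_json_data = parser.xpath(
        '//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
    return get_data_from_json(raw_json_data)
```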
Hi there - how can I make this iterate through more pages?
Thank you! It works out!!!
I want to make two modifications: scrape all results and add additional filters. I have already made the corrections above and have the code working. If anyone wants to help, send me a message.
Has anyone been able to figure out how to iterate this through all pages?
Can someone help me with this error?

File "C:\Users\user\Desktop\scraped.py", line 185, in
File "C:\Users\user\Desktop\scraped.py", line 116, in parse
File "C:\Users\user\Desktop\scraped.py", line 67, in get_response
File "C:\Users\user\Desktop\scraped.py", line 42, in save_to_file
File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
Hi friends. Has anyone helped you with this?
I think adding a UTF-8 parameter will fix this. Take a look at this Stack Overflow link. It worked for me.
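Concretely, the change meant here is an explicit encoding when writing `response.html`, so Windows machines that default to cp1252 don't fail on non-ASCII characters:

```python
def save_to_file(response):
    # Explicit utf-8 avoids UnicodeEncodeError on cp1252 systems.
    with open("response.html", 'w', encoding='utf-8') as fp:
        fp.write(response.text)
```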
Can anyone help me with this error? https://www.zillow.com/homes/for_sale/98075/0_singlestory/days_sort
Replace this with this
I can't get past captcha, anyone have any ideas?
I gave up on this, and instead used the Zillow to Excel plugin with Chromium. It is not completely automated, but it was sufficient to get the data that I wanted with a few minutes of effort (for sale and sold, for several zip code regions). https://chrome.google.com/webstore/detail/zillow-to-excel/aecdekdgjlncaadbdiciepplaobhcjgi?hl=en
Looks like Zillow has been blocking almost all kinds of automated requests. I have tried multiple user agents, and it lets me create a soup once, but not iteratively. If someone finds a solution, please feel free to share it here!
Any luck? I'm getting the same error.
Thanks for all the comments above! I fixed the code based on them and it works! I modified two parts: first, I replaced row 81. Note that to run the code, you run it from the command line with a zip code. And if you want to scrape Zillow data for multiple zip codes, use a loop like the following, where each zip code is parsed and written to its own CSV:
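The commenter's snippet was lost; here is a minimal sketch of the multi-zip-code loop described, with an illustrative list of zip codes, assuming the `write_data_to_csv(data, zipcode)` signature used in the script above:

```python
zipcodes = ["98075", "10118", "02126"]  # illustrative values
for zc in zipcodes:
    scraped_data = parse(zc)
    if scraped_data:
        write_data_to_csv(scraped_data, zc)
```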
I was able to get past the captcha by changing the headers:
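The exact headers weren't shared, so as an illustration only (an assumption, not the commenter's actual fix), swapping in a fresher, Windows-flavoured user agent might look like this:

```python
import requests

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'accept-language': 'en-US,en;q=0.9',
    # Hypothetical replacement user agent; any current browser string works.
    'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/90.0.4430.93 Safari/537.36'),
}
response = requests.get("https://www.zillow.com/homes/for_sale/10118_rb/", headers=headers)
```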
I continue to get blank data returned in the CSV even though the script is getting a 200. Any ideas?