Skip to content

Instantly share code, notes, and snippets.

@websitescraper
Created June 9, 2021 08:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save websitescraper/27994d3c71fcd420c9b0f15b83e63960 to your computer and use it in GitHub Desktop.
Save websitescraper/27994d3c71fcd420c9b0f15b83e63960 to your computer and use it in GitHub Desktop.
Python 3 script to find real estate listings of properties up for sale on zillow.com
from lxml import html
import requests
import unicodecsv as csv
import argparse
import json
def clean(text):
    """Collapse scraped text fragments into one whitespace-normalised string.

    Args:
        text: A list of text fragments (for example the result of an XPath
            ``//text()`` query) or a single string.

    Returns:
        The fragments joined with single spaces, with every run of
        whitespace collapsed to one space and leading/trailing whitespace
        removed. Returns ``None`` when ``text`` is empty or ``None``.
    """
    if not text:
        return None
    if isinstance(text, str):
        # A bare string must not be iterated: joining its characters would
        # insert a space between every letter.
        text = [text]
    return ' '.join(' '.join(text).split())
def get_headers():
    """Return the browser-like HTTP headers sent with every request.

    A new dict is built on each call, so callers may modify the result.
    """
    header_pairs = (
        ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
        ('accept-encoding', 'gzip, deflate, sdch, br'),
        ('accept-language', 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4'),
        ('cache-control', 'max-age=0'),
        ('upgrade-insecure-requests', '1'),
        ('user-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'),
    )
    return dict(header_pairs)
def create_url(zipcode, filter):
    """Build the Zillow for-sale search URL for ``zipcode``.

    ``filter`` selects the sort order: ``"newest"`` or ``"cheapest"``. Any
    other value produces the default search URL. The chosen URL is printed
    before it is returned.
    """
    # Start from the default search page and override it for a known sort.
    template = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy"
    sorted_templates = (
        ("newest", "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort"),
        ("cheapest", "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/"),
    )
    for sort_name, sorted_template in sorted_templates:
        if filter == sort_name:
            template = sorted_template
            break

    search_url = template.format(zipcode)
    print(search_url)
    return search_url
def save_to_file(response):
    """Write the body of ``response`` to ``response.html`` for debugging.

    Args:
        response: An object exposing the decoded page body as ``.text``.

    The file is created in the current working directory and overwritten
    on every call.
    """
    # Write UTF-8 explicitly: the platform default codec (e.g. cp1252) cannot
    # encode every character a web page may contain and would raise
    # UnicodeEncodeError.
    with open("response.html", 'w', encoding="utf-8") as fp:
        fp.write(response.text)
def write_data_to_csv(data, zipcode=None):
    """Save the scraped listings to ``properties-<zipcode>.csv``.

    Args:
        data: Iterable of dicts keyed by the column names listed in
            ``fieldnames`` below.
        zipcode: Zip code used in the output file name. When omitted, the
            module-level ``zipcode`` set by the script entry point is used.

    Raises:
        ValueError: If no zip code is passed and none is defined at module
            level.
    """
    if zipcode is None:
        # Backward compatibility: earlier callers relied on the global that
        # only exists when the file is run as a script.
        zipcode = globals().get("zipcode")
    if zipcode is None:
        raise ValueError("zipcode is required to name the output file")

    fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
    # The file is opened in binary mode because `csv` here is unicodecsv,
    # which encodes the rows itself.
    with open(f"properties-{zipcode}.csv", 'wb') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
def get_response(url, retries=5, timeout=30):
    """Fetch ``url`` from zillow.com, retrying on failure.

    Args:
        url: The page to request.
        retries: Maximum number of attempts.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        The first response with status code 200, or ``None`` when every
        attempt failed.

    Every response received is saved to ``response.html`` so a failed run
    can be inspected afterwards.
    """
    for _ in range(retries):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=get_headers(), timeout=timeout)
        except requests.exceptions.RequestException as error:
            # A network error counts as a failed attempt instead of aborting
            # the whole retry loop.
            print("request failed:", error)
            continue
        print("status code received:", response.status_code)
        # saving response to file for debugging purpose.
        save_to_file(response)
        if response.status_code == 200:
            return response
    return None
def get_data_from_json(raw_json_data):
    """Extract listings from the JSON embedded in the "type 2" search page.

    Args:
        raw_json_data: Text fragments of the script element holding the
            search state, as returned by an XPath ``//text()`` query. The
            JSON may be wrapped in an HTML comment (``<!-- ... -->``).

    Returns:
        A list with one dict per listing (keys: address, city, state,
        postal_code, price, facts and features, real estate provider, url,
        title). Returns ``None`` when the input is empty or is not a valid
        JSON object.
    """
    if not raw_json_data:
        # clean() returns None for empty input; calling .replace on it would
        # raise AttributeError.
        print("Invalid json")
        return None

    cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
    try:
        json_data = json.loads(cleaned_data)
    except ValueError:
        print("Invalid json")
        return None
    if not isinstance(json_data, dict):
        print("Invalid json")
        return None

    # Any level of the payload may be missing or null; fall back to empty
    # containers so a sparse listing yields None fields instead of a crash.
    search_results = (json_data.get('searchResults') or {}).get('listResults') or []
    properties_list = []
    for listing in search_results:
        home_info = (listing.get('hdpData') or {}).get('homeInfo') or {}
        bedrooms = listing.get('beds')
        bathrooms = listing.get('baths')
        area = listing.get('area')
        properties_list.append({
            'address': listing.get('addressWithZip'),
            'city': home_info.get('city'),
            'state': home_info.get('state'),
            'postal_code': home_info.get('zipcode'),
            'price': listing.get('price'),
            'facts and features': f'{bedrooms} bds, {bathrooms} ba ,{area} sqft',
            'real estate provider': listing.get('brokerName'),
            'url': listing.get('detailUrl'),
            'title': listing.get('statusText'),
        })
    return properties_list
def parse(zipcode, filter=None):
    """Scrape the for-sale listings Zillow shows for ``zipcode``.

    Args:
        zipcode: Zip code to search.
        filter: Sort order passed on to ``create_url``.

    Returns:
        A list of listing dicts (keys: address, city, state, postal_code,
        price, facts and features, real estate provider, url, title), or
        ``None`` when the page could not be fetched or its embedded JSON
        could not be read.
    """
    search_url = create_url(zipcode, filter)
    response = get_response(search_url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None

    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    if not search_results:
        print("parsing from json data")
        # identified as type 2 page
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)

    print("parsing from html page")
    properties_list = []
    for card in search_results:
        # Only cards carrying the for-sale icon are kept, so skip the rest
        # before extracting any fields.
        if not card.xpath('.//span[@class="zsg-icon-for-sale"]'):
            continue

        raw_address = card.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = card.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = card.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = card.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = card.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = card.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = card.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        links = card.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = card.xpath(".//h4//text()")

        info = clean(raw_info)
        if info is not None:
            # clean() returns None when the card has no info span; only
            # swap the middle-dot separators when there is text.
            info = info.replace(u"\xb7", ',')

        properties_list.append({
            'address': clean(raw_address),
            'city': clean(raw_city),
            'state': clean(raw_state),
            'postal_code': clean(raw_postal_code),
            'price': clean(raw_price),
            'facts and features': info,
            'real estate provider': clean(raw_broker_name),
            'url': "https://www.zillow.com" + links[0] if links else None,
            'title': clean(raw_title),
        })
    return properties_list
if __name__ == "__main__":
    # Command-line entry point: read the zip code and optional sort order,
    # scrape the listings and write them to a CSV file.
    sortorder_help = (
        "\n"
        "available sort orders are :\n"
        "newest : Latest property details,\n"
        "cheapest : Properties with cheapest price\n"
    )
    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='')
    argparser.add_argument('sort', nargs='?', default='Homes For You', help=sortorder_help)
    args = argparser.parse_args()

    # `zipcode` stays a module-level name because write_data_to_csv reads it.
    zipcode, sort = args.zipcode, args.sort
    print("Fetching data for %s" % zipcode)

    scraped_data = parse(zipcode, sort)
    if scraped_data:
        print("Writing data to output file")
        write_data_to_csv(scraped_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment