Created
February 12, 2018 07:57
-
-
Save scrapehero/2dd61d0f1bd5222a4c9ae76465990cbd to your computer and use it in GitHub Desktop.
Python 2 code to extract real estate listings from zillow.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
import requests | |
import unicodecsv as csv | |
from exceptions import ValueError | |
import argparse | |
def parse(zipcode, filter=None):
    """Scrape for-sale listings for a zip code from Zillow's search page.

    Args:
        zipcode: Zip code to search; interpolated into the search URL.
        filter: Sort order. "newest" and "cheapest" select the matching
            sorted search URL; any other value uses the default search URL.

    Returns:
        A list of dicts, one per search result flagged as for sale, with the
        keys 'address', 'city', 'state', 'postal_code', 'price',
        'facts and features', 'real estate provider', 'url' and 'title'.
        An empty list is returned when all five attempts fail.
    """
    if filter == "newest":
        search_url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    elif filter == "cheapest":
        search_url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        search_url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)

    # The headers never change between attempts, so build them once.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }

    # Up to five attempts; the first one that parses cleanly wins.
    for _ in range(5):
        try:
            # TLS certificate verification is left at the requests default
            # (enabled); the previous verify=False exposed the request to
            # tampering and triggered InsecureRequestWarning.
            response = requests.get(search_url, headers=headers)
            parser = html.fromstring(response.text)
            search_results = parser.xpath("//div[@id='search-results']//article")

            properties_list = []
            for listing in search_results:
                # Only results carrying the for-sale icon are reported.
                if not listing.xpath('.//span[@class="zsg-icon-for-sale"]'):
                    continue
                properties_list.append(_extract_listing(listing))
            return properties_list
        except Exception:
            # search_url is kept separate from the per-listing link so that a
            # retry requests the same page and this message names it.
            print("Failed to process the page %s" % (search_url,))

    # Every attempt failed: hand back an empty result the caller can iterate.
    return []


def _extract_listing(listing):
    """Build the output dict for a single search-result <article> element."""
    raw_address = listing.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
    raw_city = listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
    raw_state = listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
    raw_postal_code = listing.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
    raw_price = listing.xpath(".//span[@class='zsg-photo-card-price']//text()")
    raw_info = listing.xpath(".//span[@class='zsg-photo-card-info']//text()")
    raw_broker_name = listing.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
    link = listing.xpath(".//a[contains(@class,'overlay-link')]/@href")
    raw_title = listing.xpath(".//h4//text()")

    # Fields with no matching node become None; 'facts and features' is the
    # exception and becomes an empty string.
    return {
        # split()/join collapses runs of whitespace inside the address.
        'address': ' '.join(' '.join(raw_address).split()) if raw_address else None,
        'city': ''.join(raw_city).strip() if raw_city else None,
        'state': ''.join(raw_state).strip() if raw_state else None,
        'postal_code': ''.join(raw_postal_code).strip() if raw_postal_code else None,
        'price': ''.join(raw_price).strip() if raw_price else None,
        # The middle-dot separators (U+00B7) are replaced with commas.
        'facts and features': ' '.join(' '.join(raw_info).split()).replace(u"\xb7", ','),
        'real estate provider': ''.join(raw_broker_name).strip() if raw_broker_name else None,
        # The href is joined onto the site origin to form the listing URL.
        'url': "https://www.zillow.com" + link[0] if link else None,
        'title': ''.join(raw_title) if raw_title else None,
    }
if __name__ == "__main__":
    # Command line: a required zip code and an optional sort order.
    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='')
    sortorder_help = """
available sort orders are :
newest : Latest property details,
cheapest : Properties with cheapest price
"""
    argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You')
    args = argparser.parse_args()
    zipcode = args.zipcode
    sort = args.sort

    print("Fetching data for %s" % (zipcode))
    # Treat a missing result (None) as "no listings" so the header-only file
    # is still written instead of crashing while iterating.
    scraped_data = parse(zipcode, sort) or []

    print("Writing data to output file")
    # unicodecsv writes encoded bytes, so the file must be opened in binary
    # mode; text mode corrupts line endings on Windows.
    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in scraped_data:
            writer.writerow(row)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I can run it with no errors, but it returns a CSV file that is empty apart from the headers, for any zip code / sort order combination. The output is shown below:
C:\py>python zillow.py 77354 newest
Fetching data for 77354
C:\Python27\lib\site-packages\urllib3\connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings InsecureRequestWarning)
C:\Python27\lib\site-packages\urllib3\connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings InsecureRequestWarning)
Writing data to output file