@damc-dev
Created August 14, 2020 19:43
Script to scrape addresses from Zillow and write them to CSVs
from lxml import html
import requests
import argparse


def parse(zipcode, page=0, sort=None):
    """Scrape one page of Zillow for-sale listings for a ZIP code (or state abbreviation)."""
    if sort == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/{1}_p/0_singlestory/days_sort".format(zipcode, page)
    elif sort == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/{1}_p/0_singlestory/pricea_sort/".format(zipcode, page)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/{1}_p/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode, page)
    # Browser-like headers; Zillow tends to block requests that look automated.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    # Retry up to 5 times: Zillow intermittently serves captcha/error pages.
    for attempt in range(5):
        response = requests.get(url, headers=headers)
        print(response.status_code)
        if response.status_code != 200:
            continue
        parser = html.fromstring(response.text)
        search_results = parser.xpath("//div[@id='search-results']//article")
        properties_list = []
        for listing in search_results:
            raw_address = listing.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
            raw_city = listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
            raw_state = listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
            raw_postal_code = listing.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
            raw_price = listing.xpath(".//span[@class='zsg-photo-card-price']//text()")
            raw_info = listing.xpath(".//span[@class='zsg-photo-card-info']//text()")
            raw_broker_name = listing.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
            url_parts = listing.xpath(".//a[contains(@class,'overlay-link')]/@href")
            raw_title = listing.xpath(".//h4//text()")
            address = ' '.join(' '.join(raw_address).split()) if raw_address else None
            city = ''.join(raw_city).strip() if raw_city else None
            state = ''.join(raw_state).strip() if raw_state else None
            postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None
            price = ''.join(raw_price).strip() if raw_price else None
            info = ' '.join(' '.join(raw_info).split()).replace(u"\xb7", ',')
            broker = ''.join(raw_broker_name).strip() if raw_broker_name else None
            title = ''.join(raw_title) if raw_title else None
            property_url = "https://www.zillow.com" + url_parts[0] if url_parts else None
            is_forsale = listing.xpath('.//span[@class="zsg-icon-for-sale"]')
            # price/info/broker/title/property_url are parsed but not currently
            # written out; only the combined address is kept. Skip listings
            # without a street number or not marked for sale.
            if is_forsale and address and address[0].isdigit():
                properties_list.append({
                    'address': ", ".join(part for part in (address, city, state, postal_code) if part),
                })
        return properties_list
    print("Failed to process the page", url)
    return []
if __name__ == "__main__":
    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('--zipcode', help='single ZIP code to scrape (default: all US states)')
    argparser.add_argument('--pages', type=int, default=0, help='number of result pages to fetch per area')
    sortorder_help = """
    Available sort orders:
    newest : latest listings first,
    cheapest : lowest price first
    """
    # argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You')
    args = argparser.parse_args()
    # By default, walk every US state abbreviation; Zillow accepts these in
    # the same URL slot as a ZIP code.
    zipcodes = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
                "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
                "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
                "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
                "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
    if args.zipcode:
        zipcodes = [args.zipcode]
    pages = args.pages
    print("Requesting %i pages" % pages)
    # sort = args.sort
    for zipcode in zipcodes:
        # Collect results per area so each CSV contains only its own addresses.
        scraped_data = []
        for page in range(pages):
            print("Fetching data for %s page %i" % (zipcode, page))
            scraped_data.extend(parse(zipcode, page))
        print("Writing data to output file")
        with open("addresses-%s.csv" % zipcode, 'w') as writer:
            for row in scraped_data:
                writer.write(row["address"] + "\n")
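For reference, a typical invocation might look like the following. The filename zillow_scraper.py and the ZIP code are placeholders, since the gist does not name the file:

    python zillow_scraper.py --zipcode 02126 --pages 2

This writes addresses-02126.csv with one comma-separated address per line. With no --zipcode, the script loops over all 50 state abbreviations instead; note that --pages defaults to 0, so at least --pages 1 is needed to fetch anything.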