Created
August 14, 2020 19:43
-
-
Save damc-dev/7f89718e5374d4e9e3eff0529eb2bcfd to your computer and use it in GitHub Desktop.
Script to scrape addresses from Zillow and write them to CSVs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
import requests | |
import unicodecsv as csv | |
import argparse | |
def parse(zipcode, page=0, filter=None):
    """Scrape one Zillow search-results page and extract listing addresses.

    Args:
        zipcode: zipcode (or state abbreviation) to search for.
        page: results-page number, interpolated into the Zillow URL.
        filter: optional sort order -- "newest" or "cheapest"; any other
            value (including None) uses Zillow's default ordering.
            NOTE(review): the name shadows the builtin ``filter``; kept
            unchanged for backward compatibility with existing callers.

    Returns:
        A list of dicts, each with a single 'address' key of the form
        "street, city, state, zip", for every for-sale listing whose
        street address begins with a digit.
    """
    if filter == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/{1}_p/0_singlestory/days_sort".format(zipcode, page)
    elif filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/{1}_p/0_singlestory/pricea_sort/".format(zipcode, page)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/{1}_p/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode, page)
    # BUG FIX: the original wrapped the request in `for page in range(5):`,
    # which shadowed the `page` parameter and was dead code anyway -- the
    # function returned on the first iteration. The loop is removed.
    # Browser-like headers: Zillow tends to block the default requests UA.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    print(response.status_code)
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    properties_list = []
    # Renamed loop variable: the original iterated `properties` and then
    # reassigned `properties` to a result dict inside the same loop body.
    for listing in search_results:
        raw_address = listing.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = listing.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        is_forsale = listing.xpath('.//span[@class="zsg-icon-for-sale"]')

        address = ' '.join(' '.join(raw_address).split()) if raw_address else None
        city = ''.join(raw_city).strip() if raw_city else None
        state = ''.join(raw_state).strip() if raw_state else None
        postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None

        # Skip listings that are not for sale or have no numeric street
        # address. Checking `address` first fixes the original crash on
        # `address[0]` when the street address was missing (address=None).
        if not (is_forsale and address and address[0].isdigit()):
            continue
        # Join only the components that were actually found; the original
        # `address + ", " + city + ...` raised TypeError on any None part.
        parts = [part for part in (address, city, state, postal_code) if part]
        properties_list.append({'address': ', '.join(parts)})
    return properties_list
if __name__=="__main__": | |
argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) | |
argparser.add_argument('--zipcode', help = '') | |
argparser.add_argument('--pages', type=int, default=0, help = '') | |
sortorder_help = """ | |
available sort orders are : | |
newest : Latest property details, | |
cheapest : Properties with cheapest price | |
""" | |
#argparser.add_argument('sort',nargs='?',help = sortorder_help,default ='Homes For You') | |
args = argparser.parse_args() | |
zipcodes = ["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"] | |
if args.zipcode: | |
zipcodes = [args.zipcode] | |
pages = args.pages | |
print ("Requesting pages %i"%(pages)) | |
#sort = args.sort | |
scraped_data = [] | |
for zipcode in zipcodes: | |
for page in range(pages): | |
print ("Fetching data for %s"%(zipcode)) | |
scraped_data.extend(parse(zipcode, page)) | |
print ("Writing data to output file") | |
with open("addresses-%s.csv"%(zipcode),'w') as writer: | |
for row in scraped_data: | |
writer.write(row["address"] + "\n") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment