Created
August 14, 2020 19:43
-
-
Save damc-dev/7f89718e5374d4e9e3eff0529eb2bcfd to your computer and use it in GitHub Desktop.
Script to scrape addresses from Zillow and write them to CSVs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
import requests | |
import unicodecsv as csv | |
import argparse | |
def parse(zipcode, page=0, filter=None):
    """Scrape one Zillow search-results page and extract listing addresses.

    Args:
        zipcode: zipcode (or state abbreviation) to search for.
        page: results-page number, interpolated into the Zillow URL.
        filter: optional sort order -- "newest" or "cheapest"; any other
            value (including None) uses Zillow's default ordering.
            NOTE(review): the name shadows the builtin ``filter``; kept
            unchanged for backward compatibility with existing callers.

    Returns:
        A list of dicts, each with a single 'address' key of the form
        "street, city, state, zip", for every for-sale listing whose
        street address begins with a digit.
    """
    if filter == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/{1}_p/0_singlestory/days_sort".format(zipcode, page)
    elif filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/{1}_p/0_singlestory/pricea_sort/".format(zipcode, page)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/{1}_p/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode, page)
    # BUG FIX: the original wrapped the request in `for page in range(5):`,
    # which shadowed the `page` parameter and was dead code anyway -- the
    # function returned on the first iteration. The loop is removed.
    # Browser-like headers: Zillow tends to block the default requests UA.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    print(response.status_code)
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    properties_list = []
    # Renamed loop variable: the original iterated `properties` and then
    # reassigned `properties` to a result dict inside the same loop body.
    for listing in search_results:
        raw_address = listing.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = listing.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        is_forsale = listing.xpath('.//span[@class="zsg-icon-for-sale"]')

        address = ' '.join(' '.join(raw_address).split()) if raw_address else None
        city = ''.join(raw_city).strip() if raw_city else None
        state = ''.join(raw_state).strip() if raw_state else None
        postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None

        # Skip listings that are not for sale or have no numeric street
        # address. Checking `address` first fixes the original crash on
        # `address[0]` when the street address was missing (address=None).
        if not (is_forsale and address and address[0].isdigit()):
            continue
        # Join only the components that were actually found; the original
        # `address + ", " + city + ...` raised TypeError on any None part.
        parts = [part for part in (address, city, state, postal_code) if part]
        properties_list.append({'address': ', '.join(parts)})
    return properties_list
if __name__=="__main__": | |
argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) | |
argparser.add_argument('--zipcode', help = '') | |
argparser.add_argument('--pages', type=int, default=0, help = '') | |
sortorder_help = """ | |
available sort orders are : | |
newest : Latest property details, | |
cheapest : Properties with cheapest price | |
""" | |
#argparser.add_argument('sort',nargs='?',help = sortorder_help,default ='Homes For You') | |
args = argparser.parse_args() | |
zipcodes = ["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"] | |
if args.zipcode: | |
zipcodes = [args.zipcode] | |
pages = args.pages | |
print ("Requesting pages %i"%(pages)) | |
#sort = args.sort | |
scraped_data = [] | |
for zipcode in zipcodes: | |
for page in range(pages): | |
print ("Fetching data for %s"%(zipcode)) | |
scraped_data.extend(parse(zipcode, page)) | |
print ("Writing data to output file") | |
with open("addresses-%s.csv"%(zipcode),'w') as writer: | |
for row in scraped_data: | |
writer.write(row["address"] + "\n") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment