-
-
Save scrapehero/5f51f344d68cf2c022eb2d23a2f1cf95 to your computer and use it in GitHub Desktop.
from lxml import html | |
import requests | |
import unicodecsv as csv | |
import argparse | |
import json | |
def clean(text):
    """Join a list of text fragments and collapse runs of whitespace.

    Returns the normalized string, or None when *text* is empty/None.
    """
    if not text:
        return None
    return ' '.join(' '.join(text).split())
def get_headers():
    """Return browser-like HTTP headers so Zillow serves a normal page."""
    return {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    }
def create_url(zipcode, filter):
    """Build the Zillow for-sale search URL for *zipcode*.

    *filter* selects the sort order: "newest", "cheapest", or anything
    else for Zillow's default "Homes For You" ordering.
    """
    templates = {
        "newest": "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort",
        "cheapest": "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/",
    }
    default = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy"
    url = templates.get(filter, default).format(zipcode)
    print(url)
    return url
def save_to_file(response):
    """Save the HTTP response body to `response.html` for offline debugging.

    The file is written as UTF-8 explicitly: without it, Python falls back
    to the platform default encoding (cp1252 on Windows), which raises
    UnicodeEncodeError on non-Latin-1 characters in the page.
    """
    with open("response.html", 'w', encoding="utf-8") as fp:
        fp.write(response.text)
def write_data_to_csv(data, zipcode=None):
    """Write the scraped property dicts to `properties-<zipcode>.csv`.

    The original version read `zipcode` from a module-level global set by
    the __main__ block; that hidden dependency made the function crash with
    NameError when imported and called directly. The global is kept as a
    backward-compatible fallback, but callers can now pass `zipcode`.

    The file is opened in binary mode because `unicodecsv` (imported as
    `csv` at the top of this file) writes encoded bytes.
    """
    if zipcode is None:
        # Backward-compat: fall back to the global set in __main__.
        zipcode = globals().get("zipcode", "output")
    fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price',
                  'facts and features', 'real estate provider', 'url']
    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
def get_response(url):
    """Fetch *url*, retrying up to 5 times.

    Every response body is dumped to `response.html` (via save_to_file)
    so failures can be inspected offline. Returns the first response with
    status 200, or None if all attempts fail.
    """
    for attempt in range(5):
        response = requests.get(url, headers=get_headers())
        print("status code received:", response.status_code)
        # Dump the body regardless of outcome — useful for debugging blocks/captchas.
        save_to_file(response)
        if response.status_code == 200:
            return response
    return None
def get_data_from_json(raw_json_data):
    """Extract property listings from Zillow's embedded JSON blob
    (type 2 of their A/B-tested result pages).

    :param raw_json_data: list of text fragments from the
        `mobileSearchPageStore` script tag (may be wrapped in HTML comments).
    :return: list of property dicts, or None when the payload is missing
        or is not valid JSON.

    Fixes vs. the original:
    - Newer Zillow pages nest results under a top-level 'cat1' key; the old
      `json_data.get('searchResults')` returned None and crashed with
      AttributeError. Both layouts are now handled with safe navigation.
    - Empty/missing script content no longer crashes `.replace` on None.
    - A listing without 'hdpData'/'homeInfo' no longer crashes.
    """
    if not raw_json_data:
        print("Invalid json")
        return None
    # Normalize whitespace (same as clean()) and strip the HTML comment markers.
    cleaned_data = ' '.join(' '.join(raw_json_data).split()) \
        .replace('<!--', "").replace("-->", "")
    try:
        json_data = json.loads(cleaned_data)
    except ValueError:
        print("Invalid json")
        return None
    # Newer pages nest everything under 'cat1'; older pages keep
    # 'searchResults' at the top level.
    container = json_data.get('cat1') or json_data
    search_results = (container.get('searchResults') or {}).get('listResults', [])
    properties_list = []
    for properties in search_results:
        property_info = properties.get('hdpData', {}).get('homeInfo') or {}
        bedrooms = properties.get('beds')
        bathrooms = properties.get('baths')
        area = properties.get('area')
        properties_list.append({
            'address': properties.get('addressWithZip'),
            'city': property_info.get('city'),
            'state': property_info.get('state'),
            'postal_code': property_info.get('zipcode'),
            'price': properties.get('price'),
            'facts and features': f'{bedrooms} bds, {bathrooms} ba ,{area} sqft',
            'real estate provider': properties.get('brokerName'),
            'url': properties.get('detailUrl'),
            'title': properties.get('statusText'),
        })
    return properties_list
def parse(zipcode, filter=None):
    """Scrape property listings for *zipcode* from zillow.com.

    Handles both page variants served by Zillow's A/B testing: an embedded
    JSON payload (delegated to get_data_from_json) or a plain HTML result
    list. Returns a list of property dicts, or None when the fetch fails.
    """
    response = get_response(create_url(zipcode, filter))
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    if not search_results:
        # identified as type 2 page
        print("parsing from json data")
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)
    print("parsing from html page")
    properties_list = []
    for listing in search_results:
        address = clean(listing.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()"))
        city = clean(listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()"))
        state = clean(listing.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()"))
        postal_code = clean(listing.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()"))
        price = clean(listing.xpath(".//span[@class='zsg-photo-card-price']//text()"))
        # The card info uses a middle-dot separator; normalize to commas.
        info = clean(listing.xpath(".//span[@class='zsg-photo-card-info']//text()")).replace(u"\xb7", ',')
        broker = clean(listing.xpath(".//span[@class='zsg-photo-card-broker-name']//text()"))
        title = clean(listing.xpath(".//h4//text()"))
        link = listing.xpath(".//a[contains(@class,'overlay-link')]/@href")
        property_url = "https://www.zillow.com" + link[0] if link else None
        # Only keep cards explicitly marked as for sale.
        if listing.xpath('.//span[@class="zsg-icon-for-sale"]'):
            properties_list.append({
                'address': address,
                'city': city,
                'state': state,
                'postal_code': postal_code,
                'price': price,
                'facts and features': info,
                'real estate provider': broker,
                'url': property_url,
                'title': title,
            })
    return properties_list
if __name__ == "__main__": | |
# Reading arguments | |
argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) | |
argparser.add_argument('zipcode', help='') | |
sortorder_help = """ | |
available sort orders are : | |
newest : Latest property details, | |
cheapest : Properties with cheapest price | |
""" | |
argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You') | |
args = argparser.parse_args() | |
zipcode = args.zipcode | |
sort = args.sort | |
print ("Fetching data for %s" % (zipcode)) | |
scraped_data = parse(zipcode, sort) | |
if scraped_data: | |
print ("Writing data to output file") | |
write_data_to_csv(scraped_data) |
Here is how to fix the code:
Add this line to the beginning import section:
from urllib.request import Request, urlopen
Then rewrite part of the "parse" function with the below code:
def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None
    # These two new lines are added
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    # Replace the parser input with the page fetched above
    # parser = html.fromstring(response.text)
    parser = html.fromstring(webpage)
Thank you! It works out!!!
I want to make two modifications: scrape all results and adding additional filters. I already made the corrections above and have code working. If anyone wants to help send me a message.
Hi there - how can I make this iterate through more pages?
I want to make two modifications: scrape all results and adding additional filters. I already made the corrections above and have code working. If anyone wants to help send me a message.
Has anyone been able to figure out how to iterate this through all pages?
Can someone help me with this error?
%run scraped.py 92115 newest
Fetching data for 92115
https://www.zillow.com/homes/for_sale/92115/0_singlestory/days_sort
status code received: 200
Traceback (most recent call last):
File "C:\Users\user\Desktop\scraped.py", line 185, in
scraped_data = parse(zipcode, sort)
File "C:\Users\user\Desktop\scraped.py", line 116, in parse
response = get_response(url)
File "C:\Users\user\Desktop\scraped.py", line 67, in get_response
save_to_file(response)
File "C:\Users\user\Desktop\scraped.py", line 42, in save_to_file
fp.write(response.text)
File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to &lt;undefined&gt;
I want to make two modifications: scrape all results and adding additional filters. I already made the corrections above and have code working. If anyone wants to help send me a message.
Hi friends. Has anyone helped you with this?
File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to
I think adding a UTF-8 parameter will fix this. Take a look at this stack overflow link. It worked for me.
can anyone help me with this error?
https://www.zillow.com/homes/for_sale/98075/0_singlestory/days_sort
status code received: 200
parsing from json data
Traceback (most recent call last):
File "C:/Users/Downloads/zillow.py", line 191, in
scraped_data = parse(zipcode, sort)
File "C:/Users/Downloads/zillow.py", line 135, in parse
return get_data_from_json(raw_json_data)
File "C:/Users/Downloads/zillow.py", line 80, in get_data_from_json
search_results = json_data.get('searchResults').get('listResults', [])
AttributeError: 'NoneType' object has no attribute 'get'
replace this
search_results = json_data.get('searchResults').get('listResults', [])
with this
search_results = json_data.get('cat1').get('searchResults').get('listResults', [])
I can't get past captcha, anyone have any ideas?
I gave up on this, and instead used the Zillow to Excel plugin with Chromium. It is not completely automated, but was sufficient to get the data that I wanted with a few minutes of effort (for sale and sold for several zip code regions). https://chrome.google.com/webstore/detail/zillow-to-excel/aecdekdgjlncaadbdiciepplaobhcjgi?hl=en
Looks like Zillow has been blocking almost all kinds of automated requests. Have tried multiple user agents, and it lets me create a soup once, but not iteratively. If someone finds a solution, please feel free to share it here!
I want to make two modifications: scrape all results and adding additional filters. I already made the corrections above and have code working. If anyone wants to help send me a message.
can anyone help me with this error?
https://www.zillow.com/homes/for_sale/98075/0_singlestory/days_sort
status code received: 200
parsing from json data
Traceback (most recent call last):
File "C:/Users/Downloads/zillow.py", line 191, in
scraped_data = parse(zipcode, sort)
File "C:/Users/Downloads/zillow.py", line 135, in parse
return get_data_from_json(raw_json_data)
File "C:/Users/Downloads/zillow.py", line 80, in get_data_from_json
search_results = json_data.get('searchResults').get('listResults', [])
AttributeError: 'NoneType' object has no attribute 'get'
Any luck? I'm getting the same error.
Thanks for all the comments above! I fix the code based on your comments and it works!
I modify two parts:
First, replace the row 81:
the wrong version: search_results = json_data.get('searchResults').get('listResults', [])
the correct version: search_results = json_data.get('cat1').get('searchResults').get('listResults', [])
Second, import io, and then:
replace "with open("response.html", 'w') as fp:
" with:
with io.open("response.html", 'w', encoding="utf-8") as fp:
Note: to run the code, run `python zillow.py 60615` in the terminal, where 60615 is an example zip code.
And if you want to scrape Zillow data of multiple zip codes, use the following:
`run_cmd = 'python zillow.py '
for zc in zip_code_lst:
z_run_cmd = run_cmd + zc
os.system(z_run_cmd)`
where zip_code_lst
is a list of zip codes.
I was able to get past the captcha changing the headers:
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.8',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
Can someone help me with this error?
%run scraped.py 92115 newest
Fetching data for 92115
https://www.zillow.com/homes/for_sale/92115/0_singlestory/days_sort
status code received: 200
Traceback (most recent call last):File "C:\Users\user\Desktop\scraped.py", line 185, in
scraped_data = parse(zipcode, sort)File "C:\Users\user\Desktop\scraped.py", line 116, in parse
response = get_response(url)File "C:\Users\user\Desktop\scraped.py", line 67, in get_response
save_to_file(response)File "C:\Users\user\Desktop\scraped.py", line 42, in save_to_file
fp.write(response.text)File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to &lt;undefined&gt;
You need to write in binary mode, i.e.
with open(YOURFILE, "wb") as f:
    f.write(SOMETHING)
Looks like Zillow has been blocking almost all kinds of automated requests. Have tried multiple user agents, and it lets me create a soup once, but not iteratively. If someone finds a solution, please feel free to share it here!
???
Hi can someone help me, i keep getting this error code
usage: zillow.py [-h] zipcode [sort]
zillow.py: error: the following arguments are required: zipcode
Hi there - how can I make this iterate through more pages?