Created
March 6, 2017 05:48
-
-
Save tylerjw/14a64099f8fe20733f27ac2e7dedee74 to your computer and use it in GitHub Desktop.
realtor.com parsing example for blog post
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
This file is for parsing realtor.com for house listings. | |
''' | |
from lxml import html | |
import requests | |
from pprint import pprint | |
from bs4 import BeautifulSoup | |
import re | |
base_url = 'http://www.realtor.com'

# Persistent session with a desktop-browser User-Agent so realtor.com
# serves the normal HTML pages to the scraper.
session = requests.Session()
session.headers['User-Agent'] = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/34.0.1847.131 Safari/537.36'
)

# Landing page listing every Colorado city with its listing counts.
colorado_url = base_url + '/local/Colorado'
print('Requesting Page: {}'.format(colorado_url))
page = session.get(colorado_url)
tree = html.fromstring(page.content)
# Scrape the "top cities" table.  Column 2 holds the homes-for-sale
# anchors (we keep these elements for their titles and hrefs); column 3
# holds the homes-for-rent counts.
city_anchors = tree.xpath('//*[@id="top-cities"]/table/tbody/tr/td[2]/a')

# Listing counts as displayed (e.g. "1,234") with thousands commas removed.
for_sale_counts = [
    t.replace(',', '')
    for t in tree.xpath('//*[@id="top-cities"]/table/tbody/tr/td[2]/a/text()')
]
for_rent_counts = [
    t.replace(',', '')
    for t in tree.xpath('//*[@id="top-cities"]/table/tbody/tr/td[3]/a/text()')
]

# City names come from the anchor titles ("homes for sale in <city>").
city_names = [
    a.get('title').replace('homes for sale in ', '') for a in city_anchors
]

# Absolute listing URLs; the pgsz argument makes every sale listing for
# the city show up on a single page.
city_urls = [
    '{}{}?pgsz={}'.format(base_url, a.get('href'), count)
    for a, count in zip(city_anchors, for_sale_counts)
]

# Map each city name to its listing-page URL and listing counts.
city_dict = {
    name: {'url': u, 'sale': s, 'rent': r}
    for name, u, s, r in zip(city_names, city_urls, for_sale_counts, for_rent_counts)
}
# Collect links to the individual property detail pages for one city.
# (To process every city instead, iterate over city_dict.items() and
# repeat this section per entry.)
city = 'Aurora'
data = city_dict[city]
url = data['url']
print('Requesting page: {}'.format(url))
page = session.get(url).content.decode('utf8', 'ignore')
soup = BeautifulSoup(page, 'html.parser')

# Every property-detail anchor contains "detail" in its href.  The same
# property can appear several times with different tracking query
# strings, so strip everything from the '?' on and dedupe with a set.
address_links = set()
remove_arguments = re.compile(r'[^?]*')  # match up to (not including) any '?'
for link in soup.find_all(href=re.compile('detail')):
    href = link.get('href', None)
    if href:
        href = remove_arguments.search(href).group(0)
        # BUG FIX: the original appended url.format('utf8'), which is a
        # no-op for plain hrefs but would corrupt any href containing
        # '{}' placeholders; join the cleaned path directly instead.
        address_links.add(base_url + href)
# Fetch each property detail page and report its address and asking
# price, read from the page's schema.org microdata attributes.
for url in address_links:
    page = session.get(url).content.decode('utf8', 'ignore')
    soup = BeautifulSoup(page, 'html.parser')
    # find() returns None when the page lacks the itemprop markup
    # (e.g. a delisted home), so guard before dereferencing .string.
    address_tag = soup.find(itemprop='streetAddress')
    price_tag = soup.find(itemprop='price')
    if address_tag is None or price_tag is None:
        print('Skipping {}: missing address/price markup'.format(url))
        continue
    # BUG FIX: the original called .encode('utf8') on these values and
    # then formatted the resulting bytes, which prints "b'...'" under
    # Python 3; keep them as text and format directly.
    address = address_tag.string
    price = price_tag.string
    print("Price: {}, Address: {}".format(price, address))
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.