Skip to content

Instantly share code, notes, and snippets.

@tylerjw
Created March 6, 2017 05:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save tylerjw/14a64099f8fe20733f27ac2e7dedee74 to your computer and use it in GitHub Desktop.
Save tylerjw/14a64099f8fe20733f27ac2e7dedee74 to your computer and use it in GitHub Desktop.
realtor.com parsing example for blog post
'''
This file is for parsing realtor.com for house listings.
'''
from lxml import html
import requests
from pprint import pprint
from bs4 import BeautifulSoup
import re
# Scheme and host that every site-relative path in this script is appended to.
base_url = 'http://www.realtor.com'

# One Session is reused for every request made below.
session = requests.Session()
# Send a desktop-browser User-Agent with each request.
# NOTE(review): presumably the site rejects the default client string -- confirm.
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
})
# Fetch the Colorado overview page and index its "top cities" table by city.
# Result: `city_dict` maps each city name to
#   {'url': <sale-listings URL>, 'sale': <count text>, 'rent': <count text>}
# where the counts are strings with thousands separators removed.
colorado_url = ''.join([base_url, '/local/Colorado'])
print('Requesting Page: {}'.format(colorado_url))
# requests waits forever unless a timeout is given, so bound the wait.
page = session.get(colorado_url, timeout=30)
# Stop on a 4xx/5xx answer instead of parsing an error page into an empty table.
page.raise_for_status()
tree = html.fromstring(page.content)

# Walk the table one row at a time so a city's name, sale count and rent count
# always come from the same <tr>; three independent column queries zipped
# together would shift out of step as soon as one row lacks a link.
# NOTE(review): the XPath requires a literal <tbody> in the served HTML --
# confirm against the live page, since the parser may not add one.
cities = []
sale = []
rent = []
for row in tree.xpath('//*[@id="top-cities"]/table/tbody/tr'):
    # column 2 is the homes for sale, we only want these links
    sale_anchors = row.xpath('td[2]/a')
    if not sale_anchors:
        continue  # row without a sale link (e.g. a header row)
    anchor = sale_anchors[0]
    if anchor.get('title') is None or anchor.get('href') is None:
        continue  # cannot derive a city name or a URL from this link
    cities.append(anchor)
    # number of homes for sale, commas removed
    sale.append(''.join(anchor.xpath('text()')).replace(',', '').strip())
    # number of homes for rent (column 3), commas removed; empty string when
    # the row has no rent link
    rent.append(''.join(row.xpath('td[3]/a/text()')).replace(',', '').strip())

# build the names list - city names
names = [c.get('title').replace('homes for sale in ', '') for c in cities]
# build the link list; pgsz is set to the sale count to make all sale listings
# show on one page (for testing, '?pgsz=1' returns a single result)
links = [
    ''.join([base_url, c.get('href'), '?pgsz={}'.format(s)])
    for c, s in zip(cities, sale)
]
# build the dictionary of cities
city_dict = {
    n: {'url': l, 'sale': s, 'rent': r}
    for n, l, s, r in zip(names, links, sale, rent)
}
#iterate through all the cities
# for city,data in city_dict.items():
# sale = data['sale']
# rent = data['rent']
# url = data['url']
# print("{} ({}, {}): {}".format(city,sale,rent,url))
# Download and parse the sale-listings page of one city from `city_dict`.
city = 'Aurora'
data = city_dict[city]  # raises KeyError if the city is not in the table
url = data['url']
print('Requesting page: {}'.format(url))
# requests waits forever unless a timeout is given, so bound the wait.
response = session.get(url, timeout=30)
# An HTTP error page contains no listing links; fail here rather than
# continuing and silently printing nothing.
response.raise_for_status()
# Decode leniently: undecodable bytes are dropped instead of raising.
page = response.content.decode('utf8', 'ignore')
soup = BeautifulSoup(page, 'html.parser')
# Collect the unique detail-page URL of every listing linked from `soup`.
# A set is used so a listing linked several times is only stored once.
address_links = set()
remove_arguments = re.compile(r'[^?]*')  # match until the question mark
for link in soup.find_all(href=re.compile('detail')):
    href = link.get('href', None)
    if not href:
        continue
    # Drop the query string so links that differ only in their arguments
    # collapse to the same entry.
    href = remove_arguments.search(href).group(0)
    # Use the href verbatim -- it is already text, and running it through
    # str.format() would raise or mangle it if it contained '{' or '}'.
    # Only site-relative hrefs get the host prepended.
    if not href.startswith(('http://', 'https://')):
        href = ''.join([base_url, href])
    address_links.add(href)
# Visit every collected listing page and print its price and street address.
# Sorted so the output order is reproducible (set iteration order is not).
for url in sorted(address_links):
    try:
        # requests waits forever unless a timeout is given, so bound the wait.
        response = session.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # One unreachable listing should not abort the rest of the crawl.
        print('Skipping {}: {}'.format(url, err))
        continue
    page = response.content.decode('utf8', 'ignore')
    soup = BeautifulSoup(page, 'html.parser')
    address_tag = soup.find(itemprop='streetAddress')
    price_tag = soup.find(itemprop='price')
    # find() returns None when the page has no such element.
    if address_tag is None or price_tag is None:
        print('Skipping {}: price or address not found'.format(url))
        continue
    # get_text() also copes with nested markup, where .string is None.
    # The values stay text (no .encode) so they print as plain strings rather
    # than b'...' reprs.
    address = address_tag.get_text(strip=True)
    price = price_tag.get_text(strip=True)
    print("Price: {}, Address: {}".format(price, address))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment