Created
March 6, 2017 05:48
-
-
Save tylerjw/14a64099f8fe20733f27ac2e7dedee74 to your computer and use it in GitHub Desktop.
realtor.com parsing example for blog post
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
This file is for parsing realtor.com for house listings. | |
''' | |
from lxml import html | |
import requests | |
from pprint import pprint | |
from bs4 import BeautifulSoup | |
import re | |
base_url = 'http://www.realtor.com'

# Persistent session with a desktop-browser User-Agent so realtor.com
# serves the normal HTML pages to the scraper.
session = requests.Session()
session.headers['User-Agent'] = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/34.0.1847.131 Safari/537.36'
)

# Landing page listing every Colorado city with its listing counts.
colorado_url = base_url + '/local/Colorado'
print('Requesting Page: {}'.format(colorado_url))
page = session.get(colorado_url)
tree = html.fromstring(page.content)
# Scrape the "top cities" table.  Column 2 holds the homes-for-sale
# anchors (we keep these elements for their titles and hrefs); column 3
# holds the homes-for-rent counts.
city_anchors = tree.xpath('//*[@id="top-cities"]/table/tbody/tr/td[2]/a')

# Listing counts as displayed (e.g. "1,234") with thousands commas removed.
for_sale_counts = [
    t.replace(',', '')
    for t in tree.xpath('//*[@id="top-cities"]/table/tbody/tr/td[2]/a/text()')
]
for_rent_counts = [
    t.replace(',', '')
    for t in tree.xpath('//*[@id="top-cities"]/table/tbody/tr/td[3]/a/text()')
]

# City names come from the anchor titles ("homes for sale in <city>").
city_names = [
    a.get('title').replace('homes for sale in ', '') for a in city_anchors
]

# Absolute listing URLs; the pgsz argument makes every sale listing for
# the city show up on a single page.
city_urls = [
    '{}{}?pgsz={}'.format(base_url, a.get('href'), count)
    for a, count in zip(city_anchors, for_sale_counts)
]

# Map each city name to its listing-page URL and listing counts.
city_dict = {
    name: {'url': u, 'sale': s, 'rent': r}
    for name, u, s, r in zip(city_names, city_urls, for_sale_counts, for_rent_counts)
}
# Collect links to the individual property detail pages for one city.
# (To process every city instead, iterate over city_dict.items() and
# repeat this section per entry.)
city = 'Aurora'
data = city_dict[city]
url = data['url']
print('Requesting page: {}'.format(url))
page = session.get(url).content.decode('utf8', 'ignore')
soup = BeautifulSoup(page, 'html.parser')

# Every property-detail anchor contains "detail" in its href.  The same
# property can appear several times with different tracking query
# strings, so strip everything from the '?' on and dedupe with a set.
address_links = set()
remove_arguments = re.compile(r'[^?]*')  # match up to (not including) any '?'
for link in soup.find_all(href=re.compile('detail')):
    href = link.get('href', None)
    if href:
        href = remove_arguments.search(href).group(0)
        # BUG FIX: the original appended url.format('utf8'), which is a
        # no-op for plain hrefs but would corrupt any href containing
        # '{}' placeholders; join the cleaned path directly instead.
        address_links.add(base_url + href)
# Fetch each property detail page and report its address and asking
# price, read from the page's schema.org microdata attributes.
for url in address_links:
    page = session.get(url).content.decode('utf8', 'ignore')
    soup = BeautifulSoup(page, 'html.parser')
    # find() returns None when the page lacks the itemprop markup
    # (e.g. a delisted home), so guard before dereferencing .string.
    address_tag = soup.find(itemprop='streetAddress')
    price_tag = soup.find(itemprop='price')
    if address_tag is None or price_tag is None:
        print('Skipping {}: missing address/price markup'.format(url))
        continue
    # BUG FIX: the original called .encode('utf8') on these values and
    # then formatted the resulting bytes, which prints "b'...'" under
    # Python 3; keep them as text and format directly.
    address = address_tag.string
    price = price_tag.string
    print("Price: {}, Address: {}".format(price, address))
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.