# StreetEasy web scrape
# Author - Aravind Kr
import re
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()
driver.get('http://streeteasy.com/for-rent/nyc')
name = []
rent = []
type1 = []
url = []
# Part 1: scrape the rental search-result pages one by one and collect the
# name, rent, unit type and URL for the 18,000+ listings.
for i in range(1587):
    try:
        # Wait up to 5 seconds for the pagination control to appear.
        wait = WebDriverWait(driver, 5)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'next')))
        print "Page is ready!"
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # The listing name and its relative URL share the same title block.
        for div in soup.find_all('div', class_='details-title'):
            name.append(div.find('a').string)
            url.append(div.a.get('href'))
        for span in soup.find_all('span', class_='price'):
            rent.append(span.string)
        for span in soup.find_all('span', class_='first_detail_cell'):
            type1.append(span.string)
        driver.find_element_by_class_name('next').click()
    except TimeoutException:
        print "Loading took too much time!"
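# A small robustness helper, sketched here but not called by the scrape above
# (a hedged sketch using only the selenium imports already present): wait
# until an element is visible before clicking it.
def wait_until_visible_then_click(element):
    element = WebDriverWait(driver, 5, poll_frequency=0.2).until(
        EC.visibility_of(element))
    element.click()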
link = ['http://streeteasy.com' + s for s in url]
data = {'Apartment': name, 'Rent': rent, 'Url': link}
nycrental1 = pd.DataFrame(data)
nycrental1.to_csv('nycrental.csv')        # persist Part 1's results
nycrental = pd.read_csv('nycrental.csv')  # reload them for Part 2 below
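# Optional cleanup sketch (an assumption about the data, not something the
# original run did): the scraped Rent strings look like '$2,500', so a numeric
# column could be derived with, e.g.,
#   nycrental['Rent'] = nycrental['Rent'].str.replace(r'[$,]', '').astype(float)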
columns = ['id', 'location', 'daysonmarket', 'pricechange', 'details',
           'position', 'amenities', 'listings']
frame = pd.DataFrame(columns=columns)

def xstr(s):
    # Coerce None to '' so the string joins below never fail.
    if s is None:
        return ''
    return str(s)

# Part 2: visit each of the 18,000+ URLs and collect detailed information
# from the individual listing pages. This took about 6-7 hours overnight.
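# A possible efficiency tweak (not part of the original run): reuse one HTTP
# connection pool across the ~18,000 requests, e.g.
#   session = requests.Session()
# and call session.get(...) inside the loop instead of requests.get(...).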
for i in range(0, 18815):
    soup1 = BeautifulSoup(requests.get(nycrental['Url'][i]).text, 'html.parser')

    # listing availability
    yy2 = []
    k = soup1.find_all('div', class_='details_info')
    for div in k:
        if div.find('h6') is not None:
            yy2.append(div.find('h6'))
    if yy2:
        yy2 = yy2[0].parent.text

    # days on market
    days = []
    for div in k:
        if div.find('p') is not None:
            days.append(div.find('p').string)
    # location
    loc = []
    for span in soup1.find_all('span', class_='nobreak'):
        if span.find('a') is not None:
            loc.append(span.find('a').string)

    # amenities
    amen = []
    for div in soup1.find_all('div', class_='third'):
        if div.find('li') is not None:
            amen.append(div.find('li').string)

    # nearest subway
    for elem in soup1(text=re.compile('subways')):
        yy2 = elem.parent.parent

    # other listings linked from the page
    listing = []
    for div in soup1.find_all('div', class_='inline'):
        if div.find('a') is not None:
            listing.append(div.find('a').string)
    # detail cells (beds, baths, sqft, ...)
    k6 = soup1.find_all('span', class_='detail_cell')
    details = []
    for j in range(0, len(k6)):
        details.append(k6[j].string)
    # locate the last detail cell so anything after it is dropped
    lim = len(k6) - 1  # fallback in case no last_detail_cell is present
    for p in range(0, len(k6)):
        cls = k6[p].get('class')
        if len(cls) == 2 and cls[1] == 'last_detail_cell':
            lim = p
            break
    details = details[0:lim + 1]

    # last price change
    k8 = soup1.find_all('span', class_='price_change')
    pricechange = []
    # geo location from the page's geo.position <meta> tag
    k9 = soup1.find_all('meta')
    lst = filter(lambda tag: tag.has_attr('name'), k9)
    filt = filter(lambda tag: tag['name'] == 'geo.position', lst)
    if filt:
        position = filt[0]['content']
    else:
        position = ''
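    # Hedged aside: geo.position content is conventionally "latitude;longitude",
    # so coordinates could later be split out with, e.g.,
    #   lat, lng = position.split(';')
    # assuming that format holds; this script keeps the raw string.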
    # concatenate the collected fields into single strings per listing
    for q in range(len(amen)):
        amen[q] = xstr(amen[q])
    for q in range(len(details)):
        details[q] = unicode(xstr(details[q]))
    for q in range(len(listing)):
        listing[q] = unicode(xstr(listing[q]))
    listing = ' '.join(listing)
    details = ' '.join(details)
    print amen
    if len(amen) > 0:
        amen = ' '.join(amen)
    else:
        amen = ' '
    # update the pandas DataFrame
    if k8:
        pricechange = k8[0].text
    location = loc[0] if loc else unicode('')
    daysonmarket = days[0] if days else unicode('')
    if len(pricechange) == 0:
        pricechange = unicode('')
    if len(details) == 0:
        details = unicode('')
    if len(listing) == 0:
        listing = unicode('')
    location = unicodedata.normalize('NFKD', location).encode('ascii', 'ignore')
    daysonmarket = unicodedata.normalize('NFKD', daysonmarket).encode('ascii', 'ignore')
    pricechange = unicodedata.normalize('NFKD', pricechange).encode('ascii', 'ignore')
    details = unicodedata.normalize('NFKD', details).encode('ascii', 'ignore')
    listing = unicodedata.normalize('NFKD', listing).encode('ascii', 'ignore')
    print i
    r = [i, location, daysonmarket, pricechange, details, position, amen, listing]
    t = pd.DataFrame([r], columns=columns)
    frame = pd.concat([t, frame], axis=0)
    if i % 100 == 0:
        frame.to_csv('nycrental' + str(i) + '.csv')  # periodic checkpoint
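# Each checkpoint above rewrites the whole cumulative frame, so after a crash
# only the newest file needs reloading (a hedged recovery sketch; 18800 is the
# last multiple of 100 the loop reaches):
#   frame = pd.read_csv('nycrental18800.csv')
# A final write after the loop captures the rows added since that checkpoint.
frame.to_csv('nycrental_final.csv')  # filename chosen here for illustration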