Created
March 31, 2016 21:07
-
-
Save kraravind/4ca82439ef710d4afc23a8478b86fdaf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Streeteasy webscrape
# Author - Aravind Kr
#
# Scrapes StreetEasy NYC rental listings in two passes:
#   1) walk the search result pages with Selenium and collect listing URLs;
#   2) fetch each listing page with requests and extract detail fields.

# stdlib
import time
import unicodedata

# third-party
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Launch a Firefox browser and open the NYC rentals search page.
driver = webdriver.Firefox()
driver.get('http://streeteasy.com/for-rent/nyc')
| #i=2 | |
| #while i > 0: # while loop code | |
| name = [] | |
| rent= [] | |
| type1= [] | |
| url=[] | |
| # This portion scrapes the rental listing pages one by one and colelcts details of rent, name and Urls for 18000+ listings | |
| for i in range(1587): | |
| delay = 5 # seconds | |
| try: | |
| wait = WebDriverWait(driver, 5) | |
| wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'next'))) | |
| print "Page is ready!" | |
| # WebDriverWait(driver, delay).until(EC.presence_of_element_located(driver.find_element_by_class_name('next'))) | |
| html_doc = driver.page_source | |
| soup = BeautifulSoup(html_doc, 'html.parser') | |
| y=soup.find_all('div', class_="details-title") | |
| for div in y: | |
| name.append(div.find("a").string) | |
| y1 = soup.find_all('span', class_="price") | |
| for st in y1: | |
| rent.append(st.string) | |
| y2= soup.find_all('span', class_="first_detail_cell") | |
| for st in y2: | |
| type1.append(st.string) | |
| for st in y: | |
| url.append(st.a.get('href')) | |
| #for i in range(0,14): | |
| #print name[i] | |
| # name[i]=unicodedata.normalize('NFKD', name[i]).encode('ascii','ignore') | |
| # rent[i]=unicodedata.normalize('NFKD', rent[i]).encode('ascii','ignore') | |
| # type1[i]=unicodedata.normalize('NFKD', type1[i]).encode('ascii','ignore') | |
| # url[i]=unicodedata.normalize('NFKD', url[i]).encode('ascii','ignore') | |
| driver.find_element_by_class_name('next').click() | |
| except TimeoutException: | |
| print "Loading took too much time!" | |
# Build absolute detail-page URLs and persist the scraped listing index.
link = ['http://streeteasy.com' + s for s in url]
data = {'Apartment': name, 'Rent': rent, 'Url': link}
nycrental1 = pd.DataFrame(data)
# BUG FIX: the original overwrote nycrental1 with read_csv on the very next
# line, discarding the freshly scraped frame, while the detail loop below
# read an undefined name `nycrental`. Save the frame first, then read it
# back under the name the loop actually uses.
nycrental1.to_csv('nycrental.csv', index=False)
nycrental = pd.read_csv('nycrental.csv')

import requests

# Accumulator frame for the per-listing detail fields scraped in pass 2.
columns = ['id', 'location', 'daysonmarket', 'pricechange', 'details',
           'position', 'amenities', 'listings']
frame = pd.DataFrame(columns=columns)
| # This second portion goes to each of the 18000+ urls to collect a lot of detailed information from the listing pages. This took | |
| # about 6 -7 hrs overnight. | |
| for i in range(0,18815): | |
| soup1= BeautifulSoup(requests.get(nycrental['Url'][i]).text) | |
| #listing availability | |
| import re | |
| #yy1=[] | |
| #for elem in soup1(text=re.compile('Listing Availability')): | |
| # yy1=elem.parent.parent | |
| #if(yy1): | |
| #yy1=yy1.text.split("\n")[2] | |
| yy2=[] | |
| k=soup1.find_all('div', class_="details_info") | |
| for div in k: | |
| if div.find("h6") != None: | |
| yy2.append(div.find("h6")) | |
| #for elem in soup1(text=re.compile('Available On')): | |
| #yy1=elem.parent.parent | |
| if(yy2): | |
| yy2=yy2[0].parent.text | |
| #days in market | |
| days=[] | |
| for div in k: | |
| if div.find("p") != None: | |
| days.append(div.find("p").string) | |
| #location | |
| k1=soup1.find_all('span', class_="nobreak") | |
| loc=[] | |
| for div in k1: | |
| if div.find("a") != None: | |
| loc.append(div.find("a").string) | |
| #amenities | |
| k2=soup1.find_all('div', class_="third") | |
| amen=[] | |
| for div in k2: | |
| if div.find("li") != None: | |
| amen.append(div.find("li").string) | |
| #nearest subway | |
| k3=soup1.find_all('p') | |
| #subway=[] | |
| #for div in k3: | |
| # if div.find("b") != None: | |
| #subway.append(div.find("b").string) | |
| for elem in soup1(text=re.compile('subways')): | |
| yy2=elem.parent.parent | |
| # | |
| k4=soup1.find_all('div', class_="inline") | |
| listing=[] | |
| for div in k4: | |
| if div.find("a") != None: | |
| listing.append(div.find("a").string) | |
| #k5=soup1.find_all('span', class_="detail_cell first_detail_cell") | |
| k6=soup1.find_all('span', class_="detail_cell") | |
| #k7=soup1.find_all('span', class_="detail_cell last_detail_cell") | |
| details=[] | |
| for j in range(0,len(k6)): | |
| details.append(k6[j].string) | |
| #to locate length of detail | |
| for p in range(0,len(k6)): | |
| cls = k6[p].get('class') | |
| if len(cls)==2: | |
| if cls[1]=='last_detail_cell': | |
| lim=p | |
| break | |
| details=details[0:lim+1] | |
| #last price change | |
| k8=soup1.find_all('span', class_="price_change") | |
| pricechange=[] | |
| #listing=[] | |
| #for div in k8: | |
| # if div.find("a") != None: | |
| #listing.append(div.find("a").string) | |
| #geo location | |
| k9=soup1.find_all('meta') | |
| lst=filter(lambda tag: tag.has_attr('name'),k9) | |
| filt=filter(lambda tag: tag['name']=='geo.position', lst) | |
| if(filt): | |
| position=filt[0]['content'] | |
| else: | |
| position='' | |
| # concatenate fields | |
| def xstr(s): | |
| if s is None: | |
| return '' | |
| return str(s) | |
| for q in range(len(amen)): | |
| amen[q]=xstr(amen[q]) | |
| for q in range(len(details)): | |
| details[q]=unicode(xstr(details[q])) | |
| for q in range(len(listing)): | |
| listing[q]=unicode(xstr(listing[q])) | |
| listing=" ".join(listing) | |
| details=" ".join(details) | |
| print amen | |
| if len(amen) > 0 and amen[0] != None: | |
| amen=" ".join(amen) | |
| else: | |
| amen = " " | |
| #update the Pandas Data frame | |
| if(k8): | |
| pricechange=k8[0].text | |
| #temp1['days on market'][i]=days[0] | |
| #temp1['location'][i]=loc[0] | |
| #temp1['price change'][i]=pricechange | |
| #temp1['details'][i]=details | |
| #temp1['position'][i]=position | |
| #temp1['amenities'][i]=amen | |
| #temp1['listings'][i]=listing | |
| location=[] | |
| if(loc): | |
| location=loc[0] | |
| else: | |
| location=unicode('') | |
| daysonmarket=[] | |
| if(days): | |
| daysonmarket=days[0] | |
| else: | |
| daysonmarket=unicode('') | |
| if(len(pricechange)==0): | |
| pricechange=unicode('') | |
| if(len(details)==0): | |
| details=unicode('') | |
| if(len(listing)==0): | |
| listing=unicode('') | |
| location=unicodedata.normalize('NFKD', location).encode('ascii','ignore') | |
| daysonmarket=unicodedata.normalize('NFKD',daysonmarket).encode('ascii','ignore') | |
| pricechange=unicodedata.normalize('NFKD', pricechange).encode('ascii','ignore') | |
| details=unicodedata.normalize('NFKD', details).encode('ascii','ignore') | |
| listing=unicodedata.normalize('NFKD', listing).encode('ascii','ignore') | |
| print i | |
| r=[i,location,daysonmarket,pricechange,details,position,amen,listing] | |
| t=pd.DataFrame([r], columns=columns) | |
| frame= pd.concat([t,frame],axis=0) | |
| if (i%100==0): | |
| frame.to_csv('nycrental'+str(i)+'.csv') | |
| print i.text |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment