Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
### NYC Housing Search ###
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Load the NYC zip code table from its CSV file
zip_code_csv = "/Users/erikgregorywebb/Documents/Python/nyc-housing/Data/nyc-zip-codes.csv"
zipcodes = pd.read_csv(zip_code_csv)
# Preview of the first rows; the result is discarded when run as a plain script
zipcodes.head()
# Generate Craigslist Links
# One search URL per row of `zipcodes`; the postal code is taken from the
# table's third column (position 2), exactly as the URL's `postal` parameter.
base_links = [
    "https://newyork.craigslist.org/search/aap?postal={}".format(zipcode)
    for zipcode in zipcodes.iloc[:, 2]
]
# Extract Listing Data Function
def _listing_attribute(item, class_name, attribute):
    """Return `attribute` of the first child of `item` with class `class_name`.

    Falls back to an empty string when the lookup fails (for example when the
    listing has no such child element), so every listing still yields a value
    for every column.
    """
    try:
        return item.find_element_by_class_name(class_name).get_attribute(attribute)
    except Exception:
        # Best effort: a missing field must not abort the whole page.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
        return ""


def getZipListings(link):
    """Scrape one Craigslist search-results page into a DataFrame.

    Opens a new Chrome WebDriver session, loads `link`, and reads the title,
    date, price, housing text and listing URL from every element with the
    class ``result-info``.

    Parameters
    ----------
    link : str
        Search URL whose last five characters are the zip code
        (e.g. ``...?postal=10001``).

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns ``Title``, ``Date``, ``Price``,
        ``Bedrooms``, ``Link`` and ``Zipcode``. Fields that could not be read
        are empty strings. The frame has zero rows when no listings are found.

    Raises
    ------
    ValueError
        If the last five characters of `link` are not an integer.
    """
    # Open the driver
    driver = webdriver.Chrome(executable_path="/Users/erikgregorywebb/Downloads/chromedriver 2")
    try:
        driver.get(link)
        # Extract the data: one tuple per listing, in output-column order
        rows = [
            (
                _listing_attribute(item, 'result-title', 'innerText'),  # Title
                _listing_attribute(item, 'result-date', 'datetime'),    # Date
                _listing_attribute(item, 'result-price', 'innerText'),  # Price
                _listing_attribute(item, 'housing', 'innerText'),       # Bedrooms
                _listing_attribute(item, 'result-title', 'href'),       # Link
            )
            for item in driver.find_elements_by_class_name('result-info')
        ]
    finally:
        # Always end the browser session, even when loading or scraping fails,
        # so a failed page does not leave a Chrome/chromedriver process behind.
        driver.quit()

    df = pd.DataFrame(rows, columns=['Title', 'Date', 'Price', 'Bedrooms', 'Link'])
    # The zip code is recovered from the tail of the search URL
    df['Zipcode'] = int(link[-5:])
    return df
# Loop over Zipcodes
# Scrape every search link, collecting the per-page frames in a list so they
# can be concatenated once at the end instead of re-copying `housing` on
# every iteration.
frames = []
for link in base_links:
    time.sleep(5)  # pause between page loads
    try:
        listings = getZipListings(link)
    except Exception as error:
        # Best effort: report the failed link, back off for two minutes and
        # carry on with the next one instead of aborting the whole run.
        print("Skipping {} after error: {}".format(link, error))
        time.sleep(120)
        continue
    if not listings.empty:
        frames.append(listings)

if frames:
    housing = pd.concat(frames, ignore_index=True)
    # Attach the zip code table exactly once. Merging each page and then the
    # combined frame again would duplicate the shared columns with _x/_y
    # suffixes and break later selection by their plain names.
    housing = housing.merge(zipcodes, on='Zipcode', how='left')
else:
    # Nothing was scraped: build an empty frame that still carries every
    # expected column so the later steps can run on it.
    empty_columns = ['Title', 'Date', 'Price', 'Bedrooms', 'Link', 'Zipcode']
    empty_columns += [name for name in zipcodes.columns if name not in empty_columns]
    housing = pd.DataFrame(columns=empty_columns)
# Put the columns into their final order (location first, listing details after)
column_order = [
    'Borough',
    'Neighborhood',
    'Zipcode',
    'Date',
    'Price',
    'Bedrooms',
    'Title',
    'Link',
]
housing = housing[column_order]
# Preview of the first rows; the result is discarded when run as a plain script
housing.head()
# Clean the Data
# A bedroom count is one or more digits directly followed by "br" (e.g. "1br").
_BEDROOM_COUNT = re.compile(r'(\d+)\s*br')


def _clean_price(value):
    """Remove the dollar sign from a price string; leave non-strings as-is."""
    if not isinstance(value, str):
        return value
    return value.replace('$', '')


def _clean_bedrooms(value):
    """Reduce the raw housing text to its bedroom count, e.g. ``"2br"``.

    Returns None when the text contains no bedroom count. Non-string values
    (such as missing data) are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    match = _BEDROOM_COUNT.search(value)
    # Searching for the digits handles multi-digit counts ("10br") and any
    # surrounding whitespace, dashes or newlines in the raw text.
    return match.group(1) + 'br' if match else None


# Columns are addressed by name so the cleaning does not depend on their
# position in the frame.
housing['Price'] = housing['Price'].map(_clean_price)
housing['Bedrooms'] = housing['Bedrooms'].map(_clean_bedrooms)
# Remove duplicates: rows sharing zip code, price, bedrooms and title are
# treated as the same listing, and only the first occurrence is kept
dedupe_keys = ['Zipcode', 'Price', 'Bedrooms', 'Title']
housing = housing.drop_duplicates(subset=dedupe_keys, keep='first')
# Export the data to CSV, leaving out the index column
output_path = "nyc-housing.csv"
housing.to_csv(output_path, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.