Last active
March 7, 2018 02:34
-
-
Save shivamy/83fe2f9d62bd868dcc90bb1ae15bcf3f to your computer and use it in GitHub Desktop.
craigslist_housing_analysis #craigslist #housing #rental
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#---------------------- Very interesting related links ------------------ | |
https://github.com/choldgraf/write-ups/blob/master/comp/craigslist_data_analysis.ipynb | |
https://github.com/choldgraf/write-ups/blob/master/comp/craigslist_scrape.ipynb | |
https://github.com/choldgraf/PythonDataScienceHandbook -- Book! | |
https://github.com/choldgraf?tab=repositories -- Hundreds of repos | |
#------------------------ This project | |
1. Use the scrapy tool to scrape the website with a specific url (See attached eby_housing.py) | |
2. Create a csv file with data elements | |
3. Run Jupyter with pandas | |
1. Crawl and scrape for needed data... | |
--------------------------------------- | |
For more info on running Scrapy take a look at: https://python.gotrained.com/scrapy-tutorial-web-scraping-craigslist/ | |
> source ~/venv/mine/bin/activate (or alias > actmypy) | |
> pip install scrapy | |
It creates a command in ~/venv/mine/bin/scrapy | |
> cd ~/bins/ | |
> scrapy startproject craig_03_2018 | |
> cd craig_03_2018 | |
> scrapy genspider eby_housing https://sfbay.craigslist.org/search/eby/hhh # create spider called "eby_housing" with East Bay housing link... | |
(mine) shiva_m[bins]> tree craig_03_2018/ | |
craig_03_2018/ | |
├── craig_03_2018 | |
│ ├── __init__.py | |
│ ├── __init__.pyc | |
│ ├── items.py | |
│ ├── pipelines.py | |
│ ├── settings.py | |
│ ├── settings.pyc | |
│ └── spiders | |
│ ├── __init__.py | |
│ ├── __init__.pyc | |
│ ├── eby_housing.py | |
│ └── eby_housing.pyc | |
└── scrapy.cfg | |
# edit eby_housing.py | |
> vim craig_03_2018/spiders/eby_housing.py # Edit parser... | |
# add and start_urls, "for loop" to parse through the xpath/html elements for required nodes... | |
> scrapy crawl eby_housing # Now run spider eby_housing | |
# Output will be from the parser | |
2. Create a csv file for Jupyter/pandas... | |
--------------------------------------- | |
> scrapy crawl eby_housing -o eby_records.csv | |
> vim eby_records.csv | |
# Clean records - remove $ and ft signs, remove " - " in housing, rename header appropriately, etc. | |
3. Start a notebook and run analysis | |
--------------------------------------- | |
# Go to https://hub.mybinder.org/user/jupyterlab-jupyterlab-demo-u71ciiw0/lab | |
# and create a new folder on the left side. Start a python notebook | |
# In the new folder upload the above (eby_records.csv) file | |
# Execute below notebooks |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Craigslist rental analysis: load the scraped listings, derive a
# rent-per-square-foot column, summarize by bedroom count, and box-plot
# the rent / sq_ft distributions grouped by bedrooms.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# matplotlib.style.use('ggplot')
# matplotlib.use('TkAgg')
#--------------------------
# Load the CSV produced by `scrapy crawl eby_housing -o eby_records.csv`.
# (Was 'ebay_lists.csv' — a filename typo; the scrape step in these notes
# writes eby_records.csv.)
rents = pd.read_csv('eby_records.csv')
# rents = pd.read_csv('eby_rents.csv')
# Work on an explicit copy so the raw `rents` frame is left untouched
# (plain assignment would alias it and the new column would mutate it too).
new_rents_df = rents.copy()
new_rents_df['rent_per_sq_ft'] = new_rents_df['rent'] / new_rents_df['sq_ft']
# new_rents_df['rent_in_hundreds'] = new_rents_df['rent'] / 100  # For scaling later divide by 100
# new_rents_df = new_rents_df.drop(columns='rent')
new_rents_df
#-----------------------
# Per-bedroom summary statistics.
# NOTE: the original nested {column: {label: func}} agg spec was deprecated
# in pandas 0.20 and removed in 1.0 (raises SpecificationError); the flat
# {column: [funcs]} mapping below yields the same statistics.
summary_stats = ['mean', 'max', 'min', 'median', 'std']
aggs2 = {
    'rent': summary_stats,
    'sq_ft': summary_stats,
    'rent_per_sq_ft': summary_stats,
}
metrics = new_rents_df.groupby('bed_rooms').agg(aggs2)
metrics
#-------------------------
# Box plots of rent and sq_ft, one panel per column, grouped by bedrooms.
new_df = pd.DataFrame(new_rents_df, columns=('bed_rooms', 'rent', 'sq_ft'))
new_df.boxplot(by='bed_rooms')
plt.suptitle("")  # Remove the auto-generated "Boxplot grouped by ..." title
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
import re | |
class EbyHousingSpider(scrapy.Spider):
    """Spider for Craigslist housing search-result pages.

    Walks each result row (`p.result-info`) on the configured search URL and
    yields a dict with the listing's price and cleaned housing details.
    """
    name = "eby_housing"
    # allowed_domains must contain bare domain names, not full URLs — a URL
    # here makes scrapy's offsite middleware reject every request. Both the
    # SF Bay and the (currently active) Bengaluru hosts are listed.
    allowed_domains = ["sfbay.craigslist.org", "bangalore.craigslist.co.in"]
    # start_urls = ['https://sfbay.craigslist.org/search/eby/hhh/']
    # East bay / San Ramon...
    # start_urls = ['https://sfbay.craigslist.org/search/eby/apa?availabilityMode=0&max_bedrooms=5&min_bedrooms=4&query=san%20ramon']  # San ramon rental
    # start_urls = ['https://sfbay.craigslist.org/search/eby/hhh?query=san+ramon&min_bedrooms=4&max_bedrooms=5&availabilityMode=0&sale_date=all+dates']  # San ramon rental and sales
    # Bengaluru
    start_urls = ['http://bangalore.craigslist.co.in/search/apa?min_bedrooms=4&max_bedrooms=5&availabilityMode=0&sale_date=all+dates']  # Rentals...

    def parse(self, response):
        """Yield one {'Price', 'Housing'} dict per search-result row."""
        houses = response.xpath('//p[@class="result-info"]')  # one node per listing
        for h in houses:
            title = h.xpath('a/text()').extract_first()
            price = h.xpath('span[@class="result-meta"]/span[@class="result-price"]/text()').extract_first("")
            housing = h.xpath('span[@class="result-meta"]/span[@class="housing"]/text()').extract_first("")
            # housing has newlines and runs of spaces — collapse them. Raw
            # string so '\n'/'\s' reach the regex engine uninterpreted.
            housing_c = re.sub(r'\n\s+', ' ', housing)
            relative_url = h.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)
            yield {'Price': price, 'Housing': housing_c}  # 'Title': title, 'URL': absolute_url
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment