Last active
March 7, 2018 02:34
-
-
Save shivamy/83fe2f9d62bd868dcc90bb1ae15bcf3f to your computer and use it in GitHub Desktop.
craigslist_housing_analysis #craigslist #housing #rental
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#---------------------- Very interesting related links ------------------ | |
https://github.com/choldgraf/write-ups/blob/master/comp/craigslist_data_analysis.ipynb | |
https://github.com/choldgraf/write-ups/blob/master/comp/craigslist_scrape.ipynb | |
https://github.com/choldgraf/PythonDataScienceHandbook -- Book! | |
https://github.com/choldgraf?tab=repositories -- Hundreds of repos | |
#------------------------ This project | |
1. Use the scrapy tool to scrape the website with a specific url (See attached eby_housing.py) | |
2. Create a csv file with data elements | |
3. Run Jupyter with pandas | |
1. Crawl and scrape for needed data... | |
--------------------------------------- | |
For more info on running Scrapy take a look at: https://python.gotrained.com/scrapy-tutorial-web-scraping-craigslist/ | |
> source ~/venv/mine/bin/activate (or alias > actmypy) | |
> pip install scrapy | |
It creates a command in ~/venv/mine/bin/scrapy | |
> cd ~/bins/ | |
> scrapy startproject craig_03_2018 | |
> cd craig_03_2018 | |
> scrapy genspider eby_housing https://sfbay.craigslist.org/search/eby/hhh # create spider called "eby_housing" with East Bay housing link... | |
(mine) shiva_m[bins]> tree craig_03_2018/ | |
craig_03_2018/ | |
├── craig_03_2018 | |
│ ├── __init__.py | |
│ ├── __init__.pyc | |
│ ├── items.py | |
│ ├── pipelines.py | |
│ ├── settings.py | |
│ ├── settings.pyc | |
│ └── spiders | |
│ ├── __init__.py | |
│ ├── __init__.pyc | |
│ ├── eby_housing.py | |
│ └── eby_housing.pyc | |
└── scrapy.cfg | |
# edit eby_housing.py | |
> vim craig_03_2018/spiders/eby_housing.py # Edit parser... | |
# add and start_urls, "for loop" to parse through the xpath/html elements for required nodes... | |
> scrapy crawl eby_housing # Now run spider eby_housing | |
# Output will be from the parser | |
2. Create a csv file for Jupyter/pandas... | |
--------------------------------------- | |
> scrapy crawl eby_housing -o eby_records.csv | |
> vim eby_records.csv | |
# Clean records - remove $ and ft signs, remove " - " in housing, rename header appropriately, etc. | |
3. Start a notebook and run analysis | |
--------------------------------------- | |
# Go to https://hub.mybinder.org/user/jupyterlab-jupyterlab-demo-u71ciiw0/lab | |
# and create a new folder on the left side. Start a python notebook | |
# In the new folder upload the above (eby_records.csv) file | |
# Execute below notebooks |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Craigslist rental analysis: load the scraped listings, derive a
# rent-per-square-foot column, summarize by bedroom count, and box-plot
# the rent / sq_ft distributions grouped by bedrooms.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# matplotlib.style.use('ggplot')
# matplotlib.use('TkAgg')
#--------------------------
# Load the CSV produced by `scrapy crawl eby_housing -o eby_records.csv`.
# (Was 'ebay_lists.csv' — a filename typo; the scrape step in these notes
# writes eby_records.csv.)
rents = pd.read_csv('eby_records.csv')
# rents = pd.read_csv('eby_rents.csv')
# Work on an explicit copy so the raw `rents` frame is left untouched
# (plain assignment would alias it and the new column would mutate it too).
new_rents_df = rents.copy()
new_rents_df['rent_per_sq_ft'] = new_rents_df['rent'] / new_rents_df['sq_ft']
# new_rents_df['rent_in_hundreds'] = new_rents_df['rent'] / 100  # For scaling later divide by 100
# new_rents_df = new_rents_df.drop(columns='rent')
new_rents_df
#-----------------------
# Per-bedroom summary statistics.
# NOTE: the original nested {column: {label: func}} agg spec was deprecated
# in pandas 0.20 and removed in 1.0 (raises SpecificationError); the flat
# {column: [funcs]} mapping below yields the same statistics.
summary_stats = ['mean', 'max', 'min', 'median', 'std']
aggs2 = {
    'rent': summary_stats,
    'sq_ft': summary_stats,
    'rent_per_sq_ft': summary_stats,
}
metrics = new_rents_df.groupby('bed_rooms').agg(aggs2)
metrics
#-------------------------
# Box plots of rent and sq_ft, one panel per column, grouped by bedrooms.
new_df = pd.DataFrame(new_rents_df, columns=('bed_rooms', 'rent', 'sq_ft'))
new_df.boxplot(by='bed_rooms')
plt.suptitle("")  # Remove the auto-generated "Boxplot grouped by ..." title
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
import re | |
class EbyHousingSpider(scrapy.Spider):
    """Spider for Craigslist housing search-result pages.

    Walks each result row (`p.result-info`) on the configured search URL and
    yields a dict with the listing's price and cleaned housing details.
    """
    name = "eby_housing"
    # allowed_domains must contain bare domain names, not full URLs — a URL
    # here makes scrapy's offsite middleware reject every request. Both the
    # SF Bay and the (currently active) Bengaluru hosts are listed.
    allowed_domains = ["sfbay.craigslist.org", "bangalore.craigslist.co.in"]
    # start_urls = ['https://sfbay.craigslist.org/search/eby/hhh/']
    # East bay / San Ramon...
    # start_urls = ['https://sfbay.craigslist.org/search/eby/apa?availabilityMode=0&max_bedrooms=5&min_bedrooms=4&query=san%20ramon']  # San ramon rental
    # start_urls = ['https://sfbay.craigslist.org/search/eby/hhh?query=san+ramon&min_bedrooms=4&max_bedrooms=5&availabilityMode=0&sale_date=all+dates']  # San ramon rental and sales
    # Bengaluru
    start_urls = ['http://bangalore.craigslist.co.in/search/apa?min_bedrooms=4&max_bedrooms=5&availabilityMode=0&sale_date=all+dates']  # Rentals...

    def parse(self, response):
        """Yield one {'Price', 'Housing'} dict per search-result row."""
        houses = response.xpath('//p[@class="result-info"]')  # one node per listing
        for h in houses:
            title = h.xpath('a/text()').extract_first()
            price = h.xpath('span[@class="result-meta"]/span[@class="result-price"]/text()').extract_first("")
            housing = h.xpath('span[@class="result-meta"]/span[@class="housing"]/text()').extract_first("")
            # housing has newlines and runs of spaces — collapse them. Raw
            # string so '\n'/'\s' reach the regex engine uninterpreted.
            housing_c = re.sub(r'\n\s+', ' ', housing)
            relative_url = h.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)
            yield {'Price': price, 'Housing': housing_c}  # 'Title': title, 'URL': absolute_url
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment