Last active
August 29, 2015 13:58
-
-
Save kikiliu/10226757 to your computer and use it in GitHub Desktop.
Extract_Property_Info_fromAPI
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd\n", | |
"from pandas import Series, DataFrame\n", | |
"\n", | |
"import settings\n", | |
"\n", | |
"from itertools import islice\n", | |
"\n", | |
"import requests\n", | |
"from lxml import etree" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"assert settings.apikey is not None\n", | |
"print settings.zwsid\n", | |
"print settings.apikey" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"hidden\n", | |
"hidden\n" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import urlparse\n", | |
"import urllib\n", | |
"from IPython.core.display import HTML" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from urllib2 import urlopen\n", | |
"from urlparse import urljoin\n", | |
"from bs4 import BeautifulSoup" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def set_url_parameter(url, name, value):\n", | |
" \"\"\"Take an input URL template and key-value string, one pair each call\"\"\"\n", | |
"\n", | |
" uri = urlparse.urlparse(url)\n", | |
" \n", | |
" #convert pr.query from string to dict \n", | |
" query_dict = urlparse.parse_qs(uri.query)#Parse a query string given as a string argument\n", | |
" query_dict[name]= value\n", | |
"\n", | |
" #e.g. http://stackoverflow.com/a/10233141/7782 \n", | |
" #convert a mapping object or a sequence (doseq=True) of two-element tuples to a \u201cpercent-encoded\u201d string\n", | |
" #suitable to pass to urlopen() as the optional data argument.\n", | |
" query = urllib.urlencode(query_dict, doseq=True)\n", | |
" uri = (uri.scheme, uri.netloc, uri.path, uri.params, query, uri.fragment)#avoid magic number[4]\n", | |
" \n", | |
" #Construct a URL from a tuple as returned by urlparse()\n", | |
" return urlparse.urlunparse(uri)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"oak_zip = [\"94601\", \"94602\", \"94605\", \"94606\", \"94607\", \"94610\", \n", | |
" \"94611\", \"94612\", \"94618\", \"94619\", \"94615\", \"94621\"]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#http://www.zillow.com/howto/api/GetDemographics.htm\n", | |
"def getDemographics_xml(zipcode):\n", | |
" url_template_zillow = 'http://www.zillow.com/webservice/GetDemographics.htm?zws-id=&zip='\n", | |
" url = set_url_parameter(url_template_zillow, 'zws-id', settings.zwsid)\n", | |
" url = set_url_parameter(url, 'zip', zipcode)\n", | |
" response = requests.get(url).content \n", | |
"\n", | |
" return etree.fromstring(response)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#http://developer.trulia.com/docs/read/TruliaStats/getZipCodeStats\n", | |
"def getZipCodeStats_xml(zipcode,startDate,endDate):\n", | |
" url_template_trulia = 'http://api.trulia.com/webservices.php?library=TruliaStats&function=getZipCodeStats&zipCode=&startDate=&endDate=&apikey='\n", | |
" url = set_url_parameter(url_template_trulia, 'apikey', settings.apikey)\n", | |
" url = set_url_parameter(url, 'zipCode', zipcode)\n", | |
" url = set_url_parameter(url, 'startDate', startDate)\n", | |
" url = set_url_parameter(url, 'endDate', endDate)\n", | |
" response = requests.get(url).content \n", | |
" \n", | |
" return etree.fromstring(response)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def getDeepSearchResults_xml(addr, citystatezip):\n", | |
" url_template_zillow = 'http://www.zillow.com/webservice/GetDeepSearchResults.htm?zws-id=&address=&citystatezip='\n", | |
" url = set_url_parameter(url_template_zillow, 'zws-id', settings.zwsid)\n", | |
" url = set_url_parameter(url, 'address', addr)\n", | |
" url = set_url_parameter(url, 'citystatezip', citystatezip)\n", | |
" response = requests.get(url).content\n", | |
" \n", | |
" return etree.fromstring(response)\n", | |
"# return url\n", | |
"\n", | |
"# getDeepSearchResults_xml(\"2114 Bigelow Ave\", \"Seattle, WA\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def parse_value(path, xml_doc):\n", | |
" '''Take xml doc retrieved and parse certain value according to the path'''\n", | |
" value_list = []\n", | |
" value = xml_doc.xpath(path)\n", | |
" if value != []:\n", | |
" for item in value:\n", | |
" value_list.append(item.text)\n", | |
" else:\n", | |
" value_list.append(0)\n", | |
" return value_list" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#http://www.zillow.com/howto/api/GetDeepSearchResults.htm\n", | |
"def getDeepSearchResults(addr, citystatezip):\n", | |
" '''Take in address and city+state combination and/or ZIP code, output '''\n", | |
" \n", | |
" lastSoldDate_path = \"//response//lastSoldDate\"\n", | |
" lastSoldPrice_path = \"//response//lastSoldPrice\"\n", | |
" latitude_path = \"//response//address/latitude\"\n", | |
" longitude_path = \"//response//address/longitude\"\n", | |
" \n", | |
" doc = getDeepSearchResults_xml(addr, citystatezip)\n", | |
" lastSoldDate = parse_value(lastSoldDate_path, doc)[0]\n", | |
" lastSoldPrice = parse_value(lastSoldPrice_path, doc)[0]\n", | |
" latitude = parse_value(latitude_path, doc)[0]\n", | |
" longitude = parse_value(longitude_path, doc)[0]\n", | |
" \n", | |
" return Series([lastSoldDate, lastSoldPrice, latitude, longitude])\n", | |
"\n", | |
"getDeepSearchResults(\"2114 Bigelow Ave\", \"Seattle, WA\") " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
"0 11/26/2008\n", | |
"1 1025000\n", | |
"2 47.637933\n", | |
"3 -122.347938\n", | |
"dtype: object" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def download(url):\n", | |
" response = urlopen(url) \n", | |
" data = response.read()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"{zipcode:[1,2] for zipcode in oak_zip}" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 19, | |
"text": [ | |
"{'94601': [1, 2],\n", | |
" '94602': [1, 2],\n", | |
" '94605': [1, 2],\n", | |
" '94606': [1, 2],\n", | |
" '94607': [1, 2],\n", | |
" '94610': [1, 2],\n", | |
" '94611': [1, 2],\n", | |
" '94612': [1, 2],\n", | |
" '94615': [1, 2],\n", | |
" '94618': [1, 2],\n", | |
" '94619': [1, 2],\n", | |
" '94621': [1, 2]}" | |
] | |
} | |
], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def getDemographics_df(ziplist):\n", | |
" Index_path = \"//page[name='Affordability']//attribute[name='Zillow Home Value Index']/values/zip/value\"\n", | |
" SFH_path = \"//page[name='Affordability']//attribute[name='Median Single Family Home Value']/values/zip/value\"\n", | |
" Condo_path = \"//page[name='Affordability']//attribute[name='Median Condo Value']/values/zip/value\"\n", | |
" SalePrice_path = \"//page[name='Affordability']//attribute[name='Median Sale Price']/values/zip/value\"\n", | |
" Persqft_path = \"//page[name='Affordability']//attribute[name='Median Value Per Sq Ft']/values/zip/value\" \n", | |
" One_year_path = \"//page[name='Affordability']//attribute[name='1-Yr. Change']/values/zip/value\"\n", | |
" demographics = {zipcode:[] for zipcode in ziplist}\n", | |
" for zipcode in ziplist:\n", | |
" doc = getDemographics_xml(zipcode)\n", | |
" demographics[zipcode].append(parse_value(Index_path, doc))\n", | |
" demographics[zipcode].append(parse_value(Persqft_path, doc))\n", | |
" return DataFrame.from_dict(demographics, 'index')\n", | |
"# DataFrame([[zipcode, home_value, median_per_sqft] for zipcode, value in demographics for home_value, median_per_sqft in value])\n", | |
"\n", | |
"getDemographics_df(oak_zip)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>94606</th>\n", | |
" <td> [366100]</td>\n", | |
" <td> [250]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94607</th>\n", | |
" <td> [355100]</td>\n", | |
" <td> [355]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94605</th>\n", | |
" <td> [365000]</td>\n", | |
" <td> [260]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94602</th>\n", | |
" <td> [632000]</td>\n", | |
" <td> [330]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94610</th>\n", | |
" <td> [801300]</td>\n", | |
" <td> [360]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94601</th>\n", | |
" <td> [281300]</td>\n", | |
" <td> [235]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94612</th>\n", | |
" <td> [406300]</td>\n", | |
" <td> [311]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94619</th>\n", | |
" <td> [466700]</td>\n", | |
" <td> [354]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94618</th>\n", | |
" <td> [936100]</td>\n", | |
" <td> [506]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94621</th>\n", | |
" <td> [200000]</td>\n", | |
" <td> [180]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94611</th>\n", | |
" <td> [867800]</td>\n", | |
" <td> [413]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94615</th>\n", | |
" <td> [0]</td>\n", | |
" <td> [0]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>12 rows \u00d7 2 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 35, | |
"text": [ | |
" 0 1\n", | |
"94606 [366100] [250]\n", | |
"94607 [355100] [355]\n", | |
"94605 [365000] [260]\n", | |
"94602 [632000] [330]\n", | |
"94610 [801300] [360]\n", | |
"94601 [281300] [235]\n", | |
"94612 [406300] [311]\n", | |
"94619 [466700] [354]\n", | |
"94618 [936100] [506]\n", | |
"94621 [200000] [180]\n", | |
"94611 [867800] [413]\n", | |
"94615 [0] [0]\n", | |
"\n", | |
"[12 rows x 2 columns]" | |
] | |
} | |
], | |
"prompt_number": 35 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def getZipCodeStats_df(ziplist,startDate,endDate):\n", | |
" \n", | |
" MedianListing_path = \"//listingStats//subcategory[type='All Properties']/medianListingPrice\"\n", | |
" WeekEndingDate_path = \"//listingStats/listingStat/weekEndingDate\"\n", | |
" ZipCodeStats = {zipcode:[] for zipcode in ziplist}\n", | |
" for zipcode in ziplist:\n", | |
" doc = getZipCodeStats_xml(zipcode,startDate,endDate)\n", | |
" ZipCodeStats[zipcode].append(parse_value(MedianListing_path, doc))\n", | |
" ZipCodeStats[zipcode].append(parse_value(WeekEndingDate_path, doc))\n", | |
" return DataFrame.from_dict(ZipCodeStats, 'index')\n", | |
"\n", | |
" \n", | |
"getZipCodeStats_df(oak_zip, '2014-01-01', '2014-02-01')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>94606</th>\n", | |
" <td> [215757, 217457, 223400]</td>\n", | |
" <td> [2014-02-22, 2014-03-01, 2014-03-08]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94607</th>\n", | |
" <td> [0]</td>\n", | |
" <td> [0]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94605</th>\n", | |
" <td> [140650, 139179, 138350]</td>\n", | |
" <td> [2014-02-22, 2014-03-01, 2014-03-08]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94602</th>\n", | |
" <td> [0]</td>\n", | |
" <td> [0]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94610</th>\n", | |
" <td> [158471, 167043, 154300]</td>\n", | |
" <td> [2014-02-22, 2014-03-01, 2014-03-08]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94601</th>\n", | |
" <td> [567613, 547134, 535358, 538907, 531871, 50071...</td>\n", | |
" <td> [2014-01-11, 2014-01-18, 2014-01-25, 2014-02-0...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94612</th>\n", | |
" <td> [0]</td>\n", | |
" <td> [0]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94619</th>\n", | |
" <td> [0]</td>\n", | |
" <td> [0]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94618</th>\n", | |
" <td> [179843, 177736, 174843, 174871, 178086, 17947...</td>\n", | |
" <td> [2013-12-14, 2013-12-21, 2013-12-28, 2014-01-0...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94621</th>\n", | |
" <td> [0]</td>\n", | |
" <td> [0]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94611</th>\n", | |
" <td> [377029, 382250, 398286, 395000, 391536, 38885...</td>\n", | |
" <td> [2013-12-14, 2013-12-21, 2013-12-28, 2014-01-0...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94615</th>\n", | |
" <td> [0]</td>\n", | |
" <td> [0]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>12 rows \u00d7 2 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 34, | |
"text": [ | |
" 0 \\\n", | |
"94606 [215757, 217457, 223400] \n", | |
"94607 [0] \n", | |
"94605 [140650, 139179, 138350] \n", | |
"94602 [0] \n", | |
"94610 [158471, 167043, 154300] \n", | |
"94601 [567613, 547134, 535358, 538907, 531871, 50071... \n", | |
"94612 [0] \n", | |
"94619 [0] \n", | |
"94618 [179843, 177736, 174843, 174871, 178086, 17947... \n", | |
"94621 [0] \n", | |
"94611 [377029, 382250, 398286, 395000, 391536, 38885... \n", | |
"94615 [0] \n", | |
"\n", | |
" 1 \n", | |
"94606 [2014-02-22, 2014-03-01, 2014-03-08] \n", | |
"94607 [0] \n", | |
"94605 [2014-02-22, 2014-03-01, 2014-03-08] \n", | |
"94602 [0] \n", | |
"94610 [2014-02-22, 2014-03-01, 2014-03-08] \n", | |
"94601 [2014-01-11, 2014-01-18, 2014-01-25, 2014-02-0... \n", | |
"94612 [0] \n", | |
"94619 [0] \n", | |
"94618 [2013-12-14, 2013-12-21, 2013-12-28, 2014-01-0... \n", | |
"94621 [0] \n", | |
"94611 [2013-12-14, 2013-12-21, 2013-12-28, 2014-01-0... \n", | |
"94615 [0] \n", | |
"\n", | |
"[12 rows x 2 columns]" | |
] | |
} | |
], | |
"prompt_number": 34 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment