herrfz/Getting Data

## Getting Data
{
 "metadata": {
  "name": "getting_data"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Getting data -- in Python"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# getting and setting directory\n",
      "#\n",
      "# equivalent to R getwd() and setwd() commands\n",
      "#\n",
      "# this is the more general Python way\n",
      "# in IPython we can use system commands directly prefixed by !\n",
      "import os\n",
      "\n",
      "os.getcwd()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 1,
       "text": [
        "'/Users/erriza/Documents/ipython-notebooks'"
       ]
      }
     ],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "os.chdir('..')\n",
      "os.getcwd()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 2,
       "text": [
        "'/Users/erriza/Documents'"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "os.chdir('./ipython-notebooks/')\n",
      "os.getcwd()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "'/Users/erriza/Documents/ipython-notebooks'"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# load data into Pandas\n",
      "import pandas as pd\n",
      "\n",
      "fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD'\n",
      "\n",
      "# we can directly use read_csv to download the file\n",
      "# this is equivalent to R's combined download.file() and read.table() or read.csv() commands\n",
      "cameraData = pd.read_csv(fileUrl)\n",
      "\n",
      "cameraData.to_csv('./data/cameras.csv', index=False)\n",
      "\n",
      "# for simplicity I'll use IPython tricks to list folder contents\n",
      "!ls ./data\n",
      "\n",
      "\n",
      "# get current date and time\n",
      "# this is equivalent to R date() command\n",
      "# note that I use IPython ! prefix to run my system's command\n",
      "dateDownloaded = !date\n",
      "print dateDownloaded\n",
      "\n",
      "cameraData.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "camera.xls          camera.xlsx         cameras.csv         camerasModified.csv\r\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "['Thu Jan 31 21:32:24 CET 2013']\n"
       ]
      },
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>address</th>\n",
        "      <th>direction</th>\n",
        "      <th>street</th>\n",
        "      <th>crossStreet</th>\n",
        "      <th>intersection</th>\n",
        "      <th>Location 1</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td>       S CATON AVE & BENSON AVE</td>\n",
        "      <td> N/B</td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693779962, -76.6688185297)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td>       S CATON AVE & BENSON AVE</td>\n",
        "      <td> S/B</td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693157898, -76.6689698176)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>2</strong></td>\n",
        "      <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n",
        "      <td> E/B</td>\n",
        "      <td> Wilkens Ave</td>\n",
        "      <td> Pine Heights</td>\n",
        "      <td> Wilkens Ave & Pine Heights</td>\n",
        "      <td>  (39.2720252302, -76.676960806)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>3</strong></td>\n",
        "      <td>        THE ALAMEDA & E 33RD ST</td>\n",
        "      <td> S/B</td>\n",
        "      <td> The Alameda</td>\n",
        "      <td>      33rd St</td>\n",
        "      <td>     The Alameda  & 33rd St</td>\n",
        "      <td> (39.3285013141, -76.5953545714)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>4</strong></td>\n",
        "      <td>        E 33RD ST & THE ALAMEDA</td>\n",
        "      <td> E/B</td>\n",
        "      <td>      E 33rd</td>\n",
        "      <td>  The Alameda</td>\n",
        "      <td>      E 33rd  & The Alameda</td>\n",
        "      <td> (39.3283410623, -76.5953594625)</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "                          address direction       street   crossStreet  \\\n",
        "0        S CATON AVE & BENSON AVE       N/B    Caton Ave    Benson Ave   \n",
        "1        S CATON AVE & BENSON AVE       S/B    Caton Ave    Benson Ave   \n",
        "2  WILKENS AVE & PINE HEIGHTS AVE       E/B  Wilkens Ave  Pine Heights   \n",
        "3         THE ALAMEDA & E 33RD ST       S/B  The Alameda       33rd St   \n",
        "4         E 33RD ST & THE ALAMEDA       E/B       E 33rd   The Alameda   \n",
        "\n",
        "                 intersection                       Location 1  \n",
        "0      Caton Ave & Benson Ave  (39.2693779962, -76.6688185297)  \n",
        "1      Caton Ave & Benson Ave  (39.2693157898, -76.6689698176)  \n",
        "2  Wilkens Ave & Pine Heights   (39.2720252302, -76.676960806)  \n",
        "3      The Alameda  & 33rd St  (39.3285013141, -76.5953545714)  \n",
        "4       E 33rd  & The Alameda  (39.3283410623, -76.5953594625)  "
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# read Excel file\n",
      "# this is equivalent to R read.xlsx() and read.xlsx2() commands\n",
      "#\n",
      "# we need openpyxl 1.5.8 (don't use the latest version due to a bug) and xlrd packages\n",
      "# install with: \n",
      "# sudo pip install openpyxl==1.5.8\n",
      "# sudo pip install xlrd\n",
      "#\n",
      "# unfortunately Pandas ExcelFile() can't download and read at once (in contrast to read_csv())\n",
      "# so we need to resort to the basic Python way\n",
      "# also notice I'm using .xls; .xlsx doesn't work in my computer\n",
      "import urllib2\n",
      "\n",
      "# download the file as camera.xls and save it in ./data subfolder\n",
      "fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xls?accessType=DOWNLOAD'\n",
      "f = urllib2.urlopen(fileUrl)\n",
      "data = f.read()\n",
      "with open('./data/camera.xls', 'wb') as w:\n",
      "    w.write(data)\n",
      "\n",
      "# load the Excel file as a pandas DataFrame\n",
      "cameraData = pd.ExcelFile('./data/camera.xls')\n",
      "cameraData = cameraData.parse('Baltimore Fixed Speed Cameras', index_col=None, na_values=['NA'])\n",
      "cameraData.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>address</th>\n",
        "      <th>direction</th>\n",
        "      <th>street</th>\n",
        "      <th>crossStreet</th>\n",
        "      <th>intersection</th>\n",
        "      <th>Location 1</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td>       S CATON AVE & BENSON AVE</td>\n",
        "      <td> N/B</td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693779962, -76.6688185297)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td>       S CATON AVE & BENSON AVE</td>\n",
        "      <td> S/B</td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693157898, -76.6689698176)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>2</strong></td>\n",
        "      <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n",
        "      <td> E/B</td>\n",
        "      <td> Wilkens Ave</td>\n",
        "      <td> Pine Heights</td>\n",
        "      <td> Wilkens Ave & Pine Heights</td>\n",
        "      <td>  (39.2720252302, -76.676960806)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>3</strong></td>\n",
        "      <td>        THE ALAMEDA & E 33RD ST</td>\n",
        "      <td> S/B</td>\n",
        "      <td> The Alameda</td>\n",
        "      <td>      33rd St</td>\n",
        "      <td>     The Alameda  & 33rd St</td>\n",
        "      <td> (39.3285013141, -76.5953545714)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>4</strong></td>\n",
        "      <td>        E 33RD ST & THE ALAMEDA</td>\n",
        "      <td> E/B</td>\n",
        "      <td>      E 33rd</td>\n",
        "      <td>  The Alameda</td>\n",
        "      <td>      E 33rd  & The Alameda</td>\n",
        "      <td> (39.3283410623, -76.5953594625)</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "                          address direction       street   crossStreet  \\\n",
        "0        S CATON AVE & BENSON AVE       N/B    Caton Ave    Benson Ave   \n",
        "1        S CATON AVE & BENSON AVE       S/B    Caton Ave    Benson Ave   \n",
        "2  WILKENS AVE & PINE HEIGHTS AVE       E/B  Wilkens Ave  Pine Heights   \n",
        "3         THE ALAMEDA & E 33RD ST       S/B  The Alameda       33rd St   \n",
        "4         E 33RD ST & THE ALAMEDA       E/B       E 33rd   The Alameda   \n",
        "\n",
        "                 intersection                       Location 1  \n",
        "0      Caton Ave & Benson Ave  (39.2693779962, -76.6688185297)  \n",
        "1      Caton Ave & Benson Ave  (39.2693157898, -76.6689698176)  \n",
        "2  Wilkens Ave & Pine Heights   (39.2720252302, -76.676960806)  \n",
        "3      The Alameda  & 33rd St  (39.3285013141, -76.5953545714)  \n",
        "4       E 33rd  & The Alameda  (39.3283410623, -76.5953594625)  "
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# R readLines() for reading a text file is similar to standard Python file access\n",
      "# I'm not going to detail it here\n",
      "\n",
      "# R readLines() to read data from a website is similar to Python with urllib2 package as in xls example above\n",
      "\n",
      "# read json\n",
      "# this is equivalent to R fromJSON() command\n",
      "import json\n",
      "\n",
      "# first we get the json file from the website\n",
      "fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.json?accessType=DOWNLOAD'\n",
      "req = urllib2.Request(fileUrl)\n",
      "opener = urllib2.build_opener()\n",
      "f = opener.open(req)\n",
      "\n",
      "# then we read it into a data structure\n",
      "jsonCamera = json.loads(f.read())\n",
      "\n",
      "# json is loadad as dictionary\n",
      "print jsonCamera['meta']['view']['id']\n",
      "print jsonCamera['meta']['view']['name']\n",
      "print jsonCamera['meta']['view']['attribution']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "dz54-2aru\n",
        "Baltimore Fixed Speed Cameras\n",
        "Department of Transportation\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# writing data\n",
      "\n",
      "# first read the csv file\n",
      "cameraData = pd.read_csv('./data/cameras.csv')\n",
      "# take a subset of the columns\n",
      "tmpData = cameraData.ix[:,2:]\n",
      "# then save it to a different csv file\n",
      "# this is equivalent to R write.table() command\n",
      "tmpData.to_csv('./data/camerasModified.csv', sep=',', index=False)\n",
      "cameraData2 = pd.read_csv('./data/camerasModified.csv')\n",
      "cameraData2.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>street</th>\n",
        "      <th>crossStreet</th>\n",
        "      <th>intersection</th>\n",
        "      <th>Location 1</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693779962, -76.6688185297)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td>   Caton Ave</td>\n",
        "      <td>   Benson Ave</td>\n",
        "      <td>     Caton Ave & Benson Ave</td>\n",
        "      <td> (39.2693157898, -76.6689698176)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>2</strong></td>\n",
        "      <td> Wilkens Ave</td>\n",
        "      <td> Pine Heights</td>\n",
        "      <td> Wilkens Ave & Pine Heights</td>\n",
        "      <td>  (39.2720252302, -76.676960806)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>3</strong></td>\n",
        "      <td> The Alameda</td>\n",
        "      <td>      33rd St</td>\n",
        "      <td>     The Alameda  & 33rd St</td>\n",
        "      <td> (39.3285013141, -76.5953545714)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>4</strong></td>\n",
        "      <td>      E 33rd</td>\n",
        "      <td>  The Alameda</td>\n",
        "      <td>      E 33rd  & The Alameda</td>\n",
        "      <td> (39.3283410623, -76.5953594625)</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "        street   crossStreet                intersection  \\\n",
        "0    Caton Ave    Benson Ave      Caton Ave & Benson Ave   \n",
        "1    Caton Ave    Benson Ave      Caton Ave & Benson Ave   \n",
        "2  Wilkens Ave  Pine Heights  Wilkens Ave & Pine Heights   \n",
        "3  The Alameda       33rd St      The Alameda  & 33rd St   \n",
        "4       E 33rd   The Alameda       E 33rd  & The Alameda   \n",
        "\n",
        "                        Location 1  \n",
        "0  (39.2693779962, -76.6688185297)  \n",
        "1  (39.2693157898, -76.6689698176)  \n",
        "2   (39.2720252302, -76.676960806)  \n",
        "3  (39.3285013141, -76.5953545714)  \n",
        "4  (39.3283410623, -76.5953594625)  "
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# I don't find the equivalent of R's saving and loading workspace commands"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# R's paste() and paste0() commands look like standard Python's string manipulations\n",
      "print ['./data' + str(i) + '.csv' for i in range(1, 6)]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "['./data1.csv', './data2.csv', './data3.csv', './data4.csv', './data5.csv']\n"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# getting data off webpages\n",
      "from lxml.html import parse\n",
      "\n",
      "url = 'http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en'\n",
      "\n",
      "# this is equivalent to the combined R's opening/reading/closing connection and htmlTreeParse() commands\n",
      "html3 = parse(url).getroot()\n",
      "\n",
      "# get the title text using xpath expression\n",
      "# this is equivalent to R xpathSApply() command\n",
      "title = html3.xpath('//title')\n",
      "print [x.text_content() for x in title]\n",
      "\n",
      "# get the texts of col-citedby elements using xpath expression\n",
      "citedby = html3.xpath(\"//td[@id='col-citedby']\")\n",
      "print [x.text_content() for x in citedby]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "['Jeff Leek - Google Scholar Citations']\n",
        "['Cited by', '337', '172', '140', '133', '107', '95', '79', '78', '53', '17', '10', '9', '9', '8', '8', '6', '6', '6', '5', '3']\n"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": "getting_data"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Getting data -- in Python"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# getting and setting directory\n",
	"#\n",
	"# equivalent to R getwd() and setwd() commands\n",
	"#\n",
	"# this is the more general Python way\n",
	"# in IPython we can use system commands directly prefixed by !\n",
	"import os\n",
	"\n",
	"os.getcwd()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 1,
	"text": [
	"'/Users/erriza/Documents/ipython-notebooks'"
	]
	}
	],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"os.chdir('..')\n",
	"os.getcwd()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 2,
	"text": [
	"'/Users/erriza/Documents'"
	]
	}
	],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"os.chdir('./ipython-notebooks/')\n",
	"os.getcwd()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 3,
	"text": [
	"'/Users/erriza/Documents/ipython-notebooks'"
	]
	}
	],
	"prompt_number": 3
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# load data into Pandas\n",
	"import pandas as pd\n",
	"\n",
	"fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD'\n",
	"\n",
	"# we can directly use read_csv to download the file\n",
	"# this is equivalent to R's combined download.file() and read.table() or read.csv() commands\n",
	"cameraData = pd.read_csv(fileUrl)\n",
	"\n",
	"cameraData.to_csv('./data/cameras.csv', index=False)\n",
	"\n",
	"# for simplicity I'll use IPython tricks to list folder contents\n",
	"!ls ./data\n",
	"\n",
	"\n",
	"# get current date and time\n",
	"# this is equivalent to R date() command\n",
	"# note that I use IPython ! prefix to run my system's command\n",
	"dateDownloaded = !date\n",
	"print dateDownloaded\n",
	"\n",
	"cameraData.head()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"camera.xls camera.xlsx cameras.csv camerasModified.csv\r\n"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"['Thu Jan 31 21:32:24 CET 2013']\n"
	]
	},
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>address</th>\n",
	" <th>direction</th>\n",
	" <th>street</th>\n",
	" <th>crossStreet</th>\n",
	" <th>intersection</th>\n",
	" <th>Location 1</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td> S CATON AVE & BENSON AVE</td>\n",
	" <td> N/B</td>\n",
	" <td> Caton Ave</td>\n",
	" <td> Benson Ave</td>\n",
	" <td> Caton Ave & Benson Ave</td>\n",
	" <td> (39.2693779962, -76.6688185297)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> S CATON AVE & BENSON AVE</td>\n",
	" <td> S/B</td>\n",
	" <td> Caton Ave</td>\n",
	" <td> Benson Ave</td>\n",
	" <td> Caton Ave & Benson Ave</td>\n",
	" <td> (39.2693157898, -76.6689698176)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>2</strong></td>\n",
	" <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n",
	" <td> E/B</td>\n",
	" <td> Wilkens Ave</td>\n",
	" <td> Pine Heights</td>\n",
	" <td> Wilkens Ave & Pine Heights</td>\n",
	" <td> (39.2720252302, -76.676960806)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>3</strong></td>\n",
	" <td> THE ALAMEDA & E 33RD ST</td>\n",
	" <td> S/B</td>\n",
	" <td> The Alameda</td>\n",
	" <td> 33rd St</td>\n",
	" <td> The Alameda & 33rd St</td>\n",
	" <td> (39.3285013141, -76.5953545714)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>4</strong></td>\n",
	" <td> E 33RD ST & THE ALAMEDA</td>\n",
	" <td> E/B</td>\n",
	" <td> E 33rd</td>\n",
	" <td> The Alameda</td>\n",
	" <td> E 33rd & The Alameda</td>\n",
	" <td> (39.3283410623, -76.5953594625)</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 4,
	"text": [
	" address direction street crossStreet \\\n",
	"0 S CATON AVE & BENSON AVE N/B Caton Ave Benson Ave \n",
	"1 S CATON AVE & BENSON AVE S/B Caton Ave Benson Ave \n",
	"2 WILKENS AVE & PINE HEIGHTS AVE E/B Wilkens Ave Pine Heights \n",
	"3 THE ALAMEDA & E 33RD ST S/B The Alameda 33rd St \n",
	"4 E 33RD ST & THE ALAMEDA E/B E 33rd The Alameda \n",
	"\n",
	" intersection Location 1 \n",
	"0 Caton Ave & Benson Ave (39.2693779962, -76.6688185297) \n",
	"1 Caton Ave & Benson Ave (39.2693157898, -76.6689698176) \n",
	"2 Wilkens Ave & Pine Heights (39.2720252302, -76.676960806) \n",
	"3 The Alameda & 33rd St (39.3285013141, -76.5953545714) \n",
	"4 E 33rd & The Alameda (39.3283410623, -76.5953594625) "
	]
	}
	],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# read Excel file\n",
	"# this is equivalent to R read.xlsx() and read.xlsx2() commands\n",
	"#\n",
	"# we need openpyxl 1.5.8 (don't use the latest version due to a bug) and xlrd packages\n",
	"# install with: \n",
	"# sudo pip install openpyxl==1.5.8\n",
	"# sudo pip install xlrd\n",
	"#\n",
	"# unfortunately Pandas ExcelFile() can't download and read at once (in contrast to read_csv())\n",
	"# so we need to resort to the basic Python way\n",
	"# also notice I'm using .xls; .xlsx doesn't work in my computer\n",
	"import urllib2\n",
	"\n",
	"# download the file as camera.xls and save it in ./data subfolder\n",
	"fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xls?accessType=DOWNLOAD'\n",
	"f = urllib2.urlopen(fileUrl)\n",
	"data = f.read()\n",
	"with open('./data/camera.xls', 'wb') as w:\n",
	" w.write(data)\n",
	"\n",
	"# load the Excel file as a pandas DataFrame\n",
	"cameraData = pd.ExcelFile('./data/camera.xls')\n",
	"cameraData = cameraData.parse('Baltimore Fixed Speed Cameras', index_col=None, na_values=['NA'])\n",
	"cameraData.head()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>address</th>\n",
	" <th>direction</th>\n",
	" <th>street</th>\n",
	" <th>crossStreet</th>\n",
	" <th>intersection</th>\n",
	" <th>Location 1</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td> S CATON AVE & BENSON AVE</td>\n",
	" <td> N/B</td>\n",
	" <td> Caton Ave</td>\n",
	" <td> Benson Ave</td>\n",
	" <td> Caton Ave & Benson Ave</td>\n",
	" <td> (39.2693779962, -76.6688185297)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> S CATON AVE & BENSON AVE</td>\n",
	" <td> S/B</td>\n",
	" <td> Caton Ave</td>\n",
	" <td> Benson Ave</td>\n",
	" <td> Caton Ave & Benson Ave</td>\n",
	" <td> (39.2693157898, -76.6689698176)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>2</strong></td>\n",
	" <td> WILKENS AVE & PINE HEIGHTS AVE</td>\n",
	" <td> E/B</td>\n",
	" <td> Wilkens Ave</td>\n",
	" <td> Pine Heights</td>\n",
	" <td> Wilkens Ave & Pine Heights</td>\n",
	" <td> (39.2720252302, -76.676960806)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>3</strong></td>\n",
	" <td> THE ALAMEDA & E 33RD ST</td>\n",
	" <td> S/B</td>\n",
	" <td> The Alameda</td>\n",
	" <td> 33rd St</td>\n",
	" <td> The Alameda & 33rd St</td>\n",
	" <td> (39.3285013141, -76.5953545714)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>4</strong></td>\n",
	" <td> E 33RD ST & THE ALAMEDA</td>\n",
	" <td> E/B</td>\n",
	" <td> E 33rd</td>\n",
	" <td> The Alameda</td>\n",
	" <td> E 33rd & The Alameda</td>\n",
	" <td> (39.3283410623, -76.5953594625)</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 5,
	"text": [
	" address direction street crossStreet \\\n",
	"0 S CATON AVE & BENSON AVE N/B Caton Ave Benson Ave \n",
	"1 S CATON AVE & BENSON AVE S/B Caton Ave Benson Ave \n",
	"2 WILKENS AVE & PINE HEIGHTS AVE E/B Wilkens Ave Pine Heights \n",
	"3 THE ALAMEDA & E 33RD ST S/B The Alameda 33rd St \n",
	"4 E 33RD ST & THE ALAMEDA E/B E 33rd The Alameda \n",
	"\n",
	" intersection Location 1 \n",
	"0 Caton Ave & Benson Ave (39.2693779962, -76.6688185297) \n",
	"1 Caton Ave & Benson Ave (39.2693157898, -76.6689698176) \n",
	"2 Wilkens Ave & Pine Heights (39.2720252302, -76.676960806) \n",
	"3 The Alameda & 33rd St (39.3285013141, -76.5953545714) \n",
	"4 E 33rd & The Alameda (39.3283410623, -76.5953594625) "
	]
	}
	],
	"prompt_number": 5
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# R readLines() for reading a text file is similar to standard Python file access\n",
	"# I'm not going to detail it here\n",
	"\n",
	"# R readLines() to read data from a website is similar to Python with urllib2 package as in xls example above\n",
	"\n",
	"# read json\n",
	"# this is equivalent to R fromJSON() command\n",
	"import json\n",
	"\n",
	"# first we get the json file from the website\n",
	"fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.json?accessType=DOWNLOAD'\n",
	"req = urllib2.Request(fileUrl)\n",
	"opener = urllib2.build_opener()\n",
	"f = opener.open(req)\n",
	"\n",
	"# then we read it into a data structure\n",
	"jsonCamera = json.loads(f.read())\n",
	"\n",
	"# json is loadad as dictionary\n",
	"print jsonCamera['meta']['view']['id']\n",
	"print jsonCamera['meta']['view']['name']\n",
	"print jsonCamera['meta']['view']['attribution']"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"dz54-2aru\n",
	"Baltimore Fixed Speed Cameras\n",
	"Department of Transportation\n"
	]
	}
	],
	"prompt_number": 6
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# writing data\n",
	"\n",
	"# first read the csv file\n",
	"cameraData = pd.read_csv('./data/cameras.csv')\n",
	"# take a subset of the columns\n",
	"tmpData = cameraData.ix[:,2:]\n",
	"# then save it to a different csv file\n",
	"# this is equivalent to R write.table() command\n",
	"tmpData.to_csv('./data/camerasModified.csv', sep=',', index=False)\n",
	"cameraData2 = pd.read_csv('./data/camerasModified.csv')\n",
	"cameraData2.head()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>street</th>\n",
	" <th>crossStreet</th>\n",
	" <th>intersection</th>\n",
	" <th>Location 1</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td> Caton Ave</td>\n",
	" <td> Benson Ave</td>\n",
	" <td> Caton Ave & Benson Ave</td>\n",
	" <td> (39.2693779962, -76.6688185297)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> Caton Ave</td>\n",
	" <td> Benson Ave</td>\n",
	" <td> Caton Ave & Benson Ave</td>\n",
	" <td> (39.2693157898, -76.6689698176)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>2</strong></td>\n",
	" <td> Wilkens Ave</td>\n",
	" <td> Pine Heights</td>\n",
	" <td> Wilkens Ave & Pine Heights</td>\n",
	" <td> (39.2720252302, -76.676960806)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>3</strong></td>\n",
	" <td> The Alameda</td>\n",
	" <td> 33rd St</td>\n",
	" <td> The Alameda & 33rd St</td>\n",
	" <td> (39.3285013141, -76.5953545714)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>4</strong></td>\n",
	" <td> E 33rd</td>\n",
	" <td> The Alameda</td>\n",
	" <td> E 33rd & The Alameda</td>\n",
	" <td> (39.3283410623, -76.5953594625)</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 7,
	"text": [
	" street crossStreet intersection \\\n",
	"0 Caton Ave Benson Ave Caton Ave & Benson Ave \n",
	"1 Caton Ave Benson Ave Caton Ave & Benson Ave \n",
	"2 Wilkens Ave Pine Heights Wilkens Ave & Pine Heights \n",
	"3 The Alameda 33rd St The Alameda & 33rd St \n",
	"4 E 33rd The Alameda E 33rd & The Alameda \n",
	"\n",
	" Location 1 \n",
	"0 (39.2693779962, -76.6688185297) \n",
	"1 (39.2693157898, -76.6689698176) \n",
	"2 (39.2720252302, -76.676960806) \n",
	"3 (39.3285013141, -76.5953545714) \n",
	"4 (39.3283410623, -76.5953594625) "
	]
	}
	],
	"prompt_number": 7
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# I don't find the equivalent of R's saving and loading workspace commands"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 8
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# R's paste() and paste0() commands look like standard Python's string manipulations\n",
	"print ['./data' + str(i) + '.csv' for i in range(1, 6)]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"['./data1.csv', './data2.csv', './data3.csv', './data4.csv', './data5.csv']\n"
	]
	}
	],
	"prompt_number": 9
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# getting data off webpages\n",
	"from lxml.html import parse\n",
	"\n",
	"url = 'http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en'\n",
	"\n",
	"# this is equivalent to the combined R's opening/reading/closing connection and htmlTreeParse() commands\n",
	"html3 = parse(url).getroot()\n",
	"\n",
	"# get the title text using xpath expression\n",
	"# this is equivalent to R xpathSApply() command\n",
	"title = html3.xpath('//title')\n",
	"print [x.text_content() for x in title]\n",
	"\n",
	"# get the texts of col-citedby elements using xpath expression\n",
	"citedby = html3.xpath(\"//td[@id='col-citedby']\")\n",
	"print [x.text_content() for x in citedby]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"['Jeff Leek - Google Scholar Citations']\n",
	"['Cited by', '337', '172', '140', '133', '107', '95', '79', '78', '53', '17', '10', '9', '9', '8', '8', '6', '6', '6', '5', '3']\n"
	]
	}
	],
	"prompt_number": 10
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [],
	"language": "python",
	"metadata": {},
	"outputs": []
	}
	],
	"metadata": {}
	}
	]
	}