Created
January 31, 2013 21:14
-
-
Save herrfz/4686521 to your computer and use it in GitHub Desktop.
Coursera Data Analysis -- in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "summarizing_data" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Summarizing data -- in Python" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Earthquake data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"fileUrl = 'http://earthquake.usgs.gov/earthquakes/catalogs/eqs7day-M1.txt'\n", | |
"\n", | |
"eData = pd.read_csv(fileUrl)\n", | |
"\n", | |
"dateDownloaded = !date\n", | |
"dateDownloaded" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 15, | |
"text": [ | |
"['Thu Jan 31 22:11:48 CET 2013']" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# looking at data\n", | |
"# for large data, only a summary is shown\n", | |
"eData" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 16, | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 891 entries, 0 to 890\n", | |
"Data columns:\n", | |
"Src 891 non-null values\n", | |
"Eqid 891 non-null values\n", | |
"Version 891 non-null values\n", | |
"Datetime 891 non-null values\n", | |
"Lat 891 non-null values\n", | |
"Lon 891 non-null values\n", | |
"Magnitude 891 non-null values\n", | |
"Depth 891 non-null values\n", | |
"NST 891 non-null values\n", | |
"Region 891 non-null values\n", | |
"dtypes: float64(4), int64(1), object(5)" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"eData.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Src</th>\n", | |
" <th>Eqid</th>\n", | |
" <th>Version</th>\n", | |
" <th>Datetime</th>\n", | |
" <th>Lat</th>\n", | |
" <th>Lon</th>\n", | |
" <th>Magnitude</th>\n", | |
" <th>Depth</th>\n", | |
" <th>NST</th>\n", | |
" <th>Region</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>0</strong></td>\n", | |
" <td> ci</td>\n", | |
" <td> 15279625</td>\n", | |
" <td> 0</td>\n", | |
" <td> Thursday, January 31, 2013 21:00:41 UTC</td>\n", | |
" <td> 34.3583</td>\n", | |
" <td>-118.7433</td>\n", | |
" <td> 1.9</td>\n", | |
" <td> 5.1</td>\n", | |
" <td> 18</td>\n", | |
" <td> Greater Los Angeles area, California</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>1</strong></td>\n", | |
" <td> ak</td>\n", | |
" <td> 10648034</td>\n", | |
" <td> 1</td>\n", | |
" <td> Thursday, January 31, 2013 20:31:09 UTC</td>\n", | |
" <td> 64.6695</td>\n", | |
" <td>-146.9074</td>\n", | |
" <td> 1.0</td>\n", | |
" <td> 10.1</td>\n", | |
" <td> 5</td>\n", | |
" <td> Central Alaska</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>2</strong></td>\n", | |
" <td> ak</td>\n", | |
" <td> 10648023</td>\n", | |
" <td> 1</td>\n", | |
" <td> Thursday, January 31, 2013 20:19:10 UTC</td>\n", | |
" <td> 62.1891</td>\n", | |
" <td>-145.5031</td>\n", | |
" <td> 1.7</td>\n", | |
" <td> 10.0</td>\n", | |
" <td> 13</td>\n", | |
" <td> Central Alaska</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>3</strong></td>\n", | |
" <td> nn</td>\n", | |
" <td> 00401418</td>\n", | |
" <td> 9</td>\n", | |
" <td> Thursday, January 31, 2013 20:13:01 UTC</td>\n", | |
" <td> 37.3921</td>\n", | |
" <td>-116.9290</td>\n", | |
" <td> 1.3</td>\n", | |
" <td> 13.4</td>\n", | |
" <td> 13</td>\n", | |
" <td> Nevada</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>4</strong></td>\n", | |
" <td> nc</td>\n", | |
" <td> 71932391</td>\n", | |
" <td> 0</td>\n", | |
" <td> Thursday, January 31, 2013 19:59:34 UTC</td>\n", | |
" <td> 38.8353</td>\n", | |
" <td>-122.7833</td>\n", | |
" <td> 1.4</td>\n", | |
" <td> 2.4</td>\n", | |
" <td> 22</td>\n", | |
" <td> Northern California</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 17, | |
"text": [ | |
" Src Eqid Version Datetime Lat Lon \\\n", | |
"0 ci 15279625 0 Thursday, January 31, 2013 21:00:41 UTC 34.3583 -118.7433 \n", | |
"1 ak 10648034 1 Thursday, January 31, 2013 20:31:09 UTC 64.6695 -146.9074 \n", | |
"2 ak 10648023 1 Thursday, January 31, 2013 20:19:10 UTC 62.1891 -145.5031 \n", | |
"3 nn 00401418 9 Thursday, January 31, 2013 20:13:01 UTC 37.3921 -116.9290 \n", | |
"4 nc 71932391 0 Thursday, January 31, 2013 19:59:34 UTC 38.8353 -122.7833 \n", | |
"\n", | |
" Magnitude Depth NST Region \n", | |
"0 1.9 5.1 18 Greater Los Angeles area, California \n", | |
"1 1.0 10.1 5 Central Alaska \n", | |
"2 1.7 10.0 13 Central Alaska \n", | |
"3 1.3 13.4 13 Nevada \n", | |
"4 1.4 2.4 22 Northern California " | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R dim()\n", | |
"eData.shape" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 18, | |
"text": [ | |
"(891, 10)" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R names()\n", | |
"eData.columns" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 19, | |
"text": [ | |
"Index([Src, Eqid, Version, Datetime, Lat, Lon, Magnitude, Depth, NST, Region], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# computing quantiles is not as straightforward as R's quantile()\n", | |
"p = [0, 0.25, 0.5, 0.75, 1]\n", | |
"[eData['Lat'].quantile(q=i) for i in p]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 20, | |
"text": [ | |
"[-58.834400000000002,\n", | |
" 34.195750000000004,\n", | |
" 38.790500000000002,\n", | |
" 55.290900000000001,\n", | |
" 65.953999999999994]" | |
] | |
} | |
], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to (but not as detailed as) R's summary()\n", | |
"eData.describe()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Lat</th>\n", | |
" <th>Lon</th>\n", | |
" <th>Magnitude</th>\n", | |
" <th>Depth</th>\n", | |
" <th>NST</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>count</strong></td>\n", | |
" <td> 891.000000</td>\n", | |
" <td> 891.000000</td>\n", | |
" <td> 891.000000</td>\n", | |
" <td> 891.000000</td>\n", | |
" <td> 891.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>mean</strong></td>\n", | |
" <td> 39.979868</td>\n", | |
" <td>-109.265021</td>\n", | |
" <td> 2.048373</td>\n", | |
" <td> 23.753311</td>\n", | |
" <td> 31.907969</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>std</strong></td>\n", | |
" <td> 19.308071</td>\n", | |
" <td> 70.236505</td>\n", | |
" <td> 1.115400</td>\n", | |
" <td> 45.972455</td>\n", | |
" <td> 50.079515</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>min</strong></td>\n", | |
" <td> -58.834400</td>\n", | |
" <td>-179.968200</td>\n", | |
" <td> 1.000000</td>\n", | |
" <td> 0.000000</td>\n", | |
" <td> 0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>25%</strong></td>\n", | |
" <td> 34.195750</td>\n", | |
" <td>-148.334600</td>\n", | |
" <td> 1.300000</td>\n", | |
" <td> 4.000000</td>\n", | |
" <td> 11.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>50%</strong></td>\n", | |
" <td> 38.790500</td>\n", | |
" <td>-122.121300</td>\n", | |
" <td> 1.700000</td>\n", | |
" <td> 9.400000</td>\n", | |
" <td> 18.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>75%</strong></td>\n", | |
" <td> 55.290900</td>\n", | |
" <td>-116.736400</td>\n", | |
" <td> 2.300000</td>\n", | |
" <td> 23.350000</td>\n", | |
" <td> 33.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>max</strong></td>\n", | |
" <td> 65.954000</td>\n", | |
" <td> 179.909300</td>\n", | |
" <td> 6.800000</td>\n", | |
" <td> 585.200000</td>\n", | |
" <td> 608.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 21, | |
"text": [ | |
" Lat Lon Magnitude Depth NST\n", | |
"count 891.000000 891.000000 891.000000 891.000000 891.000000\n", | |
"mean 39.979868 -109.265021 2.048373 23.753311 31.907969\n", | |
"std 19.308071 70.236505 1.115400 45.972455 50.079515\n", | |
"min -58.834400 -179.968200 1.000000 0.000000 0.000000\n", | |
"25% 34.195750 -148.334600 1.300000 4.000000 11.000000\n", | |
"50% 38.790500 -122.121300 1.700000 9.400000 18.000000\n", | |
"75% 55.290900 -116.736400 2.300000 23.350000 33.000000\n", | |
"max 65.954000 179.909300 6.800000 585.200000 608.000000" | |
] | |
} | |
], | |
"prompt_number": 21 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R's class() is equivalent to type()\n", | |
"type(eData)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 22, | |
"text": [ | |
"pandas.core.frame.DataFrame" | |
] | |
} | |
], | |
"prompt_number": 22 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# get the data types of all columns\n", | |
"# similar to the method used in the video,\n", | |
"# we just apply the function type() to values in a row\n", | |
"# the zip() call is just for nice printing\n", | |
"zip(eData.columns, [type(x) for x in eData.ix[0,:]])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 23, | |
"text": [ | |
"[('Src', str),\n", | |
" ('Eqid', str),\n", | |
" ('Version', str),\n", | |
" ('Datetime', str),\n", | |
" ('Lat', numpy.float64),\n", | |
" ('Lon', numpy.float64),\n", | |
" ('Magnitude', numpy.float64),\n", | |
" ('Depth', numpy.float64),\n", | |
" ('NST', numpy.int64),\n", | |
" ('Region', str)]" | |
] | |
} | |
], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R's unique() command\n", | |
"eData['Src'].unique()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 24, | |
"text": [ | |
"array([ci, ak, nn, nc, us, nm, uu, hv, uw, pr, se, mb], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 24 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R's length() command\n", | |
"len(eData['Src'].unique())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 25, | |
"text": [ | |
"12" | |
] | |
} | |
], | |
"prompt_number": 25 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# for this particular case, value_counts() is equivalent to R's table()\n", | |
"eData['Src'].value_counts() \n", | |
"\n", | |
"# or alternatively: pd.crosstab(eData['Src'], [])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 26, | |
"text": [ | |
"ak 264\n", | |
"nc 241\n", | |
"ci 122\n", | |
"us 99\n", | |
"nn 46\n", | |
"hv 30\n", | |
"pr 28\n", | |
"uu 25\n", | |
"uw 22\n", | |
"mb 6\n", | |
"nm 6\n", | |
"se 2" | |
] | |
} | |
], | |
"prompt_number": 26 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R's table() to compute frequency table\n", | |
"pd.crosstab(eData['Src'], eData['Version'])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>Version</th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" <th>4</th>\n", | |
" <th>5</th>\n", | |
" <th>6</th>\n", | |
" <th>7</th>\n", | |
" <th>8</th>\n", | |
" <th>9</th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" <th>D</th>\n", | |
" <th>E</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Src</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>ak</strong></td>\n", | |
" <td> 0</td>\n", | |
" <td> 110</td>\n", | |
" <td> 144</td>\n", | |
" <td> 10</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>ci</strong></td>\n", | |
" <td> 35</td>\n", | |
" <td> 0</td>\n", | |
" <td> 82</td>\n", | |
" <td> 3</td>\n", | |
" <td> 2</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>hv</strong></td>\n", | |
" <td> 0</td>\n", | |
" <td> 14</td>\n", | |
" <td> 12</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 3</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>mb</strong></td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>nc</strong></td>\n", | |
" <td> 77</td>\n", | |
" <td> 54</td>\n", | |
" <td> 63</td>\n", | |
" <td> 26</td>\n", | |
" <td> 10</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 3</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>nm</strong></td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>nn</strong></td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 45</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>pr</strong></td>\n", | |
" <td> 28</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>se</strong></td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 2</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>us</strong></td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 3</td>\n", | |
" <td> 12</td>\n", | |
" <td> 16</td>\n", | |
" <td> 24</td>\n", | |
" <td> 10</td>\n", | |
" <td> 15</td>\n", | |
" <td> 10</td>\n", | |
" <td> 4</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>uu</strong></td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 10</td>\n", | |
" <td> 2</td>\n", | |
" <td> 12</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>uw</strong></td>\n", | |
" <td> 0</td>\n", | |
" <td> 10</td>\n", | |
" <td> 4</td>\n", | |
" <td> 8</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 27, | |
"text": [ | |
"Version 0 1 2 3 4 5 6 7 8 9 A B C D E\n", | |
"Src \n", | |
"ak 0 110 144 10 0 0 0 0 0 0 0 0 0 0 0\n", | |
"ci 35 0 82 3 2 0 0 0 0 0 0 0 0 0 0\n", | |
"hv 0 14 12 0 1 3 0 0 0 0 0 0 0 0 0\n", | |
"mb 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
"nc 77 54 63 26 10 5 1 0 3 1 1 0 0 0 0\n", | |
"nm 0 0 0 0 0 0 0 0 0 0 5 1 0 0 0\n", | |
"nn 0 1 0 0 0 0 0 0 0 45 0 0 0 0 0\n", | |
"pr 28 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", | |
"se 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0\n", | |
"us 0 0 1 3 12 16 24 10 15 10 4 1 1 1 1\n", | |
"uu 0 0 10 2 12 1 0 0 0 0 0 0 0 0 0\n", | |
"uw 0 10 4 8 0 0 0 0 0 0 0 0 0 0 0" | |
] | |
} | |
], | |
"prompt_number": 27 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"eData.ix[0:9,'Lat']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 28, | |
"text": [ | |
"0 34.3583\n", | |
"1 64.6695\n", | |
"2 62.1891\n", | |
"3 37.3921\n", | |
"4 38.8353\n", | |
"5 40.9122\n", | |
"6 61.3245\n", | |
"7 62.7546\n", | |
"8 61.1027\n", | |
"9 33.3077\n", | |
"Name: Lat" | |
] | |
} | |
], | |
"prompt_number": 28 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"eData.ix[0:9,'Lat'] > 40" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 29, | |
"text": [ | |
"0 False\n", | |
"1 True\n", | |
"2 True\n", | |
"3 False\n", | |
"4 False\n", | |
"5 True\n", | |
"6 True\n", | |
"7 True\n", | |
"8 True\n", | |
"9 False\n", | |
"Name: Lat" | |
] | |
} | |
], | |
"prompt_number": 29 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R's any()\n", | |
"(eData.ix[0:9,'Lat'] > 40).any()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 30, | |
"text": [ | |
"True" | |
] | |
} | |
], | |
"prompt_number": 30 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R's all()\n", | |
"(eData.ix[0:9,'Lat'] > 40).all()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 31, | |
"text": [ | |
"False" | |
] | |
} | |
], | |
"prompt_number": 31 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# looking at subsets; very similar to R's & operator\n", | |
"eData[(eData['Lat'] > 0) & (eData['Lon'] > 0)][['Lat', 'Lon']][:10]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Lat</th>\n", | |
" <th>Lon</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>12 </strong></td>\n", | |
" <td> 35.9827</td>\n", | |
" <td> 139.7463</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>17 </strong></td>\n", | |
" <td> 36.7675</td>\n", | |
" <td> 140.4900</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>56 </strong></td>\n", | |
" <td> 39.8641</td>\n", | |
" <td> 19.7467</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>87 </strong></td>\n", | |
" <td> 49.8770</td>\n", | |
" <td> 150.7514</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>141</strong></td>\n", | |
" <td> 36.6463</td>\n", | |
" <td> 96.5500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>154</strong></td>\n", | |
" <td> 10.5744</td>\n", | |
" <td> 126.9572</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>164</strong></td>\n", | |
" <td> 16.9495</td>\n", | |
" <td> 96.1267</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>167</strong></td>\n", | |
" <td> 10.4223</td>\n", | |
" <td> 126.9224</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>184</strong></td>\n", | |
" <td> 32.8470</td>\n", | |
" <td> 94.7063</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>197</strong></td>\n", | |
" <td> 38.7662</td>\n", | |
" <td> 23.3660</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 32, | |
"text": [ | |
" Lat Lon\n", | |
"12 35.9827 139.7463\n", | |
"17 36.7675 140.4900\n", | |
"56 39.8641 19.7467\n", | |
"87 49.8770 150.7514\n", | |
"141 36.6463 96.5500\n", | |
"154 10.5744 126.9572\n", | |
"164 16.9495 96.1267\n", | |
"167 10.4223 126.9224\n", | |
"184 32.8470 94.7063\n", | |
"197 38.7662 23.3660" | |
] | |
} | |
], | |
"prompt_number": 32 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# looking at subsets; very similar to R's | operator\n", | |
"eData[(eData['Lat'] > 0) | (eData['Lon'] > 0)][['Lat', 'Lon']][-10:]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Lat</th>\n", | |
" <th>Lon</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>881</strong></td>\n", | |
" <td> 38.8237</td>\n", | |
" <td>-122.7510</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>882</strong></td>\n", | |
" <td> 36.5673</td>\n", | |
" <td>-121.1177</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>883</strong></td>\n", | |
" <td> 38.1566</td>\n", | |
" <td>-118.0268</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>884</strong></td>\n", | |
" <td> 56.9073</td>\n", | |
" <td>-157.7617</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>885</strong></td>\n", | |
" <td> 59.8094</td>\n", | |
" <td>-152.6787</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>886</strong></td>\n", | |
" <td> 38.8283</td>\n", | |
" <td>-122.8507</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>887</strong></td>\n", | |
" <td> 61.2522</td>\n", | |
" <td>-149.6330</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>888</strong></td>\n", | |
" <td> 53.0663</td>\n", | |
" <td>-166.7094</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>889</strong></td>\n", | |
" <td> 38.1543</td>\n", | |
" <td>-118.0146</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>890</strong></td>\n", | |
" <td> 32.5852</td>\n", | |
" <td>-115.7117</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 33, | |
"text": [ | |
" Lat Lon\n", | |
"881 38.8237 -122.7510\n", | |
"882 36.5673 -121.1177\n", | |
"883 38.1566 -118.0268\n", | |
"884 56.9073 -157.7617\n", | |
"885 59.8094 -152.6787\n", | |
"886 38.8283 -122.8507\n", | |
"887 61.2522 -149.6330\n", | |
"888 53.0663 -166.7094\n", | |
"889 38.1543 -118.0146\n", | |
"890 32.5852 -115.7117" | |
] | |
} | |
], | |
"prompt_number": 33 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Peer review experiment data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'\n", | |
"fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'\n", | |
"\n", | |
"reviews = pd.read_csv(fileUrl1)\n", | |
"solutions = pd.read_csv(fileUrl2)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 34 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"reviews.head(2)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>solution_id</th>\n", | |
" <th>reviewer_id</th>\n", | |
" <th>start</th>\n", | |
" <th>stop</th>\n", | |
" <th>time_left</th>\n", | |
" <th>accept</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>0</strong></td>\n", | |
" <td> 1</td>\n", | |
" <td> 3</td>\n", | |
" <td> 27</td>\n", | |
" <td> 1304095698</td>\n", | |
" <td> 1304095758</td>\n", | |
" <td> 1754</td>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>1</strong></td>\n", | |
" <td> 2</td>\n", | |
" <td> 4</td>\n", | |
" <td> 22</td>\n", | |
" <td> 1304095188</td>\n", | |
" <td> 1304095206</td>\n", | |
" <td> 2306</td>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 35, | |
"text": [ | |
" id solution_id reviewer_id start stop time_left accept\n", | |
"0 1 3 27 1304095698 1304095758 1754 1\n", | |
"1 2 4 22 1304095188 1304095206 2306 1" | |
] | |
} | |
], | |
"prompt_number": 35 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"solutions.head(2)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>problem_id</th>\n", | |
" <th>subject_id</th>\n", | |
" <th>start</th>\n", | |
" <th>stop</th>\n", | |
" <th>time_left</th>\n", | |
" <th>answer</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>0</strong></td>\n", | |
" <td> 1</td>\n", | |
" <td> 156</td>\n", | |
" <td> 29</td>\n", | |
" <td> 1304095119</td>\n", | |
" <td> 1304095169</td>\n", | |
" <td> 2343</td>\n", | |
" <td> B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>1</strong></td>\n", | |
" <td> 2</td>\n", | |
" <td> 269</td>\n", | |
" <td> 25</td>\n", | |
" <td> 1304095119</td>\n", | |
" <td> 1304095183</td>\n", | |
" <td> 2329</td>\n", | |
" <td> C</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 36, | |
"text": [ | |
" id problem_id subject_id start stop time_left answer\n", | |
"0 1 156 29 1304095119 1304095169 2343 B\n", | |
"1 2 269 25 1304095119 1304095183 2329 C" | |
] | |
} | |
], | |
"prompt_number": 36 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# find if there are missing values; equivalent to R's is.na()\n", | |
"reviews.ix[0:9,'time_left'].isnull()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 37, | |
"text": [ | |
"0 False\n", | |
"1 False\n", | |
"2 False\n", | |
"3 False\n", | |
"4 False\n", | |
"5 False\n", | |
"6 False\n", | |
"7 True\n", | |
"8 False\n", | |
"9 False\n", | |
"Name: time_left" | |
] | |
} | |
], | |
"prompt_number": 37 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"reviews['time_left'].isnull().sum()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 38, | |
"text": [ | |
"84" | |
] | |
} | |
], | |
"prompt_number": 38 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"reviews['time_left'].isnull().value_counts()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 39, | |
"text": [ | |
"False 115\n", | |
"True 84" | |
] | |
} | |
], | |
"prompt_number": 39 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R's colSums()\n", | |
"# note that we must explicitly include missing values with the option skipna=False,\n", | |
"# because pandas skips them by default; this is contrary to R, where the default is to *include* NA\n", | |
"reviews.sum(skipna=False) " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 40, | |
"text": [ | |
"id 19900\n", | |
"solution_id 19929\n", | |
"reviewer_id 5064\n", | |
"start NaN\n", | |
"stop NaN\n", | |
"time_left NaN\n", | |
"accept NaN" | |
] | |
} | |
], | |
"prompt_number": 40 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R's colMeans()\n", | |
"# same remark as above: here missing values are by default excluded\n", | |
"reviews.mean()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 41, | |
"text": [ | |
"id 1.000000e+02\n", | |
"solution_id 1.001457e+02\n", | |
"reviewer_id 2.544724e+01\n", | |
"start 1.304096e+09\n", | |
"stop 1.304096e+09\n", | |
"time_left 1.114287e+03\n", | |
"accept 6.434783e-01" | |
] | |
} | |
], | |
"prompt_number": 41 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# equivalent to R's rowMeans()\n", | |
"reviews.mean(axis=1)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 42, | |
"text": [ | |
"0 3.725990e+08\n", | |
"1 3.725990e+08\n", | |
"2 3.725990e+08\n", | |
"3 3.725990e+08\n", | |
"4 3.725990e+08\n", | |
"5 3.725990e+08\n", | |
"6 3.725990e+08\n", | |
"7 1.300000e+01\n", | |
"8 3.725990e+08\n", | |
"9 3.725990e+08\n", | |
"10 3.725990e+08\n", | |
"11 3.725990e+08\n", | |
"12 3.725990e+08\n", | |
"13 3.725990e+08\n", | |
"14 3.725990e+08\n", | |
"...\n", | |
"184 1.326667e+02\n", | |
"185 1.333333e+02\n", | |
"186 1.343333e+02\n", | |
"187 1.340000e+02\n", | |
"188 3.725993e+08\n", | |
"189 1.356667e+02\n", | |
"190 1.370000e+02\n", | |
"191 3.725993e+08\n", | |
"192 1.390000e+02\n", | |
"193 1.383333e+02\n", | |
"194 1.366667e+02\n", | |
"195 1.410000e+02\n", | |
"196 1.396667e+02\n", | |
"197 1.420000e+02\n", | |
"198 1.393333e+02\n", | |
"Length: 199" | |
] | |
} | |
], | |
"prompt_number": 42 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment