Skip to content

Instantly share code, notes, and snippets.

@herrfz
Created January 31, 2013 21:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save herrfz/4686521 to your computer and use it in GitHub Desktop.
Save herrfz/4686521 to your computer and use it in GitHub Desktop.
Coursera Data Analysis -- in Python
{
"metadata": {
"name": "summarizing_data"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Summarizing data -- in Python"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"import numpy as np"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 14
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Earthquake data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileUrl = 'http://earthquake.usgs.gov/earthquakes/catalogs/eqs7day-M1.txt'\n",
"\n",
"eData = pd.read_csv(fileUrl)\n",
"\n",
"dateDownloaded = !date\n",
"dateDownloaded"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 15,
"text": [
"['Thu Jan 31 22:11:48 CET 2013']"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# looking at data\n",
"# for large data, only a summary is shown\n",
"eData"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 16,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 891 entries, 0 to 890\n",
"Data columns:\n",
"Src 891 non-null values\n",
"Eqid 891 non-null values\n",
"Version 891 non-null values\n",
"Datetime 891 non-null values\n",
"Lat 891 non-null values\n",
"Lon 891 non-null values\n",
"Magnitude 891 non-null values\n",
"Depth 891 non-null values\n",
"NST 891 non-null values\n",
"Region 891 non-null values\n",
"dtypes: float64(4), int64(1), object(5)"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"eData.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Src</th>\n",
" <th>Eqid</th>\n",
" <th>Version</th>\n",
" <th>Datetime</th>\n",
" <th>Lat</th>\n",
" <th>Lon</th>\n",
" <th>Magnitude</th>\n",
" <th>Depth</th>\n",
" <th>NST</th>\n",
" <th>Region</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> ci</td>\n",
" <td> 15279625</td>\n",
" <td> 0</td>\n",
" <td> Thursday, January 31, 2013 21:00:41 UTC</td>\n",
" <td> 34.3583</td>\n",
" <td>-118.7433</td>\n",
" <td> 1.9</td>\n",
" <td> 5.1</td>\n",
" <td> 18</td>\n",
" <td> Greater Los Angeles area, California</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> ak</td>\n",
" <td> 10648034</td>\n",
" <td> 1</td>\n",
" <td> Thursday, January 31, 2013 20:31:09 UTC</td>\n",
" <td> 64.6695</td>\n",
" <td>-146.9074</td>\n",
" <td> 1.0</td>\n",
" <td> 10.1</td>\n",
" <td> 5</td>\n",
" <td> Central Alaska</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>2</strong></td>\n",
" <td> ak</td>\n",
" <td> 10648023</td>\n",
" <td> 1</td>\n",
" <td> Thursday, January 31, 2013 20:19:10 UTC</td>\n",
" <td> 62.1891</td>\n",
" <td>-145.5031</td>\n",
" <td> 1.7</td>\n",
" <td> 10.0</td>\n",
" <td> 13</td>\n",
" <td> Central Alaska</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>3</strong></td>\n",
" <td> nn</td>\n",
" <td> 00401418</td>\n",
" <td> 9</td>\n",
" <td> Thursday, January 31, 2013 20:13:01 UTC</td>\n",
" <td> 37.3921</td>\n",
" <td>-116.9290</td>\n",
" <td> 1.3</td>\n",
" <td> 13.4</td>\n",
" <td> 13</td>\n",
" <td> Nevada</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>4</strong></td>\n",
" <td> nc</td>\n",
" <td> 71932391</td>\n",
" <td> 0</td>\n",
" <td> Thursday, January 31, 2013 19:59:34 UTC</td>\n",
" <td> 38.8353</td>\n",
" <td>-122.7833</td>\n",
" <td> 1.4</td>\n",
" <td> 2.4</td>\n",
" <td> 22</td>\n",
" <td> Northern California</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 17,
"text": [
" Src Eqid Version Datetime Lat Lon \\\n",
"0 ci 15279625 0 Thursday, January 31, 2013 21:00:41 UTC 34.3583 -118.7433 \n",
"1 ak 10648034 1 Thursday, January 31, 2013 20:31:09 UTC 64.6695 -146.9074 \n",
"2 ak 10648023 1 Thursday, January 31, 2013 20:19:10 UTC 62.1891 -145.5031 \n",
"3 nn 00401418 9 Thursday, January 31, 2013 20:13:01 UTC 37.3921 -116.9290 \n",
"4 nc 71932391 0 Thursday, January 31, 2013 19:59:34 UTC 38.8353 -122.7833 \n",
"\n",
" Magnitude Depth NST Region \n",
"0 1.9 5.1 18 Greater Los Angeles area, California \n",
"1 1.0 10.1 5 Central Alaska \n",
"2 1.7 10.0 13 Central Alaska \n",
"3 1.3 13.4 13 Nevada \n",
"4 1.4 2.4 22 Northern California "
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R dim()\n",
"eData.shape"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 18,
"text": [
"(891, 10)"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R names()\n",
"eData.columns"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 19,
"text": [
"Index([Src, Eqid, Version, Datetime, Lat, Lon, Magnitude, Depth, NST, Region], dtype=object)"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# computing quantiles is not as straightforward as R's quantile()\n",
"p = [0, 0.25, 0.5, 0.75, 1]\n",
"[eData['Lat'].quantile(q=i) for i in p]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 20,
"text": [
"[-58.834400000000002,\n",
" 34.195750000000004,\n",
" 38.790500000000002,\n",
" 55.290900000000001,\n",
" 65.953999999999994]"
]
}
],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to (but not as detailed as) R's summary()\n",
"eData.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Lat</th>\n",
" <th>Lon</th>\n",
" <th>Magnitude</th>\n",
" <th>Depth</th>\n",
" <th>NST</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>count</strong></td>\n",
" <td> 891.000000</td>\n",
" <td> 891.000000</td>\n",
" <td> 891.000000</td>\n",
" <td> 891.000000</td>\n",
" <td> 891.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>mean</strong></td>\n",
" <td> 39.979868</td>\n",
" <td>-109.265021</td>\n",
" <td> 2.048373</td>\n",
" <td> 23.753311</td>\n",
" <td> 31.907969</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>std</strong></td>\n",
" <td> 19.308071</td>\n",
" <td> 70.236505</td>\n",
" <td> 1.115400</td>\n",
" <td> 45.972455</td>\n",
" <td> 50.079515</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>min</strong></td>\n",
" <td> -58.834400</td>\n",
" <td>-179.968200</td>\n",
" <td> 1.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>25%</strong></td>\n",
" <td> 34.195750</td>\n",
" <td>-148.334600</td>\n",
" <td> 1.300000</td>\n",
" <td> 4.000000</td>\n",
" <td> 11.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>50%</strong></td>\n",
" <td> 38.790500</td>\n",
" <td>-122.121300</td>\n",
" <td> 1.700000</td>\n",
" <td> 9.400000</td>\n",
" <td> 18.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>75%</strong></td>\n",
" <td> 55.290900</td>\n",
" <td>-116.736400</td>\n",
" <td> 2.300000</td>\n",
" <td> 23.350000</td>\n",
" <td> 33.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>max</strong></td>\n",
" <td> 65.954000</td>\n",
" <td> 179.909300</td>\n",
" <td> 6.800000</td>\n",
" <td> 585.200000</td>\n",
" <td> 608.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 21,
"text": [
" Lat Lon Magnitude Depth NST\n",
"count 891.000000 891.000000 891.000000 891.000000 891.000000\n",
"mean 39.979868 -109.265021 2.048373 23.753311 31.907969\n",
"std 19.308071 70.236505 1.115400 45.972455 50.079515\n",
"min -58.834400 -179.968200 1.000000 0.000000 0.000000\n",
"25% 34.195750 -148.334600 1.300000 4.000000 11.000000\n",
"50% 38.790500 -122.121300 1.700000 9.400000 18.000000\n",
"75% 55.290900 -116.736400 2.300000 23.350000 33.000000\n",
"max 65.954000 179.909300 6.800000 585.200000 608.000000"
]
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R's class() is equivalent to type()\n",
"type(eData)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 22,
"text": [
"pandas.core.frame.DataFrame"
]
}
],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# get the data types of all columns\n",
"# similar to the method used in the video,\n",
"# we just apply the function type() to the values in the first row\n",
"# the zip() function is just for nice printing\n",
"zip(eData.columns, [type(x) for x in eData.ix[0,:]])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 23,
"text": [
"[('Src', str),\n",
" ('Eqid', str),\n",
" ('Version', str),\n",
" ('Datetime', str),\n",
" ('Lat', numpy.float64),\n",
" ('Lon', numpy.float64),\n",
" ('Magnitude', numpy.float64),\n",
" ('Depth', numpy.float64),\n",
" ('NST', numpy.int64),\n",
" ('Region', str)]"
]
}
],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's unique() command\n",
"eData['Src'].unique()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 24,
"text": [
"array([ci, ak, nn, nc, us, nm, uu, hv, uw, pr, se, mb], dtype=object)"
]
}
],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's length() command\n",
"len(eData['Src'].unique())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 25,
"text": [
"12"
]
}
],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# for this particular case, value_counts() is equivalent to R's table()\n",
"eData['Src'].value_counts() \n",
"\n",
"# or alternatively: pd.crosstab(eData['Src'], [])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 26,
"text": [
"ak 264\n",
"nc 241\n",
"ci 122\n",
"us 99\n",
"nn 46\n",
"hv 30\n",
"pr 28\n",
"uu 25\n",
"uw 22\n",
"mb 6\n",
"nm 6\n",
"se 2"
]
}
],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's table() to compute frequency table\n",
"pd.crosstab(eData['Src'], eData['Version'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Version</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>E</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Src</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>ak</strong></td>\n",
" <td> 0</td>\n",
" <td> 110</td>\n",
" <td> 144</td>\n",
" <td> 10</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>ci</strong></td>\n",
" <td> 35</td>\n",
" <td> 0</td>\n",
" <td> 82</td>\n",
" <td> 3</td>\n",
" <td> 2</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>hv</strong></td>\n",
" <td> 0</td>\n",
" <td> 14</td>\n",
" <td> 12</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>mb</strong></td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>nc</strong></td>\n",
" <td> 77</td>\n",
" <td> 54</td>\n",
" <td> 63</td>\n",
" <td> 26</td>\n",
" <td> 10</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>nm</strong></td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>nn</strong></td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 45</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>pr</strong></td>\n",
" <td> 28</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>se</strong></td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 2</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>us</strong></td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 12</td>\n",
" <td> 16</td>\n",
" <td> 24</td>\n",
" <td> 10</td>\n",
" <td> 15</td>\n",
" <td> 10</td>\n",
" <td> 4</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>uu</strong></td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 10</td>\n",
" <td> 2</td>\n",
" <td> 12</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>uw</strong></td>\n",
" <td> 0</td>\n",
" <td> 10</td>\n",
" <td> 4</td>\n",
" <td> 8</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 27,
"text": [
"Version 0 1 2 3 4 5 6 7 8 9 A B C D E\n",
"Src \n",
"ak 0 110 144 10 0 0 0 0 0 0 0 0 0 0 0\n",
"ci 35 0 82 3 2 0 0 0 0 0 0 0 0 0 0\n",
"hv 0 14 12 0 1 3 0 0 0 0 0 0 0 0 0\n",
"mb 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0\n",
"nc 77 54 63 26 10 5 1 0 3 1 1 0 0 0 0\n",
"nm 0 0 0 0 0 0 0 0 0 0 5 1 0 0 0\n",
"nn 0 1 0 0 0 0 0 0 0 45 0 0 0 0 0\n",
"pr 28 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
"se 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0\n",
"us 0 0 1 3 12 16 24 10 15 10 4 1 1 1 1\n",
"uu 0 0 10 2 12 1 0 0 0 0 0 0 0 0 0\n",
"uw 0 10 4 8 0 0 0 0 0 0 0 0 0 0 0"
]
}
],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"eData.ix[0:9,'Lat']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 28,
"text": [
"0 34.3583\n",
"1 64.6695\n",
"2 62.1891\n",
"3 37.3921\n",
"4 38.8353\n",
"5 40.9122\n",
"6 61.3245\n",
"7 62.7546\n",
"8 61.1027\n",
"9 33.3077\n",
"Name: Lat"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"eData.ix[0:9,'Lat'] > 40"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 29,
"text": [
"0 False\n",
"1 True\n",
"2 True\n",
"3 False\n",
"4 False\n",
"5 True\n",
"6 True\n",
"7 True\n",
"8 True\n",
"9 False\n",
"Name: Lat"
]
}
],
"prompt_number": 29
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's any()\n",
"(eData.ix[0:9,'Lat'] > 40).any()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 30,
"text": [
"True"
]
}
],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's all()\n",
"(eData.ix[0:9,'Lat'] > 40).all()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 31,
"text": [
"False"
]
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# looking at subsets; very similar to R's & operator\n",
"eData[(eData['Lat'] > 0) & (eData['Lon'] > 0)][['Lat', 'Lon']][:10]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Lat</th>\n",
" <th>Lon</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>12 </strong></td>\n",
" <td> 35.9827</td>\n",
" <td> 139.7463</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>17 </strong></td>\n",
" <td> 36.7675</td>\n",
" <td> 140.4900</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>56 </strong></td>\n",
" <td> 39.8641</td>\n",
" <td> 19.7467</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>87 </strong></td>\n",
" <td> 49.8770</td>\n",
" <td> 150.7514</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>141</strong></td>\n",
" <td> 36.6463</td>\n",
" <td> 96.5500</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>154</strong></td>\n",
" <td> 10.5744</td>\n",
" <td> 126.9572</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>164</strong></td>\n",
" <td> 16.9495</td>\n",
" <td> 96.1267</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>167</strong></td>\n",
" <td> 10.4223</td>\n",
" <td> 126.9224</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>184</strong></td>\n",
" <td> 32.8470</td>\n",
" <td> 94.7063</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>197</strong></td>\n",
" <td> 38.7662</td>\n",
" <td> 23.3660</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 32,
"text": [
" Lat Lon\n",
"12 35.9827 139.7463\n",
"17 36.7675 140.4900\n",
"56 39.8641 19.7467\n",
"87 49.8770 150.7514\n",
"141 36.6463 96.5500\n",
"154 10.5744 126.9572\n",
"164 16.9495 96.1267\n",
"167 10.4223 126.9224\n",
"184 32.8470 94.7063\n",
"197 38.7662 23.3660"
]
}
],
"prompt_number": 32
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# looking at subsets; very similar to R's | operator\n",
"eData[(eData['Lat'] > 0) | (eData['Lon'] > 0)][['Lat', 'Lon']][-10:]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Lat</th>\n",
" <th>Lon</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>881</strong></td>\n",
" <td> 38.8237</td>\n",
" <td>-122.7510</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>882</strong></td>\n",
" <td> 36.5673</td>\n",
" <td>-121.1177</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>883</strong></td>\n",
" <td> 38.1566</td>\n",
" <td>-118.0268</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>884</strong></td>\n",
" <td> 56.9073</td>\n",
" <td>-157.7617</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>885</strong></td>\n",
" <td> 59.8094</td>\n",
" <td>-152.6787</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>886</strong></td>\n",
" <td> 38.8283</td>\n",
" <td>-122.8507</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>887</strong></td>\n",
" <td> 61.2522</td>\n",
" <td>-149.6330</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>888</strong></td>\n",
" <td> 53.0663</td>\n",
" <td>-166.7094</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>889</strong></td>\n",
" <td> 38.1543</td>\n",
" <td>-118.0146</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>890</strong></td>\n",
" <td> 32.5852</td>\n",
" <td>-115.7117</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 33,
"text": [
" Lat Lon\n",
"881 38.8237 -122.7510\n",
"882 36.5673 -121.1177\n",
"883 38.1566 -118.0268\n",
"884 56.9073 -157.7617\n",
"885 59.8094 -152.6787\n",
"886 38.8283 -122.8507\n",
"887 61.2522 -149.6330\n",
"888 53.0663 -166.7094\n",
"889 38.1543 -118.0146\n",
"890 32.5852 -115.7117"
]
}
],
"prompt_number": 33
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Peer review experiment data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'\n",
"fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'\n",
"\n",
"reviews = pd.read_csv(fileUrl1)\n",
"solutions = pd.read_csv(fileUrl2)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"reviews.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>solution_id</th>\n",
" <th>reviewer_id</th>\n",
" <th>start</th>\n",
" <th>stop</th>\n",
" <th>time_left</th>\n",
" <th>accept</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 27</td>\n",
" <td> 1304095698</td>\n",
" <td> 1304095758</td>\n",
" <td> 1754</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" <td> 22</td>\n",
" <td> 1304095188</td>\n",
" <td> 1304095206</td>\n",
" <td> 2306</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 35,
"text": [
" id solution_id reviewer_id start stop time_left accept\n",
"0 1 3 27 1304095698 1304095758 1754 1\n",
"1 2 4 22 1304095188 1304095206 2306 1"
]
}
],
"prompt_number": 35
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"solutions.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>problem_id</th>\n",
" <th>subject_id</th>\n",
" <th>start</th>\n",
" <th>stop</th>\n",
" <th>time_left</th>\n",
" <th>answer</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> 1</td>\n",
" <td> 156</td>\n",
" <td> 29</td>\n",
" <td> 1304095119</td>\n",
" <td> 1304095169</td>\n",
" <td> 2343</td>\n",
" <td> B</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> 2</td>\n",
" <td> 269</td>\n",
" <td> 25</td>\n",
" <td> 1304095119</td>\n",
" <td> 1304095183</td>\n",
" <td> 2329</td>\n",
" <td> C</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 36,
"text": [
" id problem_id subject_id start stop time_left answer\n",
"0 1 156 29 1304095119 1304095169 2343 B\n",
"1 2 269 25 1304095119 1304095183 2329 C"
]
}
],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# find if there are missing values; equivalent to R's is.na()\n",
"reviews.ix[0:9,'time_left'].isnull()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 37,
"text": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
"5 False\n",
"6 False\n",
"7 True\n",
"8 False\n",
"9 False\n",
"Name: time_left"
]
}
],
"prompt_number": 37
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"reviews['time_left'].isnull().sum()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 38,
"text": [
"84"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"reviews['time_left'].isnull().value_counts()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 39,
"text": [
"False 115\n",
"True 84"
]
}
],
"prompt_number": 39
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's colSums()\n",
"# note that we must explicitly include missing values with the option skipna=False\n",
"# this is contrary to R, where the default is to *include* NA\n",
"reviews.sum(skipna=False) "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 40,
"text": [
"id 19900\n",
"solution_id 19929\n",
"reviewer_id 5064\n",
"start NaN\n",
"stop NaN\n",
"time_left NaN\n",
"accept NaN"
]
}
],
"prompt_number": 40
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's colMeans()\n",
"# same remark as above: here missing values are by default excluded\n",
"reviews.mean()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 41,
"text": [
"id 1.000000e+02\n",
"solution_id 1.001457e+02\n",
"reviewer_id 2.544724e+01\n",
"start 1.304096e+09\n",
"stop 1.304096e+09\n",
"time_left 1.114287e+03\n",
"accept 6.434783e-01"
]
}
],
"prompt_number": 41
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's rowMeans()\n",
"reviews.mean(axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 42,
"text": [
"0 3.725990e+08\n",
"1 3.725990e+08\n",
"2 3.725990e+08\n",
"3 3.725990e+08\n",
"4 3.725990e+08\n",
"5 3.725990e+08\n",
"6 3.725990e+08\n",
"7 1.300000e+01\n",
"8 3.725990e+08\n",
"9 3.725990e+08\n",
"10 3.725990e+08\n",
"11 3.725990e+08\n",
"12 3.725990e+08\n",
"13 3.725990e+08\n",
"14 3.725990e+08\n",
"...\n",
"184 1.326667e+02\n",
"185 1.333333e+02\n",
"186 1.343333e+02\n",
"187 1.340000e+02\n",
"188 3.725993e+08\n",
"189 1.356667e+02\n",
"190 1.370000e+02\n",
"191 3.725993e+08\n",
"192 1.390000e+02\n",
"193 1.383333e+02\n",
"194 1.366667e+02\n",
"195 1.410000e+02\n",
"196 1.396667e+02\n",
"197 1.420000e+02\n",
"198 1.393333e+02\n",
"Length: 199"
]
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment