Last active
December 11, 2015 16:28
-
-
Save herrfz/4627680 to your computer and use it in GitHub Desktop.
Coursera Data Analysis -- in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "representing_data" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Representing Data in R -- equivalent in Python" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 62 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R's 'character' type is equivalent to Python's str (string)\n", | |
"firstName = 'jeff'\n", | |
"print type(firstName), firstName" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'str'> jeff\n" | |
] | |
} | |
], | |
"prompt_number": 63 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R's 'numeric' type is equivalent to Python's float\n", | |
"heightCM = 188.2\n", | |
"print type(heightCM), heightCM" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'float'> 188.2\n" | |
] | |
} | |
], | |
"prompt_number": 64 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R's 'integer' type is equivalent to Python's int\n", | |
"numberSons = 1\n", | |
"print type(numberSons), numberSons" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'int'> 1\n" | |
] | |
} | |
], | |
"prompt_number": 65 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R's 'logical' type is equivalent to Python's bool (Boolean)\n", | |
"teachingCoursera = True\n", | |
"print type(teachingCoursera), teachingCoursera" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<type 'bool'> True\n" | |
] | |
} | |
], | |
"prompt_number": 66 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R vectors are equivalent to numpy arrays or Python lists (I will use arrays everywhere for consistency)\n", | |
"heights = np.array([188.2, 181.3, 193.4])\n", | |
"print heights\n", | |
"\n", | |
"firstNames = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", | |
"print firstNames" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[ 188.2 181.3 193.4]\n", | |
"['jeff' 'roger' 'andrew' 'brian']\n" | |
] | |
} | |
], | |
"prompt_number": 67 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R's 'list' is equivalent to a dictionary in Python\n", | |
"vector1 = np.array([188.2, 181.3, 193.4])\n", | |
"vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", | |
"myList = dict(heights = vector1, firstNames = vector2)\n", | |
"print myList\n", | |
"\n", | |
"print myList['heights']\n", | |
"print myList['firstNames']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"{'firstNames': array(['jeff', 'roger', 'andrew', 'brian'], \n", | |
" dtype='|S6'), 'heights': array([ 188.2, 181.3, 193.4])}\n", | |
"[ 188.2 181.3 193.4]\n", | |
"['jeff' 'roger' 'andrew' 'brian']\n" | |
] | |
} | |
], | |
"prompt_number": 68 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R matrices are equivalent to two-dimensional numpy arrays\n", | |
"myMatrix = np.array([[1, 2], [3, 4]])\n", | |
"print myMatrix" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[[1 2]\n", | |
" [3 4]]\n" | |
] | |
} | |
], | |
"prompt_number": 69 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R's data frame is equivalent to a pandas DataFrame\n", | |
"# this example does not work because the input arrays have different lengths (3 and 4 elements)\n", | |
"vector1 = np.array([188.2, 181.3, 193.4])\n", | |
"vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", | |
"\n", | |
"# the DataFrame call below raises ValueError: arrays must all be same length\n", | |
"# \n", | |
"myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# data frame -- fixed: both input arrays now have four elements\n", | |
"vector1 = np.array([188.2, 181.3, 193.4, 192.3])\n", | |
"vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", | |
"\n", | |
"myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))\n", | |
"myDataFrame" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>firstNames</th>\n", | |
" <th>heights</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td><strong>0</strong></td>\n", | |
" <td> jeff</td>\n", | |
" <td> 188.2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>1</strong></td>\n", | |
" <td> roger</td>\n", | |
" <td> 181.3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>2</strong></td>\n", | |
" <td> andrew</td>\n", | |
" <td> 193.4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td><strong>3</strong></td>\n", | |
" <td> brian</td>\n", | |
" <td> 192.3</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 70, | |
"text": [ | |
" firstNames heights\n", | |
"0 jeff 188.2\n", | |
"1 roger 181.3\n", | |
"2 andrew 193.4\n", | |
"3 brian 192.3" | |
] | |
} | |
], | |
"prompt_number": 70 | |
}, | |
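{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# factors\n", | |
"# pandas has a Factor class (also called Categorical), but it feels rather clumsy compared to R's factor, and its documentation is scarce\n", | |
"# this is the closest I can get (NOTE: bare NaN is not defined by this notebook's imports -- presumably it came from pylab mode; np.nan is the explicit spelling)\n", | |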
"smoker = np.array(['yes', 'no', 'yes', 'yes'])\n", | |
"s = pd.Factor(NaN, np.unique(smoker))\n", | |
"smokerFactor = s.from_array(smoker)\n", | |
"smokerFactor" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 71, | |
"text": [ | |
"Categorical: \n", | |
"array([yes, no, yes, yes], dtype=object)\n", | |
"Levels (2): Index([no, yes], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 71 | |
}, | |
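{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# R's NA (missing value) is equivalent to NaN (NOTE: bare NaN and isnan are not defined by this notebook's imports -- presumably they came from pylab mode; np.nan and np.isnan are the explicit spellings)\n", | |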
"vector1 = np.array([188.2, 181.3, 193.4, NaN])\n", | |
"print vector1\n", | |
"print isnan(vector1)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[ 188.2 181.3 193.4 nan]\n", | |
"[False False False True]\n" | |
] | |
} | |
], | |
"prompt_number": 72 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# subsetting\n", | |
"vector1 = np.array([188.2, 181.3, 193.4, 192.3])\n", | |
"vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", | |
"\n", | |
"myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))\n", | |
"\n", | |
"print '------------------'\n", | |
"print vector1[0]\n", | |
"print '------------------'\n", | |
"print vector1[[0, 1, 3]]\n", | |
"print '------------------'\n", | |
"print myDataFrame.ix[0, 0:2] # appears transposed as compared to R\n", | |
"print '------------------'\n", | |
"print myDataFrame['firstNames'] # there's no 'Levels' as in R\n", | |
"print '------------------'\n", | |
"print myDataFrame[myDataFrame['firstNames'] == 'jeff']\n", | |
"print '------------------'\n", | |
"print myDataFrame[myDataFrame['heights'] < 190]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"------------------\n", | |
"188.2\n", | |
"------------------\n", | |
"[ 188.2 181.3 192.3]\n", | |
"------------------\n", | |
"firstNames jeff\n", | |
"heights 188.2\n", | |
"Name: 0\n", | |
"------------------\n", | |
"0 jeff\n", | |
"1 roger\n", | |
"2 andrew\n", | |
"3 brian\n", | |
"Name: firstNames\n", | |
"------------------\n", | |
" firstNames heights\n", | |
"0 jeff 188.2\n", | |
"------------------\n", | |
" firstNames heights\n", | |
"0 jeff 188.2\n", | |
"1 roger 181.3\n" | |
] | |
} | |
], | |
"prompt_number": 73 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": true, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment