Skip to content

Instantly share code, notes, and snippets.

@herrfz
Last active December 11, 2015 16:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save herrfz/4627680 to your computer and use it in GitHub Desktop.
Save herrfz/4627680 to your computer and use it in GitHub Desktop.
Coursera Data Analysis -- in Python
{
"metadata": {
"name": "representing_data"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Representing Data in R -- equivalent in Python"
]
},
{
"cell_type": "code",
"collapsed": true,
"input": [
"import pandas as pd\n",
"import numpy as np"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 62
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R 'character' values correspond to Python strings (str)\n",
"firstName = 'jeff'\n",
"# show the Python type next to the value\n",
"print('%s %s' % (type(firstName), firstName))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<type 'str'> jeff\n"
]
}
],
"prompt_number": 63
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R 'numeric' values correspond to Python floats\n",
"heightCM = 188.2\n",
"# show the Python type next to the value\n",
"print('%s %s' % (type(heightCM), heightCM))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<type 'float'> 188.2\n"
]
}
],
"prompt_number": 64
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R integers correspond to Python integers (int)\n",
"numberSons = 1\n",
"# show the Python type next to the value\n",
"print('%s %s' % (type(numberSons), numberSons))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<type 'int'> 1\n"
]
}
],
"prompt_number": 65
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R 'logical' values correspond to Python Booleans (bool)\n",
"teachingCoursera = True\n",
"# show the Python type next to the value\n",
"print('%s %s' % (type(teachingCoursera), teachingCoursera))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<type 'bool'> True\n"
]
}
],
"prompt_number": 66
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R 'vectors' correspond to numpy arrays or Python lists\n",
"# (arrays are used everywhere in this notebook for consistency)\n",
"heights = np.array([188.2, 181.3, 193.4])\n",
"firstNames = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
"\n",
"# numeric vector first, then the string vector\n",
"print(heights)\n",
"print(firstNames)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[ 188.2 181.3 193.4]\n",
"['jeff' 'roger' 'andrew' 'brian']\n"
]
}
],
"prompt_number": 67
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# An R 'list' corresponds to a Python dictionary\n",
"vector1 = np.array([188.2, 181.3, 193.4])\n",
"vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
"\n",
"# the two arrays have different lengths; a dictionary accepts that\n",
"myList = dict(heights=vector1, firstNames=vector2)\n",
"print(myList)\n",
"\n",
"# individual entries are looked up by key\n",
"print(myList['heights'])\n",
"print(myList['firstNames'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"{'firstNames': array(['jeff', 'roger', 'andrew', 'brian'], \n",
" dtype='|S6'), 'heights': array([ 188.2, 181.3, 193.4])}\n",
"[ 188.2 181.3 193.4]\n",
"['jeff' 'roger' 'andrew' 'brian']\n"
]
}
],
"prompt_number": 68
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# An R 'matrix' corresponds to a two-dimensional numpy array\n",
"# (built here from a list of rows)\n",
"myMatrix = np.array([[1, 2],\n",
"                     [3, 4]])\n",
"print(myMatrix)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[[1 2]\n",
" [3 4]]\n"
]
}
],
"prompt_number": 69
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# An R data frame corresponds to a pandas DataFrame.\n",
"# This example is expected to fail because the input arrays do not have the same\n",
"# length: vector1 has three elements, vector2 has four.\n",
"vector1 = np.array([188.2, 181.3, 193.4])\n",
"vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
"\n",
"# The failure was originally recorded as 'ValueError: arrays must all be same length'.\n",
"# It is caught and printed here so that the demonstration stays visible without\n",
"# aborting a top-to-bottom run of the notebook; any other exception still propagates.\n",
"try:\n",
"    myDataFrame = pd.DataFrame(dict(heights=vector1, firstNames=vector2))\n",
"except ValueError as err:\n",
"    print('ValueError: %s' % err)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# data frame -- fixed: both input arrays now hold four entries\n",
"vector1 = np.array([188.2, 181.3, 193.4, 192.3])\n",
"vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
"\n",
"# one column per keyword; the bare name on the last line displays the frame\n",
"myDataFrame = pd.DataFrame(dict(heights=vector1,\n",
"                                firstNames=vector2))\n",
"myDataFrame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>firstNames</th>\n",
" <th>heights</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> jeff</td>\n",
" <td> 188.2</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> roger</td>\n",
" <td> 181.3</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>2</strong></td>\n",
" <td> andrew</td>\n",
" <td> 193.4</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>3</strong></td>\n",
" <td> brian</td>\n",
" <td> 192.3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 70,
"text": [
" firstNames heights\n",
"0 jeff 188.2\n",
"1 roger 181.3\n",
"2 andrew 193.4\n",
"3 brian 192.3"
]
}
],
"prompt_number": 70
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# factors\n",
"# Pandas has a Factor class (also called Categorical), but it feels rather clumsy compared to R, and documentation is scarce\n",
"# this is the closest I can get\n",
"# NOTE(review): the displayed result is a Categorical; newer pandas releases may no longer\n",
"# provide the Factor alias -- confirm against the installed version\n",
"smoker = np.array(['yes', 'no', 'yes', 'yes'])\n",
"\n",
"# throwaway instance: only its from_array method is used on the next line.\n",
"# np.nan is spelled out because a bare NaN is not defined by this notebook's imports\n",
"# (pandas as pd, numpy as np).\n",
"s = pd.Factor(np.nan, np.unique(smoker))\n",
"smokerFactor = s.from_array(smoker)\n",
"smokerFactor"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 71,
"text": [
"Categorical: \n",
"array([yes, no, yes, yes], dtype=object)\n",
"Levels (2): Index([no, yes], dtype=object)"
]
}
],
"prompt_number": 71
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R's NA (missing value) corresponds to NaN\n",
"# np.nan and np.isnan are used explicitly: the bare names NaN and isnan are not defined by\n",
"# this notebook's imports (pandas as pd, numpy as np).\n",
"vector1 = np.array([188.2, 181.3, 193.4, np.nan])\n",
"print(vector1)\n",
"\n",
"# element-wise test for missing values\n",
"print(np.isnan(vector1))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[ 188.2 181.3 193.4 nan]\n",
"[False False False True]\n"
]
}
],
"prompt_number": 72
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# subsetting\n",
"vector1 = np.array([188.2, 181.3, 193.4, 192.3])\n",
"vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
"myDataFrame = pd.DataFrame(dict(heights=vector1, firstNames=vector2))\n",
"\n",
"# a single element of the array\n",
"print('------------------')\n",
"print(vector1[0])\n",
"\n",
"# several elements, selected with a list of positions\n",
"print('------------------')\n",
"print(vector1[[0, 1, 3]])\n",
"\n",
"# row 0, columns 0:2 -- appears transposed as compared to R\n",
"# NOTE(review): .ix is deprecated/removed in later pandas releases, where .iloc[0, 0:2]\n",
"# is the positional equivalent -- confirm against the target pandas version\n",
"print('------------------')\n",
"print(myDataFrame.ix[0, 0:2])\n",
"\n",
"# one column selected by name -- there's no 'Levels' as in R\n",
"print('------------------')\n",
"print(myDataFrame['firstNames'])\n",
"\n",
"# rows filtered with a boolean condition on a column\n",
"print('------------------')\n",
"print(myDataFrame[myDataFrame['firstNames'] == 'jeff'])\n",
"print('------------------')\n",
"print(myDataFrame[myDataFrame['heights'] < 190])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"------------------\n",
"188.2\n",
"------------------\n",
"[ 188.2 181.3 192.3]\n",
"------------------\n",
"firstNames jeff\n",
"heights 188.2\n",
"Name: 0\n",
"------------------\n",
"0 jeff\n",
"1 roger\n",
"2 andrew\n",
"3 brian\n",
"Name: firstNames\n",
"------------------\n",
" firstNames heights\n",
"0 jeff 188.2\n",
"------------------\n",
" firstNames heights\n",
"0 jeff 188.2\n",
"1 roger 181.3\n"
]
}
],
"prompt_number": 73
},
{
"cell_type": "code",
"collapsed": true,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment