Skip to content

Instantly share code, notes, and snippets.

@herrfz
Created January 31, 2013 21:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save herrfz/4686684 to your computer and use it in GitHub Desktop.
Save herrfz/4686684 to your computer and use it in GitHub Desktop.
Coursera Data Analysis -- in Python
{
"metadata": {
"name": "data_munging_basics"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Data munging basics -- in Python"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"import numpy as np"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Camera data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cameraData = pd.read_csv('./data/cameras.csv')\n",
"cameraData.columns\n",
"\n",
"# somehow the column is named 'Location 1' instead of 'Location.1'"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 7,
"text": [
"Index([address, direction, street, crossStreet, intersection, Location 1], dtype=object)"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# transform column names to lowercase\n",
"# equivalent to R's tolower()\n",
"cameraData.columns = cameraData.columns.map(lambda x: x.lower())\n",
"cameraData.columns"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 8,
"text": [
"Index([address, direction, street, crossstreet, intersection, location 1], dtype=object)"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# string split; split a string into list of strings\n",
"# equivalent to R's strsplit()\n",
"splitNames = cameraData.columns.map(lambda x: x.split(' '))\n",
"splitNames[4]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 9,
"text": [
"['intersection']"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"splitNames[5]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 10,
"text": [
"['location', '1']"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"splitNames[5][0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 11,
"text": [
"'location'"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# apply function to names\n",
"cameraData.columns.map(lambda x: x.split(' ')[0])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 12,
"text": [
"array([address, direction, street, crossstreet, intersection, location], dtype=object)"
]
}
],
"prompt_number": 12
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Peer review experiment data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'\n",
"fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'\n",
"\n",
"reviews = pd.read_csv(fileUrl1)\n",
"solutions = pd.read_csv(fileUrl2)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"reviews.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>solution_id</th>\n",
" <th>reviewer_id</th>\n",
" <th>start</th>\n",
" <th>stop</th>\n",
" <th>time_left</th>\n",
" <th>accept</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 27</td>\n",
" <td> 1304095698</td>\n",
" <td> 1304095758</td>\n",
" <td> 1754</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" <td> 22</td>\n",
" <td> 1304095188</td>\n",
" <td> 1304095206</td>\n",
" <td> 2306</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 14,
"text": [
" id solution_id reviewer_id start stop time_left accept\n",
"0 1 3 27 1304095698 1304095758 1754 1\n",
"1 2 4 22 1304095188 1304095206 2306 1"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"solutions.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>problem_id</th>\n",
" <th>subject_id</th>\n",
" <th>start</th>\n",
" <th>stop</th>\n",
" <th>time_left</th>\n",
" <th>answer</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> 1</td>\n",
" <td> 156</td>\n",
" <td> 29</td>\n",
" <td> 1304095119</td>\n",
" <td> 1304095169</td>\n",
" <td> 2343</td>\n",
" <td> B</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> 2</td>\n",
" <td> 269</td>\n",
" <td> 25</td>\n",
" <td> 1304095119</td>\n",
" <td> 1304095183</td>\n",
" <td> 2329</td>\n",
" <td> C</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 15,
"text": [
" id problem_id subject_id start stop time_left answer\n",
"0 1 156 29 1304095119 1304095169 2343 B\n",
"1 2 269 25 1304095119 1304095183 2329 C"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"reviews.columns"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 16,
"text": [
"Index([id, solution_id, reviewer_id, start, stop, time_left, accept], dtype=object)"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# remove underscores\n",
"# equivalent to R's sub()\n",
"reviews.columns = reviews.columns.map(lambda x: x.replace('_', ''))\n",
"solutions.columns = solutions.columns.map(lambda x: x.replace('_', ''))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# test; every occurence is replaced\n",
"# in contrast to R's sub(), where only one occurence is replaced\n",
"testName = 'this_is_a_test'\n",
"testName.replace('_', '')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 18,
"text": [
"'thisisatest'"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"reviews['timeleft'][:10]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 19,
"text": [
"0 1754\n",
"1 2306\n",
"2 2192\n",
"3 2089\n",
"4 2043\n",
"5 1999\n",
"6 2130\n",
"7 NaN\n",
"8 1899\n",
"9 2024\n",
"Name: timeleft"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# cut into ranges\n",
"# this is equivalent to R's cut()\n",
"timeRanges = pd.cut(reviews['timeleft'], range(0, 4000, 600)) # note it needs to go up to 4000 in order to include 3600\n",
"timeRanges[:10]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 20,
"text": [
"Categorical: \n",
"array([(1200, 1800], (1800, 2400], (1800, 2400], (1800, 2400],\n",
" (1800, 2400], (1800, 2400], (1800, 2400], nan, (1800, 2400],\n",
" (1800, 2400]], dtype=object)\n",
"Levels (6): Index([(0, 600], (600, 1200], (1200, 1800], (1800, 2400],\n",
" (2400, 3000], (3000, 3600]], dtype=object)"
]
}
],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"type(timeRanges)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 21,
"text": [
"pandas.core.categorical.Categorical"
]
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pd.value_counts(timeRanges)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 22,
"text": [
"(600, 1200] 32\n",
"(0, 600] 30\n",
"(1800, 2400] 28\n",
"(1200, 1800] 25"
]
}
],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's cut2()\n",
"timeRanges = pd.cut(reviews['timeleft'], 6)\n",
"pd.value_counts(timeRanges) # note that NaN values are excluded; the resulting ranges are thus different from video"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 23,
"text": [
"(19.716, 402.667] 22\n",
"(1164, 1544.667] 20\n",
"(783.333, 1164] 19\n",
"(1544.667, 1925.333] 19\n",
"(402.667, 783.333] 19\n",
"(1925.333, 2306] 16"
]
}
],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# adding an extra variable\n",
"# similarly to R, simply assign to a new column\n",
"reviews['timeRanges'] = timeRanges\n",
"reviews.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>solutionid</th>\n",
" <th>reviewerid</th>\n",
" <th>start</th>\n",
" <th>stop</th>\n",
" <th>timeleft</th>\n",
" <th>accept</th>\n",
" <th>timeRanges</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 27</td>\n",
" <td> 1304095698</td>\n",
" <td> 1304095758</td>\n",
" <td> 1754</td>\n",
" <td> 1</td>\n",
" <td> (1544.667, 1925.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" <td> 22</td>\n",
" <td> 1304095188</td>\n",
" <td> 1304095206</td>\n",
" <td> 2306</td>\n",
" <td> 1</td>\n",
" <td> (1925.333, 2306]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 24,
"text": [
" id solutionid reviewerid start stop timeleft accept \\\n",
"0 1 3 27 1304095698 1304095758 1754 1 \n",
"1 2 4 22 1304095188 1304095206 2306 1 \n",
"\n",
" timeRanges \n",
"0 (1544.667, 1925.333] \n",
"1 (1925.333, 2306] "
]
}
],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# merging data\n",
"print reviews.columns\n",
"print solutions.columns"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Index([id, solutionid, reviewerid, start, stop, timeleft, accept, timeRanges], dtype=object)\n",
"Index([id, problemid, subjectid, start, stop, timeleft, answer], dtype=object)\n"
]
}
],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's merge()\n",
"mergedData2 = pd.merge(reviews, solutions, left_on='solutionid', right_on='id', sort=True)\n",
"mergedData2.ix[:,0:6].head(3)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>solutionid</th>\n",
" <th>reviewerid</th>\n",
" <th>start_x</th>\n",
" <th>stop_x</th>\n",
" <th>timeleft_x</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> 4</td>\n",
" <td> 1</td>\n",
" <td> 26</td>\n",
" <td> 1304095267</td>\n",
" <td> 1304095423</td>\n",
" <td> 2089</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> 6</td>\n",
" <td> 2</td>\n",
" <td> 29</td>\n",
" <td> 1304095471</td>\n",
" <td> 1304095513</td>\n",
" <td> 1999</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>2</strong></td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 27</td>\n",
" <td> 1304095698</td>\n",
" <td> 1304095758</td>\n",
" <td> 1754</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 26,
"text": [
" id_x solutionid reviewerid start_x stop_x timeleft_x\n",
"0 4 1 26 1304095267 1304095423 2089\n",
"1 6 2 29 1304095471 1304095513 1999\n",
"2 1 3 27 1304095698 1304095758 1754"
]
}
],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"reviews.ix[0,0:6]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 27,
"text": [
"id 1\n",
"solutionid 3\n",
"reviewerid 27\n",
"start 1.304096e+09\n",
"stop 1.304096e+09\n",
"timeleft 1754\n",
"Name: 0"
]
}
],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sorting values\n",
"mergedData2['reviewerid'][:10]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 28,
"text": [
"0 26\n",
"1 29\n",
"2 27\n",
"3 22\n",
"4 28\n",
"5 22\n",
"6 29\n",
"7 23\n",
"8 25\n",
"9 29\n",
"Name: reviewerid"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's sort()\n",
"mergedData2['reviewerid'].order()[:10]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 29,
"text": [
"3 22\n",
"5 22\n",
"13 22\n",
"21 22\n",
"22 22\n",
"23 22\n",
"26 22\n",
"31 22\n",
"36 22\n",
"38 22\n",
"Name: reviewerid"
]
}
],
"prompt_number": 29
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# R's 'order' can be implemented by getting the index of the sorted Series\n",
"mergedData2['reviewerid'][mergedData2['reviewerid'].order().index][:10]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 30,
"text": [
"3 22\n",
"5 22\n",
"13 22\n",
"21 22\n",
"22 22\n",
"23 22\n",
"26 22\n",
"31 22\n",
"36 22\n",
"38 22\n",
"Name: reviewerid"
]
}
],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# reordering a DataFrame\n",
"mergedData2.ix[:,0:6].head(3)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>solutionid</th>\n",
" <th>reviewerid</th>\n",
" <th>start_x</th>\n",
" <th>stop_x</th>\n",
" <th>timeleft_x</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> 4</td>\n",
" <td> 1</td>\n",
" <td> 26</td>\n",
" <td> 1304095267</td>\n",
" <td> 1304095423</td>\n",
" <td> 2089</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> 6</td>\n",
" <td> 2</td>\n",
" <td> 29</td>\n",
" <td> 1304095471</td>\n",
" <td> 1304095513</td>\n",
" <td> 1999</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>2</strong></td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 27</td>\n",
" <td> 1304095698</td>\n",
" <td> 1304095758</td>\n",
" <td> 1754</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 31,
"text": [
" id_x solutionid reviewerid start_x stop_x timeleft_x\n",
"0 4 1 26 1304095267 1304095423 2089\n",
"1 6 2 29 1304095471 1304095513 1999\n",
"2 1 3 27 1304095698 1304095758 1754"
]
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sort by reviewerid\n",
"sortedData = mergedData2.ix[mergedData2['reviewerid'].order().index]\n",
"sortedData.ix[:,0:6].head(3)\n",
"\n",
"# however, it's also possible to do: mergedData2.sort(['reviewerid'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>solutionid</th>\n",
" <th>reviewerid</th>\n",
" <th>start_x</th>\n",
" <th>stop_x</th>\n",
" <th>timeleft_x</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>3 </strong></td>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" <td> 22</td>\n",
" <td> 1304095188</td>\n",
" <td> 1304095206</td>\n",
" <td> 2306</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>5 </strong></td>\n",
" <td> 16</td>\n",
" <td> 6</td>\n",
" <td> 22</td>\n",
" <td> 1304095303</td>\n",
" <td> 1304095471</td>\n",
" <td> 2041</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>13</strong></td>\n",
" <td> 12</td>\n",
" <td> 14</td>\n",
" <td> 22</td>\n",
" <td> 1304095280</td>\n",
" <td> 1304095301</td>\n",
" <td> 2211</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 32,
"text": [
" id_x solutionid reviewerid start_x stop_x timeleft_x\n",
"3 2 4 22 1304095188 1304095206 2306\n",
"5 16 6 22 1304095303 1304095471 2041\n",
"13 12 14 22 1304095280 1304095301 2211"
]
}
],
"prompt_number": 32
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# reordering by multiple columns\n",
"sortedData = mergedData2.sort(['reviewerid', 'id_x'])\n",
"sortedData.ix[:,0:6].head(3)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>solutionid</th>\n",
" <th>reviewerid</th>\n",
" <th>start_x</th>\n",
" <th>stop_x</th>\n",
" <th>timeleft_x</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>3 </strong></td>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" <td> 22</td>\n",
" <td> 1304095188</td>\n",
" <td> 1304095206</td>\n",
" <td> 2306</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>13</strong></td>\n",
" <td> 12</td>\n",
" <td> 14</td>\n",
" <td> 22</td>\n",
" <td> 1304095280</td>\n",
" <td> 1304095301</td>\n",
" <td> 2211</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>5 </strong></td>\n",
" <td> 16</td>\n",
" <td> 6</td>\n",
" <td> 22</td>\n",
" <td> 1304095303</td>\n",
" <td> 1304095471</td>\n",
" <td> 2041</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 33,
"text": [
" id_x solutionid reviewerid start_x stop_x timeleft_x\n",
"3 2 4 22 1304095188 1304095206 2306\n",
"13 12 14 22 1304095280 1304095301 2211\n",
"5 16 6 22 1304095303 1304095471 2041"
]
}
],
"prompt_number": 33
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"reshaping data - example"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"misShaped = pd.DataFrame({'treatmentA' : [NaN, 1, 2], 'treatmentB' : [5, 4, 3]})\n",
"misShaped['people'] = ['John', 'Jane', 'Mary']\n",
"misShaped"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>treatmentA</th>\n",
" <th>treatmentB</th>\n",
" <th>people</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td>NaN</td>\n",
" <td> 5</td>\n",
" <td> John</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> 1</td>\n",
" <td> 4</td>\n",
" <td> Jane</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>2</strong></td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" <td> Mary</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 34,
"text": [
" treatmentA treatmentB people\n",
"0 NaN 5 John\n",
"1 1 4 Jane\n",
"2 2 3 Mary"
]
}
],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# equivalent to R's melt()\n",
"pd.melt(misShaped, id_vars='people')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>people</th>\n",
" <th>variable</th>\n",
" <th>value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td><strong>0</strong></td>\n",
" <td> John</td>\n",
" <td> treatmentA</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>1</strong></td>\n",
" <td> Jane</td>\n",
" <td> treatmentA</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>2</strong></td>\n",
" <td> Mary</td>\n",
" <td> treatmentA</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>3</strong></td>\n",
" <td> John</td>\n",
" <td> treatmentB</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>4</strong></td>\n",
" <td> Jane</td>\n",
" <td> treatmentB</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <td><strong>5</strong></td>\n",
" <td> Mary</td>\n",
" <td> treatmentB</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 35,
"text": [
" people variable value\n",
"0 John treatmentA NaN\n",
"1 Jane treatmentA 1\n",
"2 Mary treatmentA 2\n",
"3 John treatmentB 5\n",
"4 Jane treatmentB 4\n",
"5 Mary treatmentB 3"
]
}
],
"prompt_number": 35
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment