herrfz/Data Munging Basics

## Data Munging Basics
{
 "metadata": {
  "name": "data_munging_basics"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Data munging basics -- in Python"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import numpy as np"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Camera data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cameraData = pd.read_csv('./data/cameras.csv')\n",
      "cameraData.columns\n",
      "\n",
      "# somehow the column is named 'Location 1' instead of 'Location.1'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "Index([address, direction, street, crossStreet, intersection, Location 1], dtype=object)"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# transform column names to lowercase\n",
      "# equivalent to R's tolower()\n",
      "cameraData.columns = cameraData.columns.map(lambda x: x.lower())\n",
      "cameraData.columns"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "Index([address, direction, street, crossstreet, intersection, location 1], dtype=object)"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# string split; split a string into list of strings\n",
      "# equivalent to R's strsplit()\n",
      "splitNames = cameraData.columns.map(lambda x: x.split(' '))\n",
      "splitNames[4]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 9,
       "text": [
        "['intersection']"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "splitNames[5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 10,
       "text": [
        "['location', '1']"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "splitNames[5][0]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 11,
       "text": [
        "'location'"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# apply function to names\n",
      "cameraData.columns.map(lambda x: x.split(' ')[0])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 12,
       "text": [
        "array([address, direction, street, crossstreet, intersection, location], dtype=object)"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Peer review experiment data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'\n",
      "fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'\n",
      "\n",
      "reviews = pd.read_csv(fileUrl1)\n",
      "solutions = pd.read_csv(fileUrl2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "reviews.head(2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id</th>\n",
        "      <th>solution_id</th>\n",
        "      <th>reviewer_id</th>\n",
        "      <th>start</th>\n",
        "      <th>stop</th>\n",
        "      <th>time_left</th>\n",
        "      <th>accept</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td> 1</td>\n",
        "      <td> 3</td>\n",
        "      <td> 27</td>\n",
        "      <td> 1304095698</td>\n",
        "      <td> 1304095758</td>\n",
        "      <td> 1754</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td> 2</td>\n",
        "      <td> 4</td>\n",
        "      <td> 22</td>\n",
        "      <td> 1304095188</td>\n",
        "      <td> 1304095206</td>\n",
        "      <td> 2306</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 14,
       "text": [
        "   id  solution_id  reviewer_id       start        stop  time_left  accept\n",
        "0   1            3           27  1304095698  1304095758       1754       1\n",
        "1   2            4           22  1304095188  1304095206       2306       1"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "solutions.head(2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id</th>\n",
        "      <th>problem_id</th>\n",
        "      <th>subject_id</th>\n",
        "      <th>start</th>\n",
        "      <th>stop</th>\n",
        "      <th>time_left</th>\n",
        "      <th>answer</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td> 1</td>\n",
        "      <td> 156</td>\n",
        "      <td> 29</td>\n",
        "      <td> 1304095119</td>\n",
        "      <td> 1304095169</td>\n",
        "      <td> 2343</td>\n",
        "      <td> B</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td> 2</td>\n",
        "      <td> 269</td>\n",
        "      <td> 25</td>\n",
        "      <td> 1304095119</td>\n",
        "      <td> 1304095183</td>\n",
        "      <td> 2329</td>\n",
        "      <td> C</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 15,
       "text": [
        "   id  problem_id  subject_id       start        stop  time_left answer\n",
        "0   1         156          29  1304095119  1304095169       2343      B\n",
        "1   2         269          25  1304095119  1304095183       2329      C"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "reviews.columns"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 16,
       "text": [
        "Index([id, solution_id, reviewer_id, start, stop, time_left, accept], dtype=object)"
       ]
      }
     ],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# remove underscores\n",
      "# equivalent to R's sub()\n",
      "reviews.columns = reviews.columns.map(lambda x: x.replace('_', ''))\n",
      "solutions.columns = solutions.columns.map(lambda x: x.replace('_', ''))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# test; every occurence is replaced\n",
      "# in contrast to R's sub(), where only one occurence is replaced\n",
      "testName = 'this_is_a_test'\n",
      "testName.replace('_', '')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 18,
       "text": [
        "'thisisatest'"
       ]
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "reviews['timeleft'][:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 19,
       "text": [
        "0    1754\n",
        "1    2306\n",
        "2    2192\n",
        "3    2089\n",
        "4    2043\n",
        "5    1999\n",
        "6    2130\n",
        "7     NaN\n",
        "8    1899\n",
        "9    2024\n",
        "Name: timeleft"
       ]
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# cut into ranges\n",
      "# this is equivalent to R's cut()\n",
      "timeRanges = pd.cut(reviews['timeleft'], range(0, 4000, 600)) # note it needs to go up to 4000 in order to include 3600\n",
      "timeRanges[:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 20,
       "text": [
        "Categorical: \n",
        "array([(1200, 1800], (1800, 2400], (1800, 2400], (1800, 2400],\n",
        "       (1800, 2400], (1800, 2400], (1800, 2400], nan, (1800, 2400],\n",
        "       (1800, 2400]], dtype=object)\n",
        "Levels (6): Index([(0, 600], (600, 1200], (1200, 1800], (1800, 2400],\n",
        "                   (2400, 3000], (3000, 3600]], dtype=object)"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "type(timeRanges)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 21,
       "text": [
        "pandas.core.categorical.Categorical"
       ]
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pd.value_counts(timeRanges)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 22,
       "text": [
        "(600, 1200]     32\n",
        "(0, 600]        30\n",
        "(1800, 2400]    28\n",
        "(1200, 1800]    25"
       ]
      }
     ],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# equivalent to R's cut2()\n",
      "timeRanges = pd.cut(reviews['timeleft'], 6)\n",
      "pd.value_counts(timeRanges) # note that NaN values are excluded; the resulting ranges are thus different from video"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 23,
       "text": [
        "(19.716, 402.667]       22\n",
        "(1164, 1544.667]        20\n",
        "(783.333, 1164]         19\n",
        "(1544.667, 1925.333]    19\n",
        "(402.667, 783.333]      19\n",
        "(1925.333, 2306]        16"
       ]
      }
     ],
     "prompt_number": 23
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# adding an extra variable\n",
      "# similarly to R, simply assign to a new column\n",
      "reviews['timeRanges'] = timeRanges\n",
      "reviews.head(2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id</th>\n",
        "      <th>solutionid</th>\n",
        "      <th>reviewerid</th>\n",
        "      <th>start</th>\n",
        "      <th>stop</th>\n",
        "      <th>timeleft</th>\n",
        "      <th>accept</th>\n",
        "      <th>timeRanges</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td> 1</td>\n",
        "      <td> 3</td>\n",
        "      <td> 27</td>\n",
        "      <td> 1304095698</td>\n",
        "      <td> 1304095758</td>\n",
        "      <td> 1754</td>\n",
        "      <td> 1</td>\n",
        "      <td> (1544.667, 1925.333]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td> 2</td>\n",
        "      <td> 4</td>\n",
        "      <td> 22</td>\n",
        "      <td> 1304095188</td>\n",
        "      <td> 1304095206</td>\n",
        "      <td> 2306</td>\n",
        "      <td> 1</td>\n",
        "      <td>     (1925.333, 2306]</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 24,
       "text": [
        "   id  solutionid  reviewerid       start        stop  timeleft  accept  \\\n",
        "0   1           3          27  1304095698  1304095758      1754       1   \n",
        "1   2           4          22  1304095188  1304095206      2306       1   \n",
        "\n",
        "             timeRanges  \n",
        "0  (1544.667, 1925.333]  \n",
        "1      (1925.333, 2306]  "
       ]
      }
     ],
     "prompt_number": 24
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# merging data\n",
      "print reviews.columns\n",
      "print solutions.columns"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Index([id, solutionid, reviewerid, start, stop, timeleft, accept, timeRanges], dtype=object)\n",
        "Index([id, problemid, subjectid, start, stop, timeleft, answer], dtype=object)\n"
       ]
      }
     ],
     "prompt_number": 25
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# equivalent to R's merge()\n",
      "mergedData2 = pd.merge(reviews, solutions, left_on='solutionid', right_on='id', sort=True)\n",
      "mergedData2.ix[:,0:6].head(3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id_x</th>\n",
        "      <th>solutionid</th>\n",
        "      <th>reviewerid</th>\n",
        "      <th>start_x</th>\n",
        "      <th>stop_x</th>\n",
        "      <th>timeleft_x</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td> 4</td>\n",
        "      <td> 1</td>\n",
        "      <td> 26</td>\n",
        "      <td> 1304095267</td>\n",
        "      <td> 1304095423</td>\n",
        "      <td> 2089</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td> 6</td>\n",
        "      <td> 2</td>\n",
        "      <td> 29</td>\n",
        "      <td> 1304095471</td>\n",
        "      <td> 1304095513</td>\n",
        "      <td> 1999</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>2</strong></td>\n",
        "      <td> 1</td>\n",
        "      <td> 3</td>\n",
        "      <td> 27</td>\n",
        "      <td> 1304095698</td>\n",
        "      <td> 1304095758</td>\n",
        "      <td> 1754</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 26,
       "text": [
        "   id_x  solutionid  reviewerid     start_x      stop_x  timeleft_x\n",
        "0     4           1          26  1304095267  1304095423        2089\n",
        "1     6           2          29  1304095471  1304095513        1999\n",
        "2     1           3          27  1304095698  1304095758        1754"
       ]
      }
     ],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "reviews.ix[0,0:6]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 27,
       "text": [
        "id                       1\n",
        "solutionid               3\n",
        "reviewerid              27\n",
        "start         1.304096e+09\n",
        "stop          1.304096e+09\n",
        "timeleft              1754\n",
        "Name: 0"
       ]
      }
     ],
     "prompt_number": 27
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# sorting values\n",
      "mergedData2['reviewerid'][:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 28,
       "text": [
        "0    26\n",
        "1    29\n",
        "2    27\n",
        "3    22\n",
        "4    28\n",
        "5    22\n",
        "6    29\n",
        "7    23\n",
        "8    25\n",
        "9    29\n",
        "Name: reviewerid"
       ]
      }
     ],
     "prompt_number": 28
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# equivalent to R's sort()\n",
      "mergedData2['reviewerid'].order()[:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 29,
       "text": [
        "3     22\n",
        "5     22\n",
        "13    22\n",
        "21    22\n",
        "22    22\n",
        "23    22\n",
        "26    22\n",
        "31    22\n",
        "36    22\n",
        "38    22\n",
        "Name: reviewerid"
       ]
      }
     ],
     "prompt_number": 29
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# R's 'order' can be implemented by getting the index of the sorted Series\n",
      "mergedData2['reviewerid'][mergedData2['reviewerid'].order().index][:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 30,
       "text": [
        "3     22\n",
        "5     22\n",
        "13    22\n",
        "21    22\n",
        "22    22\n",
        "23    22\n",
        "26    22\n",
        "31    22\n",
        "36    22\n",
        "38    22\n",
        "Name: reviewerid"
       ]
      }
     ],
     "prompt_number": 30
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# reordering a DataFrame\n",
      "mergedData2.ix[:,0:6].head(3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id_x</th>\n",
        "      <th>solutionid</th>\n",
        "      <th>reviewerid</th>\n",
        "      <th>start_x</th>\n",
        "      <th>stop_x</th>\n",
        "      <th>timeleft_x</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td> 4</td>\n",
        "      <td> 1</td>\n",
        "      <td> 26</td>\n",
        "      <td> 1304095267</td>\n",
        "      <td> 1304095423</td>\n",
        "      <td> 2089</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td> 6</td>\n",
        "      <td> 2</td>\n",
        "      <td> 29</td>\n",
        "      <td> 1304095471</td>\n",
        "      <td> 1304095513</td>\n",
        "      <td> 1999</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>2</strong></td>\n",
        "      <td> 1</td>\n",
        "      <td> 3</td>\n",
        "      <td> 27</td>\n",
        "      <td> 1304095698</td>\n",
        "      <td> 1304095758</td>\n",
        "      <td> 1754</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 31,
       "text": [
        "   id_x  solutionid  reviewerid     start_x      stop_x  timeleft_x\n",
        "0     4           1          26  1304095267  1304095423        2089\n",
        "1     6           2          29  1304095471  1304095513        1999\n",
        "2     1           3          27  1304095698  1304095758        1754"
       ]
      }
     ],
     "prompt_number": 31
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# sort by reviewerid\n",
      "sortedData = mergedData2.ix[mergedData2['reviewerid'].order().index]\n",
      "sortedData.ix[:,0:6].head(3)\n",
      "\n",
      "# however, it's also possible to do: mergedData2.sort(['reviewerid'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id_x</th>\n",
        "      <th>solutionid</th>\n",
        "      <th>reviewerid</th>\n",
        "      <th>start_x</th>\n",
        "      <th>stop_x</th>\n",
        "      <th>timeleft_x</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>3 </strong></td>\n",
        "      <td>  2</td>\n",
        "      <td>  4</td>\n",
        "      <td> 22</td>\n",
        "      <td> 1304095188</td>\n",
        "      <td> 1304095206</td>\n",
        "      <td> 2306</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>5 </strong></td>\n",
        "      <td> 16</td>\n",
        "      <td>  6</td>\n",
        "      <td> 22</td>\n",
        "      <td> 1304095303</td>\n",
        "      <td> 1304095471</td>\n",
        "      <td> 2041</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>13</strong></td>\n",
        "      <td> 12</td>\n",
        "      <td> 14</td>\n",
        "      <td> 22</td>\n",
        "      <td> 1304095280</td>\n",
        "      <td> 1304095301</td>\n",
        "      <td> 2211</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 32,
       "text": [
        "    id_x  solutionid  reviewerid     start_x      stop_x  timeleft_x\n",
        "3      2           4          22  1304095188  1304095206        2306\n",
        "5     16           6          22  1304095303  1304095471        2041\n",
        "13    12          14          22  1304095280  1304095301        2211"
       ]
      }
     ],
     "prompt_number": 32
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# reordering by multiple columns\n",
      "sortedData = mergedData2.sort(['reviewerid', 'id_x'])\n",
      "sortedData.ix[:,0:6].head(3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id_x</th>\n",
        "      <th>solutionid</th>\n",
        "      <th>reviewerid</th>\n",
        "      <th>start_x</th>\n",
        "      <th>stop_x</th>\n",
        "      <th>timeleft_x</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>3 </strong></td>\n",
        "      <td>  2</td>\n",
        "      <td>  4</td>\n",
        "      <td> 22</td>\n",
        "      <td> 1304095188</td>\n",
        "      <td> 1304095206</td>\n",
        "      <td> 2306</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>13</strong></td>\n",
        "      <td> 12</td>\n",
        "      <td> 14</td>\n",
        "      <td> 22</td>\n",
        "      <td> 1304095280</td>\n",
        "      <td> 1304095301</td>\n",
        "      <td> 2211</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>5 </strong></td>\n",
        "      <td> 16</td>\n",
        "      <td>  6</td>\n",
        "      <td> 22</td>\n",
        "      <td> 1304095303</td>\n",
        "      <td> 1304095471</td>\n",
        "      <td> 2041</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 33,
       "text": [
        "    id_x  solutionid  reviewerid     start_x      stop_x  timeleft_x\n",
        "3      2           4          22  1304095188  1304095206        2306\n",
        "13    12          14          22  1304095280  1304095301        2211\n",
        "5     16           6          22  1304095303  1304095471        2041"
       ]
      }
     ],
     "prompt_number": 33
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "reshaping data - example"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "misShaped = pd.DataFrame({'treatmentA' : [NaN, 1, 2], 'treatmentB' : [5, 4, 3]})\n",
      "misShaped['people'] = ['John', 'Jane', 'Mary']\n",
      "misShaped"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>treatmentA</th>\n",
        "      <th>treatmentB</th>\n",
        "      <th>people</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td>NaN</td>\n",
        "      <td> 5</td>\n",
        "      <td> John</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td>  1</td>\n",
        "      <td> 4</td>\n",
        "      <td> Jane</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>2</strong></td>\n",
        "      <td>  2</td>\n",
        "      <td> 3</td>\n",
        "      <td> Mary</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 34,
       "text": [
        "   treatmentA  treatmentB people\n",
        "0         NaN           5   John\n",
        "1           1           4   Jane\n",
        "2           2           3   Mary"
       ]
      }
     ],
     "prompt_number": 34
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# equivalent to R's melt()\n",
      "pd.melt(misShaped, id_vars='people')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>people</th>\n",
        "      <th>variable</th>\n",
        "      <th>value</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <td><strong>0</strong></td>\n",
        "      <td> John</td>\n",
        "      <td> treatmentA</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>1</strong></td>\n",
        "      <td> Jane</td>\n",
        "      <td> treatmentA</td>\n",
        "      <td>  1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>2</strong></td>\n",
        "      <td> Mary</td>\n",
        "      <td> treatmentA</td>\n",
        "      <td>  2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>3</strong></td>\n",
        "      <td> John</td>\n",
        "      <td> treatmentB</td>\n",
        "      <td>  5</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>4</strong></td>\n",
        "      <td> Jane</td>\n",
        "      <td> treatmentB</td>\n",
        "      <td>  4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <td><strong>5</strong></td>\n",
        "      <td> Mary</td>\n",
        "      <td> treatmentB</td>\n",
        "      <td>  3</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 35,
       "text": [
        "  people    variable  value\n",
        "0   John  treatmentA    NaN\n",
        "1   Jane  treatmentA      1\n",
        "2   Mary  treatmentA      2\n",
        "3   John  treatmentB      5\n",
        "4   Jane  treatmentB      4\n",
        "5   Mary  treatmentB      3"
       ]
      }
     ],
     "prompt_number": 35
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": "data_munging_basics"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Data munging basics -- in Python"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import pandas as pd\n",
	"import numpy as np"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 6
	},
	{
	"cell_type": "heading",
	"level": 3,
	"metadata": {},
	"source": [
	"Camera data"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"cameraData = pd.read_csv('./data/cameras.csv')\n",
	"cameraData.columns\n",
	"\n",
	"# somehow the column is named 'Location 1' instead of 'Location.1'"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 7,
	"text": [
	"Index([address, direction, street, crossStreet, intersection, Location 1], dtype=object)"
	]
	}
	],
	"prompt_number": 7
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# transform column names to lowercase\n",
	"# equivalent to R's tolower()\n",
	"cameraData.columns = cameraData.columns.map(lambda x: x.lower())\n",
	"cameraData.columns"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 8,
	"text": [
	"Index([address, direction, street, crossstreet, intersection, location 1], dtype=object)"
	]
	}
	],
	"prompt_number": 8
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# string split; split a string into list of strings\n",
	"# equivalent to R's strsplit()\n",
	"splitNames = cameraData.columns.map(lambda x: x.split(' '))\n",
	"splitNames[4]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 9,
	"text": [
	"['intersection']"
	]
	}
	],
	"prompt_number": 9
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"splitNames[5]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 10,
	"text": [
	"['location', '1']"
	]
	}
	],
	"prompt_number": 10
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"splitNames[5][0]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 11,
	"text": [
	"'location'"
	]
	}
	],
	"prompt_number": 11
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# apply function to names\n",
	"cameraData.columns.map(lambda x: x.split(' ')[0])"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 12,
	"text": [
	"array([address, direction, street, crossstreet, intersection, location], dtype=object)"
	]
	}
	],
	"prompt_number": 12
	},
	{
	"cell_type": "heading",
	"level": 3,
	"metadata": {},
	"source": [
	"Peer review experiment data"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'\n",
	"fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'\n",
	"\n",
	"reviews = pd.read_csv(fileUrl1)\n",
	"solutions = pd.read_csv(fileUrl2)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 13
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"reviews.head(2)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>solution_id</th>\n",
	" <th>reviewer_id</th>\n",
	" <th>start</th>\n",
	" <th>stop</th>\n",
	" <th>time_left</th>\n",
	" <th>accept</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td> 1</td>\n",
	" <td> 3</td>\n",
	" <td> 27</td>\n",
	" <td> 1304095698</td>\n",
	" <td> 1304095758</td>\n",
	" <td> 1754</td>\n",
	" <td> 1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> 2</td>\n",
	" <td> 4</td>\n",
	" <td> 22</td>\n",
	" <td> 1304095188</td>\n",
	" <td> 1304095206</td>\n",
	" <td> 2306</td>\n",
	" <td> 1</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 14,
	"text": [
	" id solution_id reviewer_id start stop time_left accept\n",
	"0 1 3 27 1304095698 1304095758 1754 1\n",
	"1 2 4 22 1304095188 1304095206 2306 1"
	]
	}
	],
	"prompt_number": 14
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"solutions.head(2)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>problem_id</th>\n",
	" <th>subject_id</th>\n",
	" <th>start</th>\n",
	" <th>stop</th>\n",
	" <th>time_left</th>\n",
	" <th>answer</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td> 1</td>\n",
	" <td> 156</td>\n",
	" <td> 29</td>\n",
	" <td> 1304095119</td>\n",
	" <td> 1304095169</td>\n",
	" <td> 2343</td>\n",
	" <td> B</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> 2</td>\n",
	" <td> 269</td>\n",
	" <td> 25</td>\n",
	" <td> 1304095119</td>\n",
	" <td> 1304095183</td>\n",
	" <td> 2329</td>\n",
	" <td> C</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 15,
	"text": [
	" id problem_id subject_id start stop time_left answer\n",
	"0 1 156 29 1304095119 1304095169 2343 B\n",
	"1 2 269 25 1304095119 1304095183 2329 C"
	]
	}
	],
	"prompt_number": 15
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"reviews.columns"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 16,
	"text": [
	"Index([id, solution_id, reviewer_id, start, stop, time_left, accept], dtype=object)"
	]
	}
	],
	"prompt_number": 16
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# remove underscores\n",
	"# equivalent to R's sub()\n",
	"reviews.columns = reviews.columns.map(lambda x: x.replace('_', ''))\n",
	"solutions.columns = solutions.columns.map(lambda x: x.replace('_', ''))"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 17
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# test; every occurence is replaced\n",
	"# in contrast to R's sub(), where only one occurence is replaced\n",
	"testName = 'this_is_a_test'\n",
	"testName.replace('_', '')"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 18,
	"text": [
	"'thisisatest'"
	]
	}
	],
	"prompt_number": 18
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"reviews['timeleft'][:10]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 19,
	"text": [
	"0 1754\n",
	"1 2306\n",
	"2 2192\n",
	"3 2089\n",
	"4 2043\n",
	"5 1999\n",
	"6 2130\n",
	"7 NaN\n",
	"8 1899\n",
	"9 2024\n",
	"Name: timeleft"
	]
	}
	],
	"prompt_number": 19
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# cut into ranges\n",
	"# this is equivalent to R's cut()\n",
	"timeRanges = pd.cut(reviews['timeleft'], range(0, 4000, 600)) # note it needs to go up to 4000 in order to include 3600\n",
	"timeRanges[:10]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 20,
	"text": [
	"Categorical: \n",
	"array([(1200, 1800], (1800, 2400], (1800, 2400], (1800, 2400],\n",
	" (1800, 2400], (1800, 2400], (1800, 2400], nan, (1800, 2400],\n",
	" (1800, 2400]], dtype=object)\n",
	"Levels (6): Index([(0, 600], (600, 1200], (1200, 1800], (1800, 2400],\n",
	" (2400, 3000], (3000, 3600]], dtype=object)"
	]
	}
	],
	"prompt_number": 20
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"type(timeRanges)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 21,
	"text": [
	"pandas.core.categorical.Categorical"
	]
	}
	],
	"prompt_number": 21
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"pd.value_counts(timeRanges)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 22,
	"text": [
	"(600, 1200] 32\n",
	"(0, 600] 30\n",
	"(1800, 2400] 28\n",
	"(1200, 1800] 25"
	]
	}
	],
	"prompt_number": 22
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# equivalent to R's cut2()\n",
	"timeRanges = pd.cut(reviews['timeleft'], 6)\n",
	"pd.value_counts(timeRanges) # note that NaN values are excluded; the resulting ranges are thus different from video"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 23,
	"text": [
	"(19.716, 402.667] 22\n",
	"(1164, 1544.667] 20\n",
	"(783.333, 1164] 19\n",
	"(1544.667, 1925.333] 19\n",
	"(402.667, 783.333] 19\n",
	"(1925.333, 2306] 16"
	]
	}
	],
	"prompt_number": 23
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# adding an extra variable\n",
	"# similarly to R, simply assign to a new column\n",
	"reviews['timeRanges'] = timeRanges\n",
	"reviews.head(2)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>solutionid</th>\n",
	" <th>reviewerid</th>\n",
	" <th>start</th>\n",
	" <th>stop</th>\n",
	" <th>timeleft</th>\n",
	" <th>accept</th>\n",
	" <th>timeRanges</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td> 1</td>\n",
	" <td> 3</td>\n",
	" <td> 27</td>\n",
	" <td> 1304095698</td>\n",
	" <td> 1304095758</td>\n",
	" <td> 1754</td>\n",
	" <td> 1</td>\n",
	" <td> (1544.667, 1925.333]</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> 2</td>\n",
	" <td> 4</td>\n",
	" <td> 22</td>\n",
	" <td> 1304095188</td>\n",
	" <td> 1304095206</td>\n",
	" <td> 2306</td>\n",
	" <td> 1</td>\n",
	" <td> (1925.333, 2306]</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 24,
	"text": [
	" id solutionid reviewerid start stop timeleft accept \\\n",
	"0 1 3 27 1304095698 1304095758 1754 1 \n",
	"1 2 4 22 1304095188 1304095206 2306 1 \n",
	"\n",
	" timeRanges \n",
	"0 (1544.667, 1925.333] \n",
	"1 (1925.333, 2306] "
	]
	}
	],
	"prompt_number": 24
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# merging data\n",
	"print reviews.columns\n",
	"print solutions.columns"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"Index([id, solutionid, reviewerid, start, stop, timeleft, accept, timeRanges], dtype=object)\n",
	"Index([id, problemid, subjectid, start, stop, timeleft, answer], dtype=object)\n"
	]
	}
	],
	"prompt_number": 25
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# equivalent to R's merge()\n",
	"mergedData2 = pd.merge(reviews, solutions, left_on='solutionid', right_on='id', sort=True)\n",
	"mergedData2.ix[:,0:6].head(3)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id_x</th>\n",
	" <th>solutionid</th>\n",
	" <th>reviewerid</th>\n",
	" <th>start_x</th>\n",
	" <th>stop_x</th>\n",
	" <th>timeleft_x</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td> 4</td>\n",
	" <td> 1</td>\n",
	" <td> 26</td>\n",
	" <td> 1304095267</td>\n",
	" <td> 1304095423</td>\n",
	" <td> 2089</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> 6</td>\n",
	" <td> 2</td>\n",
	" <td> 29</td>\n",
	" <td> 1304095471</td>\n",
	" <td> 1304095513</td>\n",
	" <td> 1999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>2</strong></td>\n",
	" <td> 1</td>\n",
	" <td> 3</td>\n",
	" <td> 27</td>\n",
	" <td> 1304095698</td>\n",
	" <td> 1304095758</td>\n",
	" <td> 1754</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 26,
	"text": [
	" id_x solutionid reviewerid start_x stop_x timeleft_x\n",
	"0 4 1 26 1304095267 1304095423 2089\n",
	"1 6 2 29 1304095471 1304095513 1999\n",
	"2 1 3 27 1304095698 1304095758 1754"
	]
	}
	],
	"prompt_number": 26
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"reviews.ix[0,0:6]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 27,
	"text": [
	"id 1\n",
	"solutionid 3\n",
	"reviewerid 27\n",
	"start 1.304096e+09\n",
	"stop 1.304096e+09\n",
	"timeleft 1754\n",
	"Name: 0"
	]
	}
	],
	"prompt_number": 27
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# sorting values\n",
	"mergedData2['reviewerid'][:10]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 28,
	"text": [
	"0 26\n",
	"1 29\n",
	"2 27\n",
	"3 22\n",
	"4 28\n",
	"5 22\n",
	"6 29\n",
	"7 23\n",
	"8 25\n",
	"9 29\n",
	"Name: reviewerid"
	]
	}
	],
	"prompt_number": 28
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# equivalent to R's sort()\n",
	"mergedData2['reviewerid'].order()[:10]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 29,
	"text": [
	"3 22\n",
	"5 22\n",
	"13 22\n",
	"21 22\n",
	"22 22\n",
	"23 22\n",
	"26 22\n",
	"31 22\n",
	"36 22\n",
	"38 22\n",
	"Name: reviewerid"
	]
	}
	],
	"prompt_number": 29
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# R's 'order' can be implemented by getting the index of the sorted Series\n",
	"mergedData2['reviewerid'][mergedData2['reviewerid'].order().index][:10]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 30,
	"text": [
	"3 22\n",
	"5 22\n",
	"13 22\n",
	"21 22\n",
	"22 22\n",
	"23 22\n",
	"26 22\n",
	"31 22\n",
	"36 22\n",
	"38 22\n",
	"Name: reviewerid"
	]
	}
	],
	"prompt_number": 30
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# reordering a DataFrame\n",
	"mergedData2.ix[:,0:6].head(3)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id_x</th>\n",
	" <th>solutionid</th>\n",
	" <th>reviewerid</th>\n",
	" <th>start_x</th>\n",
	" <th>stop_x</th>\n",
	" <th>timeleft_x</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td> 4</td>\n",
	" <td> 1</td>\n",
	" <td> 26</td>\n",
	" <td> 1304095267</td>\n",
	" <td> 1304095423</td>\n",
	" <td> 2089</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> 6</td>\n",
	" <td> 2</td>\n",
	" <td> 29</td>\n",
	" <td> 1304095471</td>\n",
	" <td> 1304095513</td>\n",
	" <td> 1999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>2</strong></td>\n",
	" <td> 1</td>\n",
	" <td> 3</td>\n",
	" <td> 27</td>\n",
	" <td> 1304095698</td>\n",
	" <td> 1304095758</td>\n",
	" <td> 1754</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 31,
	"text": [
	" id_x solutionid reviewerid start_x stop_x timeleft_x\n",
	"0 4 1 26 1304095267 1304095423 2089\n",
	"1 6 2 29 1304095471 1304095513 1999\n",
	"2 1 3 27 1304095698 1304095758 1754"
	]
	}
	],
	"prompt_number": 31
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# sort by reviewerid\n",
	"sortedData = mergedData2.ix[mergedData2['reviewerid'].order().index]\n",
	"sortedData.ix[:,0:6].head(3)\n",
	"\n",
	"# however, it's also possible to do: mergedData2.sort(['reviewerid'])"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id_x</th>\n",
	" <th>solutionid</th>\n",
	" <th>reviewerid</th>\n",
	" <th>start_x</th>\n",
	" <th>stop_x</th>\n",
	" <th>timeleft_x</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>3 </strong></td>\n",
	" <td> 2</td>\n",
	" <td> 4</td>\n",
	" <td> 22</td>\n",
	" <td> 1304095188</td>\n",
	" <td> 1304095206</td>\n",
	" <td> 2306</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>5 </strong></td>\n",
	" <td> 16</td>\n",
	" <td> 6</td>\n",
	" <td> 22</td>\n",
	" <td> 1304095303</td>\n",
	" <td> 1304095471</td>\n",
	" <td> 2041</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>13</strong></td>\n",
	" <td> 12</td>\n",
	" <td> 14</td>\n",
	" <td> 22</td>\n",
	" <td> 1304095280</td>\n",
	" <td> 1304095301</td>\n",
	" <td> 2211</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 32,
	"text": [
	" id_x solutionid reviewerid start_x stop_x timeleft_x\n",
	"3 2 4 22 1304095188 1304095206 2306\n",
	"5 16 6 22 1304095303 1304095471 2041\n",
	"13 12 14 22 1304095280 1304095301 2211"
	]
	}
	],
	"prompt_number": 32
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# reordering by multiple columns\n",
	"sortedData = mergedData2.sort(['reviewerid', 'id_x'])\n",
	"sortedData.ix[:,0:6].head(3)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id_x</th>\n",
	" <th>solutionid</th>\n",
	" <th>reviewerid</th>\n",
	" <th>start_x</th>\n",
	" <th>stop_x</th>\n",
	" <th>timeleft_x</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>3 </strong></td>\n",
	" <td> 2</td>\n",
	" <td> 4</td>\n",
	" <td> 22</td>\n",
	" <td> 1304095188</td>\n",
	" <td> 1304095206</td>\n",
	" <td> 2306</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>13</strong></td>\n",
	" <td> 12</td>\n",
	" <td> 14</td>\n",
	" <td> 22</td>\n",
	" <td> 1304095280</td>\n",
	" <td> 1304095301</td>\n",
	" <td> 2211</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>5 </strong></td>\n",
	" <td> 16</td>\n",
	" <td> 6</td>\n",
	" <td> 22</td>\n",
	" <td> 1304095303</td>\n",
	" <td> 1304095471</td>\n",
	" <td> 2041</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 33,
	"text": [
	" id_x solutionid reviewerid start_x stop_x timeleft_x\n",
	"3 2 4 22 1304095188 1304095206 2306\n",
	"13 12 14 22 1304095280 1304095301 2211\n",
	"5 16 6 22 1304095303 1304095471 2041"
	]
	}
	],
	"prompt_number": 33
	},
	{
	"cell_type": "heading",
	"level": 3,
	"metadata": {},
	"source": [
	"reshaping data - example"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"misShaped = pd.DataFrame({'treatmentA' : [NaN, 1, 2], 'treatmentB' : [5, 4, 3]})\n",
	"misShaped['people'] = ['John', 'Jane', 'Mary']\n",
	"misShaped"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>treatmentA</th>\n",
	" <th>treatmentB</th>\n",
	" <th>people</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td>NaN</td>\n",
	" <td> 5</td>\n",
	" <td> John</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> 1</td>\n",
	" <td> 4</td>\n",
	" <td> Jane</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>2</strong></td>\n",
	" <td> 2</td>\n",
	" <td> 3</td>\n",
	" <td> Mary</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 34,
	"text": [
	" treatmentA treatmentB people\n",
	"0 NaN 5 John\n",
	"1 1 4 Jane\n",
	"2 2 3 Mary"
	]
	}
	],
	"prompt_number": 34
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# equivalent to R's melt()\n",
	"pd.melt(misShaped, id_vars='people')"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>people</th>\n",
	" <th>variable</th>\n",
	" <th>value</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td><strong>0</strong></td>\n",
	" <td> John</td>\n",
	" <td> treatmentA</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>1</strong></td>\n",
	" <td> Jane</td>\n",
	" <td> treatmentA</td>\n",
	" <td> 1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>2</strong></td>\n",
	" <td> Mary</td>\n",
	" <td> treatmentA</td>\n",
	" <td> 2</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>3</strong></td>\n",
	" <td> John</td>\n",
	" <td> treatmentB</td>\n",
	" <td> 5</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>4</strong></td>\n",
	" <td> Jane</td>\n",
	" <td> treatmentB</td>\n",
	" <td> 4</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td><strong>5</strong></td>\n",
	" <td> Mary</td>\n",
	" <td> treatmentB</td>\n",
	" <td> 3</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"output_type": "pyout",
	"prompt_number": 35,
	"text": [
	" people variable value\n",
	"0 John treatmentA NaN\n",
	"1 Jane treatmentA 1\n",
	"2 Mary treatmentA 2\n",
	"3 John treatmentB 5\n",
	"4 Jane treatmentB 4\n",
	"5 Mary treatmentB 3"
	]
	}
	],
	"prompt_number": 35
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [],
	"language": "python",
	"metadata": {},
	"outputs": []
	}
	],
	"metadata": {}
	}
	]
	}