gchoueiter/data_transformations_in_pandas.ipynb

## data_transformations_in_pandas.ipynb
{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Data Transformations with Pandas"
     ]
    },
    {
     "cell_type": "raw",
     "metadata": {},
     "source": [
      "    There are quite a few functions to perform data transformations on Series and DataFrames in the Pandas library. It can get confusing pretty fast. Let's go through the main ones and see when you would use each one."
     ]
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "map()"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "from pandas import Series, DataFrame\n",
      "import numpy as np\n",
      "\n",
      "movies = ['Les Miserables','The Wolverine in 3D','Despicable me 2','Turbo','World War Z','The Heat']\n",
      "movieTitles = Series(data = movies,index = ['movie'+str(i) for i in xrange(1,7)])\n",
      "print('The movie titles')\n",
      "print(movieTitles)\n",
      "\n",
      "# Map is used with Series\n",
      "# let's use map to convert every movie in the Series to the length of the title\n",
      "# map takes in one value and returns one value (DataFrames have applymap to do that)\n",
      "def get_title_length(title):\n",
      "    return len(title.split(' '))\n",
      "\n",
      "print('\\nThe movie title lengths')\n",
      "movieLen = movieTitles.map(get_title_length)\n",
      "print(movieLen)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "The movie titles\n",
        "movie1         Les Miserables\n",
        "movie2    The Wolverine in 3D\n",
        "movie3        Despicable me 2\n",
        "movie4                  Turbo\n",
        "movie5            World War Z\n",
        "movie6               The Heat\n",
        "dtype: object\n",
        "\n",
        "The movie title lengths\n",
        "movie1    2\n",
        "movie2    4\n",
        "movie3    3\n",
        "movie4    1\n",
        "movie5    3\n",
        "movie6    2\n",
        "dtype: int64\n"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "agg()"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "As the name indicates, agg is used to aggregate data and produce a scalar from an array."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data = DataFrame(data={'gender':['F','M','F','F','M'],'name':['em','jo','sid','lu','ty'],'calories':[1000,2000,1300,1500,1800]})\n",
      "print(data)\n",
      "\n",
      "# aggregating using existing function mean()\n",
      "grouped = data.groupby(['gender'])\n",
      "grouped.mean()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "   calories gender name\n",
        "0      1000      F   em\n",
        "1      2000      M   jo\n",
        "2      1300      F  sid\n",
        "3      1500      F   lu\n",
        "4      1800      M   ty\n"
       ]
      },
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>calories</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>gender</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>F</th>\n",
        "      <td> 1266.666667</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>M</th>\n",
        "      <td> 1900.000000</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 110,
       "text": [
        "           calories\n",
        "gender             \n",
        "F       1266.666667\n",
        "M       1900.000000"
       ]
      }
     ],
     "prompt_number": 110
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# you can use agg with your own function\n",
      "def max_minus_min(s):\n",
      "    return max(s)-min(s)\n",
      "\n",
      "grouped['calories'].agg(max_minus_min)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 111,
       "text": [
        "gender\n",
        "F         500\n",
        "M         200\n",
        "Name: calories, dtype: int64"
       ]
      }
     ],
     "prompt_number": 111
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "transform()"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "transform takes agg one step further and propagates the scalar value in the group."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(data)\n",
      "\n",
      "# Such a transformation would not be possible with agg because a \n",
      "# single scalar value is returned and it is not propagated to all\n",
      "# the appropriate locations\n",
      "data['max_minus_min'] = grouped.transform(max_minus_min)\n",
      "print('\\nAfter adding max_minus_min col')\n",
      "print(data)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "   calories gender name\n",
        "0      1000      F   em\n",
        "1      2000      M   jo\n",
        "2      1300      F  sid\n",
        "3      1500      F   lu\n",
        "4      1800      M   ty\n",
        "\n",
        "After adding max_minus_min col\n",
        "   calories gender name  max_minus_min\n",
        "0      1000      F   em            500\n",
        "1      2000      M   jo            200\n",
        "2      1300      F  sid            500\n",
        "3      1500      F   lu            500\n",
        "4      1800      M   ty            200\n"
       ]
      }
     ],
     "prompt_number": 112
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "apply()"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "apply is the most general purpose GroupBy method. It splits the object invoking the function into pieces, applies the function, and then sticks all the pieces together."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# zscore returns an object the same size as the input - does not aggregate\n",
      "def zscore(s):\n",
      "    return (s-s.mean())/s.std()\n",
      "\n",
      "grouped['calories'].apply(zscore)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 113,
       "text": [
        "0   -1.059626\n",
        "1    0.707107\n",
        "2    0.132453\n",
        "3    0.927173\n",
        "4   -0.707107\n",
        "dtype: float64"
       ]
      }
     ],
     "prompt_number": 113
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def highest(g,column='calories'):\n",
      "    return g['name'][g[column].idxmax()]\n",
      "\n",
      "print(grouped.apply(highest))\n",
      "print(grouped.apply(highest,column='max_minus_min'))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "gender\n",
        "F         lu\n",
        "M         jo\n",
        "dtype: object\n",
        "gender\n",
        "F         em\n",
        "M         jo\n",
        "dtype: object\n"
       ]
      }
     ],
     "prompt_number": 117
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "applymap()"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Going back a few steps, applymap is for DataFrames what map is for Series. One-to-one mapping.\n",
      "\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data = DataFrame(np.random.randn(7,4),columns = ['col'+str(i) for i in xrange(1,5)])\n",
      "print(data)\n",
      "new_data = data.applymap(lambda x:int(x))\n",
      "print('\\n After applymap transformation')\n",
      "print(new_data)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "       col1      col2      col3      col4\n",
        "0  0.136532  1.015933 -1.196695 -0.616575\n",
        "1 -0.386423  0.979901  1.155770  0.780860\n",
        "2 -0.422275  1.058623 -0.380674  0.229553\n",
        "3 -0.797642  1.770611 -0.363573  0.596333\n",
        "4 -0.579864  0.419626  1.197262 -0.545685\n",
        "5 -0.775948 -0.302894  0.386808  1.141303\n",
        "6  0.985044 -1.533573 -0.445965  1.482011\n",
        "\n",
        " After applymap transformation\n",
        "   col1  col2  col3  col4\n",
        "0     0     1    -1     0\n",
        "1     0     0     1     0\n",
        "2     0     1     0     0\n",
        "3     0     1     0     0\n",
        "4     0     0     1     0\n",
        "5     0     0     0     1\n",
        "6     0    -1     0     1\n"
       ]
      }
     ],
     "prompt_number": 51
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": ""
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"Data Transformations with Pandas"
	]
	},
	{
	"cell_type": "raw",
	"metadata": {},
	"source": [
	" There are quite a few functions to perform data transformations on Series and DataFrames in the Pandas library. It can get confusing pretty fast. Let's go through the main ones and see when you would use each one."
	]
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"map()"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import pandas as pd\n",
	"from pandas import Series, DataFrame\n",
	"import numpy as np\n",
	"\n",
	"movies = ['Les Miserables','The Wolverine in 3D','Despicable me 2','Turbo','World War Z','The Heat']\n",
	"movieTitles = Series(data = movies,index = ['movie'+str(i) for i in xrange(1,7)])\n",
	"print('The movie titles')\n",
	"print(movieTitles)\n",
	"\n",
	"# Map is used with Series\n",
	"# let's use map to convert every movie in the Series to the length of the title\n",
	"# map takes in one value and returns one value (DataFrames have applymap to do that)\n",
	"def get_title_length(title):\n",
	" return len(title.split(' '))\n",
	"\n",
	"print('\\nThe movie title lengths')\n",
	"movieLen = movieTitles.map(get_title_length)\n",
	"print(movieLen)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"The movie titles\n",
	"movie1 Les Miserables\n",
	"movie2 The Wolverine in 3D\n",
	"movie3 Despicable me 2\n",
	"movie4 Turbo\n",
	"movie5 World War Z\n",
	"movie6 The Heat\n",
	"dtype: object\n",
	"\n",
	"The movie title lengths\n",
	"movie1 2\n",
	"movie2 4\n",
	"movie3 3\n",
	"movie4 1\n",
	"movie5 3\n",
	"movie6 2\n",
	"dtype: int64\n"
	]
	}
	],
	"prompt_number": 15
	},
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"agg()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"As the name indicates, agg is used to aggregate data and produce a scalar from an array."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"data = DataFrame(data={'gender':['F','M','F','F','M'],'name':['em','jo','sid','lu','ty'],'calories':[1000,2000,1300,1500,1800]})\n",
	"print(data)\n",
	"\n",
	"# aggregating using existing function mean()\n",
	"grouped = data.groupby(['gender'])\n",
	"grouped.mean()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" calories gender name\n",
	"0 1000 F em\n",
	"1 2000 M jo\n",
	"2 1300 F sid\n",
	"3 1500 F lu\n",
	"4 1800 M ty\n"
	]
	},
	{
	"html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>calories</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>gender</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>F</th>\n",
	" <td> 1266.666667</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>M</th>\n",
	" <td> 1900.000000</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 110,
	"text": [
	" calories\n",
	"gender \n",
	"F 1266.666667\n",
	"M 1900.000000"
	]
	}
	],
	"prompt_number": 110
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# you can use agg with your own function\n",
	"def max_minus_min(s):\n",
	" return max(s)-min(s)\n",
	"\n",
	"grouped['calories'].agg(max_minus_min)\n"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 111,
	"text": [
	"gender\n",
	"F 500\n",
	"M 200\n",
	"Name: calories, dtype: int64"
	]
	}
	],
	"prompt_number": 111
	},
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"transform()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"transform takes agg one step further and propagates the scalar value in the group."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"print(data)\n",
	"\n",
	"# Such a transformation would not be possible with agg because a \n",
	"# single scalar value is returned and it is not propagated to all\n",
	"# the appropriate locations\n",
	"data['max_minus_min'] = grouped.transform(max_minus_min)\n",
	"print('\\nAfter adding max_minus_min col')\n",
	"print(data)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" calories gender name\n",
	"0 1000 F em\n",
	"1 2000 M jo\n",
	"2 1300 F sid\n",
	"3 1500 F lu\n",
	"4 1800 M ty\n",
	"\n",
	"After adding max_minus_min col\n",
	" calories gender name max_minus_min\n",
	"0 1000 F em 500\n",
	"1 2000 M jo 200\n",
	"2 1300 F sid 500\n",
	"3 1500 F lu 500\n",
	"4 1800 M ty 200\n"
	]
	}
	],
	"prompt_number": 112
	},
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"apply()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"apply is the most general purpose GroupBy method. It splits the object invoking the function into pieces, applies the function, and then sticks all the pieces together."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# zscore returns an object the same size as the input - does not aggregate\n",
	"def zscore(s):\n",
	" return (s-s.mean())/s.std()\n",
	"\n",
	"grouped['calories'].apply(zscore)\n"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 113,
	"text": [
	"0 -1.059626\n",
	"1 0.707107\n",
	"2 0.132453\n",
	"3 0.927173\n",
	"4 -0.707107\n",
	"dtype: float64"
	]
	}
	],
	"prompt_number": 113
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"def highest(g,column='calories'):\n",
	" return g['name'][g[column].idxmax()]\n",
	"\n",
	"print(grouped.apply(highest))\n",
	"print(grouped.apply(highest,column='max_minus_min'))"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"gender\n",
	"F lu\n",
	"M jo\n",
	"dtype: object\n",
	"gender\n",
	"F em\n",
	"M jo\n",
	"dtype: object\n"
	]
	}
	],
	"prompt_number": 117
	},
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"applymap()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Going back a few steps, applymap is for DataFrames what map is for Series. One-to-one mapping.\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"data = DataFrame(np.random.randn(7,4),columns = ['col'+str(i) for i in xrange(1,5)])\n",
	"print(data)\n",
	"new_data = data.applymap(lambda x:int(x))\n",
	"print('\\n After applymap transformation')\n",
	"print(new_data)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" col1 col2 col3 col4\n",
	"0 0.136532 1.015933 -1.196695 -0.616575\n",
	"1 -0.386423 0.979901 1.155770 0.780860\n",
	"2 -0.422275 1.058623 -0.380674 0.229553\n",
	"3 -0.797642 1.770611 -0.363573 0.596333\n",
	"4 -0.579864 0.419626 1.197262 -0.545685\n",
	"5 -0.775948 -0.302894 0.386808 1.141303\n",
	"6 0.985044 -1.533573 -0.445965 1.482011\n",
	"\n",
	" After applymap transformation\n",
	" col1 col2 col3 col4\n",
	"0 0 1 -1 0\n",
	"1 0 0 1 0\n",
	"2 0 1 0 0\n",
	"3 0 1 0 0\n",
	"4 0 0 1 0\n",
	"5 0 0 0 1\n",
	"6 0 -1 0 1\n"
	]
	}
	],
	"prompt_number": 51
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [],
	"language": "python",
	"metadata": {},
	"outputs": []
	}
	],
	"metadata": {}
	}
	]
	}