Skip to content

Instantly share code, notes, and snippets.

@upepo
Created September 6, 2013 02:21
Show Gist options
  • Save upepo/6458776 to your computer and use it in GitHub Desktop.
Save upepo/6458776 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "5. Getting Started with pandas"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5. Getting Started with pandas\n",
"\n",
"* It contains high-level data structures and manipulation tools designed to make data analysis fast and easy in Python.\n",
"* pandas is built on top of NumPy and makes it easy to use in NumPy-centric applications."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pandas import Series, DataFrame\n",
"import pandas as pd \n",
"import numpy as np"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Series\n",
"\n",
"A Series is a one-dimensional array-like object containing an array of data (of any NumPy data type) and an associated array of data labels, called its index."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series([4,7,-5,3])\n",
"obj"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 167,
"text": [
"0 4\n",
"1 7\n",
"2 -5\n",
"3 3\n",
"dtype: int64"
]
}
],
"prompt_number": 167
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.values"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 3,
"text": [
"array([ 4, 7, -5, 3])"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.index"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 4,
"text": [
"Int64Index([0, 1, 2, 3], dtype=int64)"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dir(obj)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 5,
"text": [
"['T',\n",
" '_AXIS_ALIASES',\n",
" '_AXIS_NAMES',\n",
" '_AXIS_NUMBERS',\n",
" '__abs__',\n",
" '__add__',\n",
" '__and__',\n",
" '__array__',\n",
" '__array_finalize__',\n",
" '__array_interface__',\n",
" '__array_prepare__',\n",
" '__array_priority__',\n",
" '__array_struct__',\n",
" '__array_wrap__',\n",
" '__bytes__',\n",
" '__class__',\n",
" '__contains__',\n",
" '__copy__',\n",
" '__deepcopy__',\n",
" '__delattr__',\n",
" '__delitem__',\n",
" '__delslice__',\n",
" '__dict__',\n",
" '__div__',\n",
" '__divmod__',\n",
" '__doc__',\n",
" '__eq__',\n",
" '__float__',\n",
" '__floordiv__',\n",
" '__format__',\n",
" '__ge__',\n",
" '__getattribute__',\n",
" '__getitem__',\n",
" '__getslice__',\n",
" '__gt__',\n",
" '__hash__',\n",
" '__hex__',\n",
" '__iadd__',\n",
" '__iand__',\n",
" '__idiv__',\n",
" '__ifloordiv__',\n",
" '__ilshift__',\n",
" '__imod__',\n",
" '__imul__',\n",
" '__index__',\n",
" '__init__',\n",
" '__int__',\n",
" '__invert__',\n",
" '__ior__',\n",
" '__ipow__',\n",
" '__irshift__',\n",
" '__isub__',\n",
" '__iter__',\n",
" '__itruediv__',\n",
" '__ixor__',\n",
" '__le__',\n",
" '__len__',\n",
" '__long__',\n",
" '__lshift__',\n",
" '__lt__',\n",
" '__mod__',\n",
" '__module__',\n",
" '__mul__',\n",
" '__ne__',\n",
" '__neg__',\n",
" '__new__',\n",
" '__nonzero__',\n",
" '__oct__',\n",
" '__or__',\n",
" '__pos__',\n",
" '__pow__',\n",
" '__radd__',\n",
" '__rand__',\n",
" '__rdiv__',\n",
" '__rdivmod__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__rfloordiv__',\n",
" '__rlshift__',\n",
" '__rmod__',\n",
" '__rmul__',\n",
" '__ror__',\n",
" '__rpow__',\n",
" '__rrshift__',\n",
" '__rshift__',\n",
" '__rsub__',\n",
" '__rtruediv__',\n",
" '__rxor__',\n",
" '__setattr__',\n",
" '__setitem__',\n",
" '__setslice__',\n",
" '__setstate__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__sub__',\n",
" '__subclasshook__',\n",
" '__truediv__',\n",
" '__unicode__',\n",
" '__weakref__',\n",
" '__xor__',\n",
" '_agg_by_level',\n",
" '_at',\n",
" '_binop',\n",
" '_can_hold_na',\n",
" '_constructor',\n",
" '_create_indexer',\n",
" '_get_axis',\n",
" '_get_axis_name',\n",
" '_get_axis_number',\n",
" '_get_repr',\n",
" '_get_val_at',\n",
" '_get_values',\n",
" '_get_values_tuple',\n",
" '_get_with',\n",
" '_iat',\n",
" '_iloc',\n",
" '_index',\n",
" '_is_mixed_type',\n",
" '_ix',\n",
" '_ixs',\n",
" '_loc',\n",
" '_make_time_series',\n",
" '_reindex_indexer',\n",
" '_repr_footer',\n",
" '_set_labels',\n",
" '_set_values',\n",
" '_set_with',\n",
" '_slice',\n",
" '_tidy_repr',\n",
" '_xs',\n",
" 'abs',\n",
" 'add',\n",
" 'align',\n",
" 'all',\n",
" 'any',\n",
" 'append',\n",
" 'apply',\n",
" 'argmax',\n",
" 'argmin',\n",
" 'argsort',\n",
" 'asfreq',\n",
" 'asof',\n",
" 'astype',\n",
" 'at',\n",
" 'at_time',\n",
" 'autocorr',\n",
" 'axes',\n",
" 'base',\n",
" 'between',\n",
" 'between_time',\n",
" 'bfill',\n",
" 'byteswap',\n",
" 'choose',\n",
" 'clip',\n",
" 'clip_lower',\n",
" 'clip_upper',\n",
" 'combine',\n",
" 'combine_first',\n",
" 'compress',\n",
" 'conj',\n",
" 'conjugate',\n",
" 'convert_objects',\n",
" 'copy',\n",
" 'corr',\n",
" 'count',\n",
" 'cov',\n",
" 'ctypes',\n",
" 'cummax',\n",
" 'cummin',\n",
" 'cumprod',\n",
" 'cumsum',\n",
" 'data',\n",
" 'describe',\n",
" 'diagonal',\n",
" 'diff',\n",
" 'div',\n",
" 'dot',\n",
" 'drop',\n",
" 'drop_duplicates',\n",
" 'dropna',\n",
" 'dtype',\n",
" 'dump',\n",
" 'dumps',\n",
" 'duplicated',\n",
" 'ffill',\n",
" 'fill',\n",
" 'fillna',\n",
" 'first',\n",
" 'first_valid_index',\n",
" 'flags',\n",
" 'flat',\n",
" 'flatten',\n",
" 'from_array',\n",
" 'from_csv',\n",
" 'get',\n",
" 'get_dtype_counts',\n",
" 'get_value',\n",
" 'getfield',\n",
" 'groupby',\n",
" 'head',\n",
" 'hist',\n",
" 'iat',\n",
" 'idxmax',\n",
" 'idxmin',\n",
" 'iget',\n",
" 'iget_value',\n",
" 'iloc',\n",
" 'imag',\n",
" 'index',\n",
" 'interpolate',\n",
" 'irow',\n",
" 'isin',\n",
" 'isnull',\n",
" 'item',\n",
" 'itemset',\n",
" 'itemsize',\n",
" 'iteritems',\n",
" 'iterkv',\n",
" 'ix',\n",
" 'keys',\n",
" 'kurt',\n",
" 'last',\n",
" 'last_valid_index',\n",
" 'load',\n",
" 'loc',\n",
" 'mad',\n",
" 'map',\n",
" 'mask',\n",
" 'max',\n",
" 'mean',\n",
" 'median',\n",
" 'min',\n",
" 'mul',\n",
" 'name',\n",
" 'nbytes',\n",
" 'ndim',\n",
" 'newbyteorder',\n",
" 'nonzero',\n",
" 'notnull',\n",
" 'nunique',\n",
" 'order',\n",
" 'pct_change',\n",
" 'plot',\n",
" 'prod',\n",
" 'ptp',\n",
" 'put',\n",
" 'quantile',\n",
" 'rank',\n",
" 'ravel',\n",
" 'real',\n",
" 'reindex',\n",
" 'reindex_axis',\n",
" 'reindex_like',\n",
" 'rename',\n",
" 'reorder_levels',\n",
" 'repeat',\n",
" 'replace',\n",
" 'resample',\n",
" 'reset_index',\n",
" 'reshape',\n",
" 'resize',\n",
" 'round',\n",
" 'save',\n",
" 'searchsorted',\n",
" 'select',\n",
" 'set_value',\n",
" 'setasflat',\n",
" 'setfield',\n",
" 'setflags',\n",
" 'shape',\n",
" 'shift',\n",
" 'size',\n",
" 'skew',\n",
" 'sort',\n",
" 'sort_index',\n",
" 'sortlevel',\n",
" 'squeeze',\n",
" 'std',\n",
" 'str',\n",
" 'strides',\n",
" 'sub',\n",
" 'sum',\n",
" 'swapaxes',\n",
" 'swaplevel',\n",
" 'tail',\n",
" 'take',\n",
" 'to_csv',\n",
" 'to_dict',\n",
" 'to_hdf',\n",
" 'to_sparse',\n",
" 'to_string',\n",
" 'tofile',\n",
" 'tolist',\n",
" 'tostring',\n",
" 'trace',\n",
" 'transpose',\n",
" 'truncate',\n",
" 'tshift',\n",
" 'tz_convert',\n",
" 'tz_localize',\n",
" 'unique',\n",
" 'unstack',\n",
" 'update',\n",
" 'valid',\n",
" 'value_counts',\n",
" 'values',\n",
" 'var',\n",
" 'view',\n",
" 'weekday',\n",
" 'where']"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj2 = Series([4,7,-5,3], index=['d','b','a','c'])\n",
"obj2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 6,
"text": [
"d 4\n",
"b 7\n",
"a -5\n",
"c 3\n",
"dtype: int64"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj2.index"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 7,
"text": [
"Index([d, b, a, c], dtype=object)"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print obj2['a']\n",
"obj2['d']=6\n",
"obj2[['c','a','d']]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"-5\n"
]
},
{
"output_type": "pyout",
"prompt_number": 8,
"text": [
"c 3\n",
"a -5\n",
"d 6\n",
"dtype: int64"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj2[obj2>0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 170,
"text": [
"b 7.2\n",
"c 3.6\n",
"d 4.5\n",
"dtype: float64"
]
}
],
"prompt_number": 170
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj2*2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 10,
"text": [
"d 12\n",
"b 14\n",
"a -10\n",
"c 6\n",
"dtype: int64"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"np.exp(obj2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 175,
"text": [
"a 0.004992\n",
"b 1339.430764\n",
"c 36.598234\n",
"d 90.017131\n",
"e NaN\n",
"dtype: float64"
]
}
],
"prompt_number": 175
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print 'b' in obj2\n",
"print 'e' in obj2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"True\n",
"False\n"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#python dict to Series\n",
"sdata = {'Ohio':35000, 'Texas':71000, 'Oregon':16000, 'Utah':5000}\n",
"obj3 = Series(sdata)\n",
"obj3"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 13,
"text": [
"Ohio 35000\n",
"Oregon 16000\n",
"Texas 71000\n",
"Utah 5000\n",
"dtype: int64"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"states=['California', 'Ohio', 'Oregon', 'Texas']\n",
"obj4 = Series(sdata, index=states)\n",
"obj4"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 14,
"text": [
"California NaN\n",
"Ohio 35000\n",
"Oregon 16000\n",
"Texas 71000\n",
"dtype: float64"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print pd.isnull(obj4)\n",
"print '----'\n",
"print pd.notnull(obj4)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"California True\n",
"Ohio False\n",
"Oregon False\n",
"Texas False\n",
"dtype: bool\n",
"----\n",
"California False\n",
"Ohio True\n",
"Oregon True\n",
"Texas True\n",
"dtype: bool\n"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj3+obj4"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 17,
"text": [
"California NaN\n",
"Ohio 70000\n",
"Oregon 32000\n",
"Texas 142000\n",
"Utah NaN\n",
"dtype: float64"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj4"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 18,
"text": [
"California NaN\n",
"Ohio 35000\n",
"Oregon 16000\n",
"Texas 71000\n",
"dtype: float64"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj4.name = 'population'\n",
"obj4.index.name = 'state'\n",
"obj4"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 19,
"text": [
"state\n",
"California NaN\n",
"Ohio 35000\n",
"Oregon 16000\n",
"Texas 71000\n",
"Name: population, dtype: float64"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.index=['Bob','Steve','Jeff','Ryan']\n",
"obj\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 178,
"text": [
"Bob 4\n",
"Steve 7\n",
"Jeff -5\n",
"Ryan 3\n",
"dtype: int64"
]
}
],
"prompt_number": 178
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataFrame\n",
"\n",
"* A DataFrame represents a tabular, spreadsheet-like data structure containing an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). \n",
"* The DataFrame has both a row and column index; it can be thought of as a dict of Series (one for all sharing the same index). \n",
"* Compared with other such DataFrame-like structures you may have used before (like R\u2019s data.frame), row-oriented and column-oriented operations in DataFrame are treated roughly symmetrically. \n",
"* Under the hood, the data is stored as one or more two-dimensional blocks rather than a list, dict, or some other collection of one-dimensional arrays."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data = {'state':['Ohio','Ohio','Ohio', 'Nevada', 'Nevada'],\n",
" 'year':[2000,2001,2002,2001,2002],\n",
" 'pop':[1.5,1.7,3.6,2.4,2.9]}\n",
"frame = DataFrame(data)\n",
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>pop</th>\n",
" <th>state</th>\n",
" <th>year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1.5</td>\n",
" <td> Ohio</td>\n",
" <td> 2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1.7</td>\n",
" <td> Ohio</td>\n",
" <td> 2001</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3.6</td>\n",
" <td> Ohio</td>\n",
" <td> 2002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 2.4</td>\n",
" <td> Nevada</td>\n",
" <td> 2001</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 2.9</td>\n",
" <td> Nevada</td>\n",
" <td> 2002</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 186,
"text": [
" pop state year\n",
"0 1.5 Ohio 2000\n",
"1 1.7 Ohio 2001\n",
"2 3.6 Ohio 2002\n",
"3 2.4 Nevada 2001\n",
"4 2.9 Nevada 2002"
]
}
],
"prompt_number": 186
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"DataFrame(data, columns=['year','state','pop'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>state</th>\n",
" <th>pop</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 2000</td>\n",
" <td> Ohio</td>\n",
" <td> 1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 2001</td>\n",
" <td> Ohio</td>\n",
" <td> 1.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 2002</td>\n",
" <td> Ohio</td>\n",
" <td> 3.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 2001</td>\n",
" <td> Nevada</td>\n",
" <td> 2.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 2002</td>\n",
" <td> Nevada</td>\n",
" <td> 2.9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 187,
"text": [
" year state pop\n",
"0 2000 Ohio 1.5\n",
"1 2001 Ohio 1.7\n",
"2 2002 Ohio 3.6\n",
"3 2001 Nevada 2.4\n",
"4 2002 Nevada 2.9"
]
}
],
"prompt_number": 187
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame2= DataFrame(data, columns=['year', 'state','pop', 'debt'], \n",
" index=['one','two','three','four','five'])\n",
"frame2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>state</th>\n",
" <th>pop</th>\n",
" <th>debt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 2000</td>\n",
" <td> Ohio</td>\n",
" <td> 1.5</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td> 2001</td>\n",
" <td> Ohio</td>\n",
" <td> 1.7</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 2002</td>\n",
" <td> Ohio</td>\n",
" <td> 3.6</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>four</th>\n",
" <td> 2001</td>\n",
" <td> Nevada</td>\n",
" <td> 2.4</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>five</th>\n",
" <td> 2002</td>\n",
" <td> Nevada</td>\n",
" <td> 2.9</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 188,
"text": [
" year state pop debt\n",
"one 2000 Ohio 1.5 NaN\n",
"two 2001 Ohio 1.7 NaN\n",
"three 2002 Ohio 3.6 NaN\n",
"four 2001 Nevada 2.4 NaN\n",
"five 2002 Nevada 2.9 NaN"
]
}
],
"prompt_number": 188
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame2['state']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 197,
"text": [
"one Ohio\n",
"two Ohio\n",
"three Ohio\n",
"four Nevada\n",
"five Nevada\n",
"Name: state, dtype: object"
]
}
],
"prompt_number": 197
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame2.ix['three']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 201,
"text": [
"year 2002\n",
"state Ohio\n",
"pop 3.6\n",
"debt NaN\n",
"Name: three, dtype: object"
]
}
],
"prompt_number": 201
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame2['debt']=16.5\n",
"frame2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>state</th>\n",
" <th>pop</th>\n",
" <th>debt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 2000</td>\n",
" <td> Ohio</td>\n",
" <td> 1.5</td>\n",
" <td> 16.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td> 2001</td>\n",
" <td> Ohio</td>\n",
" <td> 1.7</td>\n",
" <td> 16.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 2002</td>\n",
" <td> Ohio</td>\n",
" <td> 3.6</td>\n",
" <td> 16.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>four</th>\n",
" <td> 2001</td>\n",
" <td> Nevada</td>\n",
" <td> 2.4</td>\n",
" <td> 16.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>five</th>\n",
" <td> 2002</td>\n",
" <td> Nevada</td>\n",
" <td> 2.9</td>\n",
" <td> 16.5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 28,
"text": [
" year state pop debt\n",
"one 2000 Ohio 1.5 16.5\n",
"two 2001 Ohio 1.7 16.5\n",
"three 2002 Ohio 3.6 16.5\n",
"four 2001 Nevada 2.4 16.5\n",
"five 2002 Nevada 2.9 16.5"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame2['debt'] = np.arange(5.)\n",
"frame2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>state</th>\n",
" <th>pop</th>\n",
" <th>debt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 2000</td>\n",
" <td> Ohio</td>\n",
" <td> 1.5</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td> 2001</td>\n",
" <td> Ohio</td>\n",
" <td> 1.7</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 2002</td>\n",
" <td> Ohio</td>\n",
" <td> 3.6</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>four</th>\n",
" <td> 2001</td>\n",
" <td> Nevada</td>\n",
" <td> 2.4</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>five</th>\n",
" <td> 2002</td>\n",
" <td> Nevada</td>\n",
" <td> 2.9</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 29,
"text": [
" year state pop debt\n",
"one 2000 Ohio 1.5 0\n",
"two 2001 Ohio 1.7 1\n",
"three 2002 Ohio 3.6 2\n",
"four 2001 Nevada 2.4 3\n",
"five 2002 Nevada 2.9 4"
]
}
],
"prompt_number": 29
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"np.arange(5.)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 30,
"text": [
"array([ 0., 1., 2., 3., 4.])"
]
}
],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])\n",
"frame2['debt'] = val\n",
"frame2 "
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>state</th>\n",
" <th>pop</th>\n",
" <th>debt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 2000</td>\n",
" <td> Ohio</td>\n",
" <td> 1.5</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td> 2001</td>\n",
" <td> Ohio</td>\n",
" <td> 1.7</td>\n",
" <td>-1.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 2002</td>\n",
" <td> Ohio</td>\n",
" <td> 3.6</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>four</th>\n",
" <td> 2001</td>\n",
" <td> Nevada</td>\n",
" <td> 2.4</td>\n",
" <td>-1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>five</th>\n",
" <td> 2002</td>\n",
" <td> Nevada</td>\n",
" <td> 2.9</td>\n",
" <td>-1.7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 31,
"text": [
" year state pop debt\n",
"one 2000 Ohio 1.5 NaN\n",
"two 2001 Ohio 1.7 -1.2\n",
"three 2002 Ohio 3.6 NaN\n",
"four 2001 Nevada 2.4 -1.5\n",
"five 2002 Nevada 2.9 -1.7"
]
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame2['eastern'] = frame2.state == 'Ohio'\n",
"frame2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>state</th>\n",
" <th>pop</th>\n",
" <th>debt</th>\n",
" <th>eastern</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 2000</td>\n",
" <td> Ohio</td>\n",
" <td> 1.5</td>\n",
" <td> NaN</td>\n",
" <td> True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td> 2001</td>\n",
" <td> Ohio</td>\n",
" <td> 1.7</td>\n",
" <td>-1.2</td>\n",
" <td> True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 2002</td>\n",
" <td> Ohio</td>\n",
" <td> 3.6</td>\n",
" <td> NaN</td>\n",
" <td> True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>four</th>\n",
" <td> 2001</td>\n",
" <td> Nevada</td>\n",
" <td> 2.4</td>\n",
" <td>-1.5</td>\n",
" <td> False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>five</th>\n",
" <td> 2002</td>\n",
" <td> Nevada</td>\n",
" <td> 2.9</td>\n",
" <td>-1.7</td>\n",
" <td> False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 35,
"text": [
" year state pop debt eastern\n",
"one 2000 Ohio 1.5 NaN True\n",
"two 2001 Ohio 1.7 -1.2 True\n",
"three 2002 Ohio 3.6 NaN True\n",
"four 2001 Nevada 2.4 -1.5 False\n",
"five 2002 Nevada 2.9 -1.7 False"
]
}
],
"prompt_number": 35
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"del frame2['eastern']\n",
"frame2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>state</th>\n",
" <th>pop</th>\n",
" <th>debt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 2000</td>\n",
" <td> Ohio</td>\n",
" <td> 1.5</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>two</th>\n",
" <td> 2001</td>\n",
" <td> Ohio</td>\n",
" <td> 1.7</td>\n",
" <td>-1.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 2002</td>\n",
" <td> Ohio</td>\n",
" <td> 3.6</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>four</th>\n",
" <td> 2001</td>\n",
" <td> Nevada</td>\n",
" <td> 2.4</td>\n",
" <td>-1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>five</th>\n",
" <td> 2002</td>\n",
" <td> Nevada</td>\n",
" <td> 2.9</td>\n",
" <td>-1.7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 36,
"text": [
" year state pop debt\n",
"one 2000 Ohio 1.5 NaN\n",
"two 2001 Ohio 1.7 -1.2\n",
"three 2002 Ohio 3.6 NaN\n",
"four 2001 Nevada 2.4 -1.5\n",
"five 2002 Nevada 2.9 -1.7"
]
}
],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pop = {'Nevada': {2001: 2.4, 2002: 2.9},\n",
" 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}\n",
"frame3 = DataFrame(pop)\n",
"frame3"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Nevada</th>\n",
" <th>Ohio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2000</th>\n",
" <td> NaN</td>\n",
" <td> 1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2001</th>\n",
" <td> 2.4</td>\n",
" <td> 1.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2002</th>\n",
" <td> 2.9</td>\n",
" <td> 3.6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 205,
"text": [
" Nevada Ohio\n",
"2000 NaN 1.5\n",
"2001 2.4 1.7\n",
"2002 2.9 3.6"
]
}
],
"prompt_number": 205
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame3.T"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>2000</th>\n",
" <th>2001</th>\n",
" <th>2002</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Nevada</th>\n",
" <td> NaN</td>\n",
" <td> 2.4</td>\n",
" <td> 2.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> 1.5</td>\n",
" <td> 1.7</td>\n",
" <td> 3.6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 38,
"text": [
" 2000 2001 2002\n",
"Nevada NaN 2.4 2.9\n",
"Ohio 1.5 1.7 3.6"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"DataFrame(pop, index=[2001,2002,2003])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Nevada</th>\n",
" <th>Ohio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2001</th>\n",
" <td> 2.4</td>\n",
" <td> 1.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2002</th>\n",
" <td> 2.9</td>\n",
" <td> 3.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003</th>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 39,
"text": [
" Nevada Ohio\n",
"2001 2.4 1.7\n",
"2002 2.9 3.6\n",
"2003 NaN NaN"
]
}
],
"prompt_number": 39
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pdata = {'Ohio': frame3['Ohio'][:-1],\n",
" 'Nevada': frame3['Nevada'][:2]}\n",
"DataFrame(pdata)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Nevada</th>\n",
" <th>Ohio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2000</th>\n",
" <td> NaN</td>\n",
" <td> 1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2001</th>\n",
" <td> 2.4</td>\n",
" <td> 1.7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 40,
"text": [
" Nevada Ohio\n",
"2000 NaN 1.5\n",
"2001 2.4 1.7"
]
}
],
"prompt_number": 40
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame3.index.name = 'year'\n",
"frame3.columns.name = 'state'\n",
"frame3"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>state</th>\n",
" <th>Nevada</th>\n",
" <th>Ohio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>year</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2000</th>\n",
" <td> NaN</td>\n",
" <td> 1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2001</th>\n",
" <td> 2.4</td>\n",
" <td> 1.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2002</th>\n",
" <td> 2.9</td>\n",
" <td> 3.6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 41,
"text": [
"state Nevada Ohio\n",
"year \n",
"2000 NaN 1.5\n",
"2001 2.4 1.7\n",
"2002 2.9 3.6"
]
}
],
"prompt_number": 41
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame3.values"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 208,
"text": [
"array([[ nan, 1.5],\n",
" [ 2.4, 1.7],\n",
" [ 2.9, 3.6]])"
]
}
],
"prompt_number": 208
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame2.values"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 43,
"text": [
"array([[2000, Ohio, 1.5, nan],\n",
" [2001, Ohio, 1.7, -1.2],\n",
" [2002, Ohio, 3.6, nan],\n",
" [2001, Nevada, 2.4, -1.5],\n",
" [2002, Nevada, 2.9, -1.7]], dtype=object)"
]
}
],
"prompt_number": 43
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Index Objects\n",
"\n",
"* pandas\u2019s Index objects are responsible for holding the axis labels and other metadata (like the axis name or names).\n",
"* Any array or other sequence of labels used when constructing a Series or DataFrame is internally converted to an Index:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series(range(3), index=['a', 'b', 'c'])\n",
"obj"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 209,
"text": [
"a 0\n",
"b 1\n",
"c 2\n",
"dtype: int64"
]
}
],
"prompt_number": 209
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index = obj.index\n",
"index"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 210,
"text": [
"Index([a, b, c], dtype=object)"
]
}
],
"prompt_number": 210
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index[1:]\n",
"\n",
"#index=[1,2,3]\n",
"#index\n",
"#obj.index = index\n",
"#obj.index"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 217,
"text": [
"Int64Index([1, 2, 3], dtype=int64)"
]
}
],
"prompt_number": 217
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index[1] = 'd'"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "Exception",
"evalue": "<class 'pandas.core.index.Index'> object is immutable",
"output_type": "pyerr",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-212-676fdeb26a68>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mindex\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'd'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Library/Python/2.7/site-packages/pandas/core/index.pyc\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 368\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 369\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' object is immutable'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 370\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 371\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mException\u001b[0m: <class 'pandas.core.index.Index'> object is immutable"
]
}
],
"prompt_number": 212
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Immutability is important so that Index objects can be safely shared among data structures:\n",
"index = pd.Index(np.arange(3))\n",
"obj2 = Series([1.5, -2.5, 0], index=index)\n",
"obj2 "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 48,
"text": [
"0 1.5\n",
"1 -2.5\n",
"2 0.0\n",
"dtype: float64"
]
}
],
"prompt_number": 48
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj2.index is index"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 219,
"text": [
"False"
]
}
],
"prompt_number": 219
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dir(frame3.index)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 50,
"text": [
"['T',\n",
" '__abs__',\n",
" '__add__',\n",
" '__and__',\n",
" '__array__',\n",
" '__array_finalize__',\n",
" '__array_interface__',\n",
" '__array_prepare__',\n",
" '__array_priority__',\n",
" '__array_struct__',\n",
" '__array_wrap__',\n",
" '__bytes__',\n",
" '__class__',\n",
" '__contains__',\n",
" '__copy__',\n",
" '__deepcopy__',\n",
" '__delattr__',\n",
" '__delitem__',\n",
" '__delslice__',\n",
" '__dict__',\n",
" '__div__',\n",
" '__divmod__',\n",
" '__doc__',\n",
" '__eq__',\n",
" '__float__',\n",
" '__floordiv__',\n",
" '__format__',\n",
" '__ge__',\n",
" '__getattribute__',\n",
" '__getitem__',\n",
" '__getslice__',\n",
" '__gt__',\n",
" '__hash__',\n",
" '__hex__',\n",
" '__iadd__',\n",
" '__iand__',\n",
" '__idiv__',\n",
" '__ifloordiv__',\n",
" '__ilshift__',\n",
" '__imod__',\n",
" '__imul__',\n",
" '__index__',\n",
" '__init__',\n",
" '__int__',\n",
" '__invert__',\n",
" '__ior__',\n",
" '__ipow__',\n",
" '__irshift__',\n",
" '__isub__',\n",
" '__iter__',\n",
" '__itruediv__',\n",
" '__ixor__',\n",
" '__le__',\n",
" '__len__',\n",
" '__long__',\n",
" '__lshift__',\n",
" '__lt__',\n",
" '__mod__',\n",
" '__module__',\n",
" '__mul__',\n",
" '__ne__',\n",
" '__neg__',\n",
" '__new__',\n",
" '__nonzero__',\n",
" '__oct__',\n",
" '__or__',\n",
" '__pos__',\n",
" '__pow__',\n",
" '__radd__',\n",
" '__rand__',\n",
" '__rdiv__',\n",
" '__rdivmod__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__rfloordiv__',\n",
" '__rlshift__',\n",
" '__rmod__',\n",
" '__rmul__',\n",
" '__ror__',\n",
" '__rpow__',\n",
" '__rrshift__',\n",
" '__rshift__',\n",
" '__rsub__',\n",
" '__rtruediv__',\n",
" '__rxor__',\n",
" '__setattr__',\n",
" '__setitem__',\n",
" '__setslice__',\n",
" '__setstate__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__sub__',\n",
" '__subclasshook__',\n",
" '__truediv__',\n",
" '__unicode__',\n",
" '__xor__',\n",
" '_array_values',\n",
" '_arrmap',\n",
" '_assert_can_do_setop',\n",
" '_box_scalars',\n",
" '_cache',\n",
" '_cleanup',\n",
" '_constructor',\n",
" '_engine',\n",
" '_engine_type',\n",
" '_ensure_compat_concat',\n",
" '_format_native_types',\n",
" '_format_with_header',\n",
" '_get_duplicates',\n",
" '_get_level_number',\n",
" '_get_method',\n",
" '_get_names',\n",
" '_groupby',\n",
" '_has_complex_internals',\n",
" '_inner_indexer',\n",
" '_join_level',\n",
" '_join_monotonic',\n",
" '_join_non_unique',\n",
" '_join_precedence',\n",
" '_left_indexer',\n",
" '_left_indexer_unique',\n",
" '_mpl_repr',\n",
" '_outer_indexer',\n",
" '_possibly_promote',\n",
" '_set_names',\n",
" '_shallow_copy',\n",
" '_wrap_joined_index',\n",
" '_wrap_union_result',\n",
" 'all',\n",
" 'any',\n",
" 'append',\n",
" 'argmax',\n",
" 'argmin',\n",
" 'argsort',\n",
" 'asi8',\n",
" 'asof',\n",
" 'asof_locs',\n",
" 'astype',\n",
" 'base',\n",
" 'byteswap',\n",
" 'choose',\n",
" 'clip',\n",
" 'compress',\n",
" 'conj',\n",
" 'conjugate',\n",
" 'copy',\n",
" 'ctypes',\n",
" 'cumprod',\n",
" 'cumsum',\n",
" 'data',\n",
" 'delete',\n",
" 'diagonal',\n",
" 'diff',\n",
" 'dot',\n",
" 'drop',\n",
" 'dtype',\n",
" 'dump',\n",
" 'dumps',\n",
" 'equals',\n",
" 'fill',\n",
" 'flags',\n",
" 'flat',\n",
" 'flatten',\n",
" 'format',\n",
" 'get_duplicates',\n",
" 'get_indexer',\n",
" 'get_level_values',\n",
" 'get_loc',\n",
" 'get_value',\n",
" 'getfield',\n",
" 'groupby',\n",
" 'holds_integer',\n",
" 'imag',\n",
" 'inferred_type',\n",
" 'insert',\n",
" 'intersection',\n",
" 'is_all_dates',\n",
" 'is_lexsorted_for_tuple',\n",
" 'is_monotonic',\n",
" 'is_numeric',\n",
" 'is_type_compatible',\n",
" 'is_unique',\n",
" 'isin',\n",
" 'item',\n",
" 'itemset',\n",
" 'itemsize',\n",
" 'join',\n",
" 'map',\n",
" 'max',\n",
" 'mean',\n",
" 'min',\n",
" 'name',\n",
" 'names',\n",
" 'nbytes',\n",
" 'ndim',\n",
" 'newbyteorder',\n",
" 'nlevels',\n",
" 'nonzero',\n",
" 'order',\n",
" 'prod',\n",
" 'ptp',\n",
" 'put',\n",
" 'ravel',\n",
" 'real',\n",
" 'reindex',\n",
" 'repeat',\n",
" 'reshape',\n",
" 'resize',\n",
" 'round',\n",
" 'searchsorted',\n",
" 'set_value',\n",
" 'setasflat',\n",
" 'setfield',\n",
" 'setflags',\n",
" 'shape',\n",
" 'shift',\n",
" 'size',\n",
" 'slice_indexer',\n",
" 'slice_locs',\n",
" 'sort',\n",
" 'squeeze',\n",
" 'std',\n",
" 'strides',\n",
" 'sum',\n",
" 'summary',\n",
" 'swapaxes',\n",
" 'take',\n",
" 'to_datetime',\n",
" 'to_native_types',\n",
" 'to_series',\n",
" 'tofile',\n",
" 'tolist',\n",
" 'tostring',\n",
" 'trace',\n",
" 'transpose',\n",
" 'union',\n",
" 'unique',\n",
" 'values',\n",
" 'var',\n",
" 'view']"
]
}
],
"prompt_number": 50
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Essential Functionality\n",
"\n",
"### Reindexing\n",
"\n",
"A critical method on pandas objects is reindex, which means to create a new object with the data conformed to a new index."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n",
"obj"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 51,
"text": [
"d 4.5\n",
"b 7.2\n",
"a -5.3\n",
"c 3.6\n",
"dtype: float64"
]
}
],
"prompt_number": 51
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])\n",
"obj2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 52,
"text": [
"a -5.3\n",
"b 7.2\n",
"c 3.6\n",
"d 4.5\n",
"e NaN\n",
"dtype: float64"
]
}
],
"prompt_number": 52
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 53,
"text": [
"a -5.3\n",
"b 7.2\n",
"c 3.6\n",
"d 4.5\n",
"e 0.0\n",
"dtype: float64"
]
}
],
"prompt_number": 53
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 5])\n",
"obj3"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 220,
"text": [
"0 blue\n",
"2 purple\n",
"5 yellow\n",
"dtype: object"
]
}
],
"prompt_number": 220
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj3.reindex(range(6), method='ffill')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 221,
"text": [
"0 blue\n",
"1 blue\n",
"2 purple\n",
"3 purple\n",
"4 purple\n",
"5 yellow\n",
"dtype: object"
]
}
],
"prompt_number": 221
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],\n",
" columns=['Ohio', 'Texas', 'California'])\n",
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Ohio</th>\n",
" <th>Texas</th>\n",
" <th>California</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 223,
"text": [
" Ohio Texas California\n",
"a 0 1 2\n",
"c 3 4 5\n",
"d 6 7 8"
]
}
],
"prompt_number": 223
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# With DataFrame, reindex can alter either the (row) index, columns, or both. \n",
"# When passed just a sequence, the rows are reindexed in the result:\n",
"\n",
"frame2 = frame.reindex(['a', 'b', 'c', 'd'])\n",
"frame2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Ohio</th>\n",
" <th>Texas</th>\n",
" <th>California</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 226,
"text": [
" Ohio Texas California\n",
"a 0 1 2\n",
"b NaN NaN NaN\n",
"c 3 4 5\n",
"d 6 7 8"
]
}
],
"prompt_number": 226
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"states = ['Texas', 'Utah', 'California']\n",
"frame.reindex(columns=states)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Texas</th>\n",
" <th>Utah</th>\n",
" <th>California</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td> 1</td>\n",
" <td>NaN</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td> 4</td>\n",
" <td>NaN</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td> 7</td>\n",
" <td>NaN</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 60,
"text": [
" Texas Utah California\n",
"a 1 NaN 2\n",
"c 4 NaN 5\n",
"d 7 NaN 8"
]
}
],
"prompt_number": 60
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill', columns=states)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Texas</th>\n",
" <th>Utah</th>\n",
" <th>California</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td> 1</td>\n",
" <td>NaN</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td> 1</td>\n",
" <td>NaN</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td> 4</td>\n",
" <td>NaN</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td> 7</td>\n",
" <td>NaN</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 230,
"text": [
" Texas Utah California\n",
"a 1 NaN 2\n",
"b 1 NaN 2\n",
"c 4 NaN 5\n",
"d 7 NaN 8"
]
}
],
"prompt_number": 230
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Ohio</th>\n",
" <th>Texas</th>\n",
" <th>California</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 63,
"text": [
" Ohio Texas California\n",
"a 0 1 2\n",
"c 3 4 5\n",
"d 6 7 8"
]
}
],
"prompt_number": 63
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.ix[['a', 'b', 'c', 'd'], states]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Texas</th>\n",
" <th>Utah</th>\n",
" <th>California</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td> 1</td>\n",
" <td>NaN</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td> 4</td>\n",
" <td>NaN</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td> 7</td>\n",
" <td>NaN</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 66,
"text": [
" Texas Utah California\n",
"a 1 NaN 2\n",
"b NaN NaN NaN\n",
"c 4 NaN 5\n",
"d 7 NaN 8"
]
}
],
"prompt_number": 66
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dropping entries from an axis\n",
"\n",
"Dropping one or more entries from an axis is easy if you have an index array or list without those entries. As that can require a bit of munging and set logic, the drop method will return a new object with the indicated value or values deleted from an axis:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])\n",
"new_obj = obj.drop('c')\n",
"new_obj"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 232,
"text": [
"a 0\n",
"b 1\n",
"d 3\n",
"e 4\n",
"dtype: float64"
]
}
],
"prompt_number": 232
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.drop(['d', 'c'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 68,
"text": [
"a 0\n",
"b 1\n",
"e 4\n",
"dtype: float64"
]
}
],
"prompt_number": 68
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data = DataFrame(np.arange(16).reshape((4, 4)),\n",
" index=['Ohio', 'Colorado', 'Utah', 'New York'],\n",
" columns=['one', 'two', 'three', 'four'])\n",
"data.drop(['Colorado', 'Ohio'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" <th>three</th>\n",
" <th>four</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 8</td>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>New York</th>\n",
" <td> 12</td>\n",
" <td> 13</td>\n",
" <td> 14</td>\n",
" <td> 15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 234,
"text": [
" one two three four\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15"
]
}
],
"prompt_number": 234
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.drop('two', axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>three</th>\n",
" <th>four</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> 0</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Colorado</th>\n",
" <td> 4</td>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 8</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>New York</th>\n",
" <td> 12</td>\n",
" <td> 14</td>\n",
" <td> 15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 70,
"text": [
" one three four\n",
"Ohio 0 2 3\n",
"Colorado 4 6 7\n",
"Utah 8 10 11\n",
"New York 12 14 15"
]
}
],
"prompt_number": 70
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Indexing, selection, and filtering\n",
"\n",
"Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series\u2019s index values instead of only integers. "
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])\n",
"obj"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 71,
"text": [
"a 0\n",
"b 1\n",
"c 2\n",
"d 3\n",
"dtype: float64"
]
}
],
"prompt_number": 71
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print obj['b']\n",
"print \n",
"print obj[1]\n",
"print \n",
"print type(obj[2])\n",
"print\n",
"print type(obj[2:4])\n",
"print \n",
"print obj[['b','a','d']]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1.0\n",
"\n",
"1.0\n",
"\n",
"<type 'numpy.float64'>\n",
"\n",
"<class 'pandas.core.series.Series'>\n",
"\n",
"b 1\n",
"a 0\n",
"d 3\n",
"dtype: float64\n"
]
}
],
"prompt_number": 236
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj['b':'c'] = 5\n",
"obj"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 74,
"text": [
"a 0\n",
"b 5\n",
"c 5\n",
"d 3\n",
"dtype: float64"
]
}
],
"prompt_number": 74
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data = DataFrame(np.arange(16).reshape((4, 4)),\n",
" index=['Ohio', 'Colorado', 'Utah', 'New York'],\n",
" columns=['one', 'two', 'three', 'four'])\n",
"data"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" <th>three</th>\n",
" <th>four</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Colorado</th>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 8</td>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>New York</th>\n",
" <td> 12</td>\n",
" <td> 13</td>\n",
" <td> 14</td>\n",
" <td> 15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 91,
"text": [
" one two three four\n",
"Ohio 0 1 2 3\n",
"Colorado 4 5 6 7\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15"
]
}
],
"prompt_number": 91
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print data[:2]\n",
"print \n",
"print data[data['three'] > 5]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" one two three four\n",
"Ohio 0 1 2 3\n",
"Colorado 4 5 6 7\n",
"\n",
" one two three four\n",
"Colorado 4 5 6 7\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15\n"
]
}
],
"prompt_number": 92
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data < 5"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" <th>three</th>\n",
" <th>four</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> True</td>\n",
" <td> True</td>\n",
" <td> True</td>\n",
" <td> True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Colorado</th>\n",
" <td> True</td>\n",
" <td> False</td>\n",
" <td> False</td>\n",
" <td> False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> False</td>\n",
" <td> False</td>\n",
" <td> False</td>\n",
" <td> False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>New York</th>\n",
" <td> False</td>\n",
" <td> False</td>\n",
" <td> False</td>\n",
" <td> False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 238,
"text": [
" one two three four\n",
"Ohio True True True True\n",
"Colorado True False False False\n",
"Utah False False False False\n",
"New York False False False False"
]
}
],
"prompt_number": 238
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data[data < 5] = 0\n",
"data"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" <th>three</th>\n",
" <th>four</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Colorado</th>\n",
" <td> 0</td>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 8</td>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>New York</th>\n",
" <td> 12</td>\n",
" <td> 13</td>\n",
" <td> 14</td>\n",
" <td> 15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 94,
"text": [
" one two three four\n",
"Ohio 0 0 0 0\n",
"Colorado 0 5 6 7\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15"
]
}
],
"prompt_number": 94
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.ix['Colorado', ['two', 'three']]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 240,
"text": [
"two 5\n",
"three 6\n",
"Name: Colorado, dtype: int64"
]
}
],
"prompt_number": 240
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.ix[['Colorado', 'Utah'], [3, 0, 1]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>four</th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Colorado</th>\n",
" <td> 7</td>\n",
" <td> 0</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 11</td>\n",
" <td> 8</td>\n",
" <td> 9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 96,
"text": [
" four one two\n",
"Colorado 7 0 5\n",
"Utah 11 8 9"
]
}
],
"prompt_number": 96
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print data.ix[2]\n",
"print \n",
"print data.ix[:'Utah', 'two']\n",
"print \n",
"print data.ix[data.three > 5, :3]\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"one 8\n",
"two 9\n",
"three 10\n",
"four 11\n",
"Name: Utah, dtype: int64\n",
"\n",
"Ohio 0\n",
"Colorado 5\n",
"Utah 9\n",
"Name: two, dtype: int64\n",
"\n",
" one two three\n",
"Colorado 0 5 6\n",
"Utah 8 9 10\n",
"New York 12 13 14\n"
]
}
],
"prompt_number": 97
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Arithmetic and data alignment\n",
"\n",
"One of the most important pandas features is the behavior of arithmetic between objects with different indexes. When adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])\n",
"s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])\n",
"s1+s2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 241,
"text": [
"a 5.2\n",
"c 1.1\n",
"d NaN\n",
"e 0.0\n",
"f NaN\n",
"g NaN\n",
"dtype: float64"
]
}
],
"prompt_number": 241
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Arithmetic methods with fill values\n",
"\n",
"df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])\n",
"df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n",
"print df1\n",
"print \n",
"print df2\n",
"print \n",
"print df1+df2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" b c d\n",
"Ohio 0 1 2\n",
"Texas 3 4 5\n",
"Colorado 6 7 8\n",
"\n",
" b d e\n",
"Utah 0 1 2\n",
"Ohio 3 4 5\n",
"Texas 6 7 8\n",
"Oregon 9 10 11\n",
"\n",
" b c d e\n",
"Colorado NaN NaN NaN NaN\n",
"Ohio 3 NaN 6 NaN\n",
"Oregon NaN NaN NaN NaN\n",
"Texas 9 NaN 12 NaN\n",
"Utah NaN NaN NaN NaN\n"
]
}
],
"prompt_number": 242
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd')) \n",
"df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))\n",
"print df1\n",
"print \n",
"print df2\n",
"print \n",
"print df1 + df2 "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" a b c d\n",
"0 0 1 2 3\n",
"1 4 5 6 7\n",
"2 8 9 10 11\n",
"\n",
" a b c d e\n",
"0 0 1 2 3 4\n",
"1 5 6 7 8 9\n",
"2 10 11 12 13 14\n",
"3 15 16 17 18 19\n",
"\n",
" a b c d e\n",
"0 0 2 4 6 NaN\n",
"1 9 11 13 15 NaN\n",
"2 18 20 22 24 NaN\n",
"3 NaN NaN NaN NaN NaN\n"
]
}
],
"prompt_number": 243
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df1.add(df2, fill_value=0)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" <td> 6</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 9</td>\n",
" <td> 11</td>\n",
" <td> 13</td>\n",
" <td> 15</td>\n",
" <td> 9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 18</td>\n",
" <td> 20</td>\n",
" <td> 22</td>\n",
" <td> 24</td>\n",
" <td> 14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 15</td>\n",
" <td> 16</td>\n",
" <td> 17</td>\n",
" <td> 18</td>\n",
" <td> 19</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 104,
"text": [
" a b c d e\n",
"0 0 2 4 6 4\n",
"1 9 11 13 15 9\n",
"2 18 20 22 24 14\n",
"3 15 16 17 18 19"
]
}
],
"prompt_number": 104
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df1.reindex(columns=df2.columns, fill_value=0)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 8</td>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 105,
"text": [
" a b c d e\n",
"0 0 1 2 3 0\n",
"1 4 5 6 7 0\n",
"2 8 9 10 11 0"
]
}
],
"prompt_number": 105
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Operations between DataFrame and Series\n",
"\n",
"arr = np.arange(12.).reshape((3, 4))\n",
"print arr\n",
"print \n",
"print arr[0]\n",
"print \n",
"print arr - arr[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[[ 0. 1. 2. 3.]\n",
" [ 4. 5. 6. 7.]\n",
" [ 8. 9. 10. 11.]]\n",
"\n",
"[ 0. 1. 2. 3.]\n",
"\n",
"[[ 0. 0. 0. 0.]\n",
" [ 4. 4. 4. 4.]\n",
" [ 8. 8. 8. 8.]]\n"
]
}
],
"prompt_number": 108
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n",
" index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n",
"\n",
"series = frame.ix[0]\n",
"\n",
"print frame\n",
"print \n",
"print series \n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" b d e\n",
"Utah 0 1 2\n",
"Ohio 3 4 5\n",
"Texas 6 7 8\n",
"Oregon 9 10 11\n",
"\n",
"b 0\n",
"d 1\n",
"e 2\n",
"Name: Utah, dtype: float64\n"
]
}
],
"prompt_number": 111
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame - series"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>b</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> 3</td>\n",
" <td> 3</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Texas</th>\n",
" <td> 6</td>\n",
" <td> 6</td>\n",
" <td> 6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Oregon</th>\n",
" <td> 9</td>\n",
" <td> 9</td>\n",
" <td> 9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 112,
"text": [
" b d e\n",
"Utah 0 0 0\n",
"Ohio 3 3 3\n",
"Texas 6 6 6\n",
"Oregon 9 9 9"
]
}
],
"prompt_number": 112
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"series2 = Series(range(3), index=['b', 'e', 'f'])\n",
"frame + series2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>b</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" <th>f</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 0</td>\n",
" <td>NaN</td>\n",
" <td> 3</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> 3</td>\n",
" <td>NaN</td>\n",
" <td> 6</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Texas</th>\n",
" <td> 6</td>\n",
" <td>NaN</td>\n",
" <td> 9</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Oregon</th>\n",
" <td> 9</td>\n",
" <td>NaN</td>\n",
" <td> 12</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 113,
"text": [
" b d e f\n",
"Utah 0 NaN 3 NaN\n",
"Ohio 3 NaN 6 NaN\n",
"Texas 6 NaN 9 NaN\n",
"Oregon 9 NaN 12 NaN"
]
}
],
"prompt_number": 113
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"series3 = frame['d']\n",
"series3"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 114,
"text": [
"Utah 1\n",
"Ohio 4\n",
"Texas 7\n",
"Oregon 10\n",
"Name: d, dtype: float64"
]
}
],
"prompt_number": 114
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.sub(series3, axis=0)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>b</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td>-1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td>-1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Texas</th>\n",
" <td>-1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Oregon</th>\n",
" <td>-1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 115,
"text": [
" b d e\n",
"Utah -1 0 1\n",
"Ohio -1 0 1\n",
"Texas -1 0 1\n",
"Oregon -1 0 1"
]
}
],
"prompt_number": 115
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Function application and mapping\n",
"\n",
"NumPy ufuncs (element-wise array methods) work fine with pandas objects:\n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),\n",
" index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n",
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>b</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 0.531372</td>\n",
" <td> 0.016855</td>\n",
" <td> 0.006387</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> 0.253249</td>\n",
" <td> 0.679284</td>\n",
" <td> 1.249851</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Texas</th>\n",
" <td> 1.393670</td>\n",
" <td> 0.544011</td>\n",
" <td>-0.598251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Oregon</th>\n",
" <td>-1.405705</td>\n",
" <td> 0.481606</td>\n",
" <td> 0.755763</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 116,
"text": [
" b d e\n",
"Utah 0.531372 0.016855 0.006387\n",
"Ohio 0.253249 0.679284 1.249851\n",
"Texas 1.393670 0.544011 -0.598251\n",
"Oregon -1.405705 0.481606 0.755763"
]
}
],
"prompt_number": 116
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"np.abs(frame)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>b</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 0.531372</td>\n",
" <td> 0.016855</td>\n",
" <td> 0.006387</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> 0.253249</td>\n",
" <td> 0.679284</td>\n",
" <td> 1.249851</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Texas</th>\n",
" <td> 1.393670</td>\n",
" <td> 0.544011</td>\n",
" <td> 0.598251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Oregon</th>\n",
" <td> 1.405705</td>\n",
" <td> 0.481606</td>\n",
" <td> 0.755763</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 117,
"text": [
" b d e\n",
"Utah 0.531372 0.016855 0.006387\n",
"Ohio 0.253249 0.679284 1.249851\n",
"Texas 1.393670 0.544011 0.598251\n",
"Oregon 1.405705 0.481606 0.755763"
]
}
],
"prompt_number": 117
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"f = lambda x: x.max() - x.min()\n",
"\n",
"frame.apply(f)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 118,
"text": [
"b 2.799376\n",
"d 0.662429\n",
"e 1.848102\n",
"dtype: float64"
]
}
],
"prompt_number": 118
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.apply(f, axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 119,
"text": [
"Utah 0.524986\n",
"Ohio 0.996602\n",
"Texas 1.991921\n",
"Oregon 2.161469\n",
"dtype: float64"
]
}
],
"prompt_number": 119
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def f(x):\n",
" return Series([x.min(), x.max()], index=['min', 'max'])\n",
"frame.apply(f)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>b</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-1.405705</td>\n",
" <td> 0.016855</td>\n",
" <td>-0.598251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td> 1.393670</td>\n",
" <td> 0.679284</td>\n",
" <td> 1.249851</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 124,
"text": [
" b d e\n",
"min -1.405705 0.016855 -0.598251\n",
"max 1.393670 0.679284 1.249851"
]
}
],
"prompt_number": 124
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"format = lambda x: '%.2f' % x\n",
"frame.applymap(format)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>b</th>\n",
" <th>d</th>\n",
" <th>e</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Utah</th>\n",
" <td> 0.53</td>\n",
" <td> 0.02</td>\n",
" <td> 0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ohio</th>\n",
" <td> 0.25</td>\n",
" <td> 0.68</td>\n",
" <td> 1.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Texas</th>\n",
" <td> 1.39</td>\n",
" <td> 0.54</td>\n",
" <td> -0.60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Oregon</th>\n",
" <td> -1.41</td>\n",
" <td> 0.48</td>\n",
" <td> 0.76</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 125,
"text": [
" b d e\n",
"Utah 0.53 0.02 0.01\n",
"Ohio 0.25 0.68 1.25\n",
"Texas 1.39 0.54 -0.60\n",
"Oregon -1.41 0.48 0.76"
]
}
],
"prompt_number": 125
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame['e'].map(format)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 126,
"text": [
"Utah 0.01\n",
"Ohio 1.25\n",
"Texas -0.60\n",
"Oregon 0.76\n",
"Name: e, dtype: object"
]
}
],
"prompt_number": 126
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sorting and ranking \n",
"\n",
"Sorting a data set by some criterion is another important built-in operation. To sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series(range(4), index=['d', 'a', 'b', 'c'])\n",
"obj.sort_index()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 127,
"text": [
"a 1\n",
"b 2\n",
"c 3\n",
"d 0\n",
"dtype: int64"
]
}
],
"prompt_number": 127
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame = DataFrame(np.arange(8).reshape((2, 4)), \n",
" index=['three', 'one'], \n",
" columns=['d', 'a', 'b', 'c'])\n",
"\n",
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>d</th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 137,
"text": [
" d a b c\n",
"three 0 1 2 3\n",
"one 4 5 6 7"
]
}
],
"prompt_number": 137
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.sort_index()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>d</th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 138,
"text": [
" d a b c\n",
"one 4 5 6 7\n",
"three 0 1 2 3"
]
}
],
"prompt_number": 138
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.sort_index(axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 139,
"text": [
" a b c d\n",
"three 1 2 3 0\n",
"one 5 6 7 4"
]
}
],
"prompt_number": 139
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.sort_index(axis=0)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>d</th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 140,
"text": [
" d a b c\n",
"one 4 5 6 7\n",
"three 0 1 2 3"
]
}
],
"prompt_number": 140
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.sort_index(axis=2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "No axis named 2",
"output_type": "pyerr",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-141-c7a53c74c9ed>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Library/Python/2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36msort_index\u001b[0;34m(self, axis, by, ascending, inplace)\u001b[0m\n\u001b[1;32m 3126\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0m_lexsort_indexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3128\u001b[0;31m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis_number\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3129\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3130\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Axis must be 0 or 1, got %s'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Library/Python/2.7/site-packages/pandas/core/generic.pyc\u001b[0m in \u001b[0;36m_get_axis_number\u001b[0;34m(self, axis)\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'No axis named %s'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_axis_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: No axis named 2"
]
}
],
"prompt_number": 141
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.sort_index(axis=1, ascending=False)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>d</th>\n",
" <th>c</th>\n",
" <th>b</th>\n",
" <th>a</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>three</th>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> 2</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>one</th>\n",
" <td> 4</td>\n",
" <td> 7</td>\n",
" <td> 6</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 142,
"text": [
" d c b a\n",
"three 0 3 2 1\n",
"one 4 7 6 5"
]
}
],
"prompt_number": 142
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series([4, 7, -3, 2])\n",
"obj"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 143,
"text": [
"0 4\n",
"1 7\n",
"2 -3\n",
"3 2\n",
"dtype: int64"
]
}
],
"prompt_number": 143
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# To sort a Series by its values, use its order method:\n",
"obj.order()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 144,
"text": [
"2 -3\n",
"3 2\n",
"0 4\n",
"1 7\n",
"dtype: int64"
]
}
],
"prompt_number": 144
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Any missing values are sorted to the end of the Series by default:\n",
"obj = Series([4, np.nan, 7, np.nan, -3, 2])\n",
"obj.order()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 145,
"text": [
"4 -3\n",
"5 2\n",
"0 4\n",
"2 7\n",
"1 NaN\n",
"3 NaN\n",
"dtype: float64"
]
}
],
"prompt_number": 145
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# you may want to sort by the values in one or more columns. \n",
"frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})\n",
"\n",
"print frame.sort_index(ascending=False)\n",
"print \n",
"print frame.sort_index(by='b')\n",
"print \n",
"print frame.sort_index(by=['a','b'])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" a b\n",
"3 1 2\n",
"2 0 -3\n",
"1 1 7\n",
"0 0 4\n",
"\n",
" a b\n",
"2 0 -3\n",
"3 1 2\n",
"0 0 4\n",
"1 1 7\n",
"\n",
" a b\n",
"2 0 -3\n",
"0 0 4\n",
"3 1 2\n",
"1 1 7\n"
]
}
],
"prompt_number": 149
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"__Ranking__ is closely related to sorting, assigning ranks from one through the number of valid data points in an array. It is similar to the indirect sort indices produced by __numpy.argsort__, except that ties are broken according to a rule. The __rank__ methods for Series and DataFrame are the place to look; by default __rank__ breaks ties by assigning each group the mean rank:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series([7, -5, 7, 4, 2, 0, 4])\n",
"obj"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 150,
"text": [
"0 7\n",
"1 -5\n",
"2 7\n",
"3 4\n",
"4 2\n",
"5 0\n",
"6 4\n",
"dtype: int64"
]
}
],
"prompt_number": 150
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.rank()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 151,
"text": [
"0 6.5\n",
"1 1.0\n",
"2 6.5\n",
"3 4.5\n",
"4 3.0\n",
"5 2.0\n",
"6 4.5\n",
"dtype: float64"
]
}
],
"prompt_number": 151
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.order()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 152,
"text": [
"1 -5\n",
"5 0\n",
"4 2\n",
"3 4\n",
"6 4\n",
"0 7\n",
"2 7\n",
"dtype: int64"
]
}
],
"prompt_number": 152
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.rank(method='first')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 153,
"text": [
"0 6\n",
"1 1\n",
"2 7\n",
"3 4\n",
"4 3\n",
"5 2\n",
"6 5\n",
"dtype: float64"
]
}
],
"prompt_number": 153
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.rank(ascending=False, method='max')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 157,
"text": [
"0 2\n",
"1 7\n",
"2 2\n",
"3 4\n",
"4 5\n",
"5 6\n",
"6 4\n",
"dtype: float64"
]
}
],
"prompt_number": 157
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], \n",
" 'c': [-2, 5, 8, -2.5]})\n",
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td> 4.3</td>\n",
" <td>-2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 7.0</td>\n",
" <td> 5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 0</td>\n",
" <td>-3.0</td>\n",
" <td> 8.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 1</td>\n",
" <td> 2.0</td>\n",
" <td>-2.5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 158,
"text": [
" a b c\n",
"0 0 4.3 -2.0\n",
"1 1 7.0 5.0\n",
"2 0 -3.0 8.0\n",
"3 1 2.0 -2.5"
]
}
],
"prompt_number": 158
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.rank(axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 2</td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 159,
"text": [
" a b c\n",
"0 2 3 1\n",
"1 1 3 2\n",
"2 2 1 3\n",
"3 2 3 1"
]
}
],
"prompt_number": 159
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Axis indexes with duplicate values\n",
"\n",
"Up until now all of the examples I\u2019ve showed you have had unique axis labels (index values). While many pandas functions (like reindex) require that the labels be unique, it\u2019s not mandatory. Let\u2019s consider a small Series with duplicate indices:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n",
"obj"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 160,
"text": [
"a 0\n",
"a 1\n",
"b 2\n",
"b 3\n",
"c 4\n",
"dtype: int64"
]
}
],
"prompt_number": 160
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.index.is_unique"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 161,
"text": [
"False"
]
}
],
"prompt_number": 161
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj['a']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 162,
"text": [
"a 0\n",
"a 1\n",
"dtype: int64"
]
}
],
"prompt_number": 162
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])\n",
"df"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td> 0.659189</td>\n",
" <td> 2.247419</td>\n",
" <td> 0.317194</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>-0.812310</td>\n",
" <td>-0.311062</td>\n",
" <td> 1.263877</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>-0.078266</td>\n",
" <td>-0.226042</td>\n",
" <td>-0.265894</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>-0.972430</td>\n",
" <td> 0.833500</td>\n",
" <td>-0.604137</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 163,
"text": [
" 0 1 2\n",
"a 0.659189 2.247419 0.317194\n",
"a -0.812310 -0.311062 1.263877\n",
"b -0.078266 -0.226042 -0.265894\n",
"b -0.972430 0.833500 -0.604137"
]
}
],
"prompt_number": 163
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.ix['b']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>-0.078266</td>\n",
" <td>-0.226042</td>\n",
" <td>-0.265894</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>-0.972430</td>\n",
" <td> 0.833500</td>\n",
" <td>-0.604137</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 164,
"text": [
" 0 1 2\n",
"b -0.078266 -0.226042 -0.265894\n",
"b -0.972430 0.833500 -0.604137"
]
}
],
"prompt_number": 164
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summarizing and Computing Descriptive Statistics \n",
"\n",
"pandas objects are equipped with a set of common mathematical and statistical meth- ods. Most of these fall into the category of reductions or summary statistics, methods that extract a single value (like the sum or mean) from a Series or a Series of values from the rows or columns of a DataFrame. Compared with the equivalent methods of vanilla NumPy arrays, they are all built from the ground up to exclude missing data. Consider a small DataFrame:\n",
"\n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],\n",
" index=['a', 'b', 'c', 'd'], columns=['one', 'two'])\n",
"df"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td> 1.40</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td> 7.10</td>\n",
" <td>-4.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td> 0.75</td>\n",
" <td>-1.3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 6,
"text": [
" one two\n",
"a 1.40 NaN\n",
"b 7.10 -4.5\n",
"c NaN NaN\n",
"d 0.75 -1.3"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.sum()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 7,
"text": [
"one 9.25\n",
"two -5.80\n",
"dtype: float64"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.sum(axis=1)\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 11,
"text": [
"a 1.40\n",
"b 2.60\n",
"c NaN\n",
"d -0.55\n",
"dtype: float64"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.sum(axis=1, skipna=False)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 12,
"text": [
"a NaN\n",
"b 2.60\n",
"c NaN\n",
"d -0.55\n",
"dtype: float64"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.cumsum()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td> 1.40</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td> 8.50</td>\n",
" <td>-4.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td> 9.25</td>\n",
" <td>-5.8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 13,
"text": [
" one two\n",
"a 1.40 NaN\n",
"b 8.50 -4.5\n",
"c NaN NaN\n",
"d 9.25 -5.8"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td> 3.000000</td>\n",
" <td> 2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td> 3.083333</td>\n",
" <td>-2.900000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td> 3.493685</td>\n",
" <td> 2.262742</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td> 0.750000</td>\n",
" <td>-4.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td> 1.075000</td>\n",
" <td>-3.700000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td> 1.400000</td>\n",
" <td>-2.900000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td> 4.250000</td>\n",
" <td>-2.100000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td> 7.100000</td>\n",
" <td>-1.300000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 14,
"text": [
" one two\n",
"count 3.000000 2.000000\n",
"mean 3.083333 -2.900000\n",
"std 3.493685 2.262742\n",
"min 0.750000 -4.500000\n",
"25% 1.075000 -3.700000\n",
"50% 1.400000 -2.900000\n",
"75% 4.250000 -2.100000\n",
"max 7.100000 -1.300000"
]
}
],
"prompt_number": 14
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Correlation and Covariance"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas.io.data as web\n",
"all_data = {}\n",
"for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:\n",
" all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')\n",
"\n",
"price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})\n",
"volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"price"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<pre>\n",
"&ltclass 'pandas.core.frame.DataFrame'&gt\n",
"DatetimeIndex: 2515 entries, 2000-01-03 00:00:00 to 2009-12-31 00:00:00\n",
"Data columns (total 4 columns):\n",
"AAPL 2515 non-null values\n",
"GOOG 1353 non-null values\n",
"IBM 2515 non-null values\n",
"MSFT 2515 non-null values\n",
"dtypes: float64(4)\n",
"</pre>"
],
"output_type": "pyout",
"prompt_number": 16,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 2515 entries, 2000-01-03 00:00:00 to 2009-12-31 00:00:00\n",
"Data columns (total 4 columns):\n",
"AAPL 2515 non-null values\n",
"GOOG 1353 non-null values\n",
"IBM 2515 non-null values\n",
"MSFT 2515 non-null values\n",
"dtypes: float64(4)"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"volume"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<pre>\n",
"&ltclass 'pandas.core.frame.DataFrame'&gt\n",
"DatetimeIndex: 2515 entries, 2000-01-03 00:00:00 to 2009-12-31 00:00:00\n",
"Data columns (total 4 columns):\n",
"AAPL 2515 non-null values\n",
"GOOG 1353 non-null values\n",
"IBM 2515 non-null values\n",
"MSFT 2515 non-null values\n",
"dtypes: float64(1), int64(3)\n",
"</pre>"
],
"output_type": "pyout",
"prompt_number": 18,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 2515 entries, 2000-01-03 00:00:00 to 2009-12-31 00:00:00\n",
"Data columns (total 4 columns):\n",
"AAPL 2515 non-null values\n",
"GOOG 1353 non-null values\n",
"IBM 2515 non-null values\n",
"MSFT 2515 non-null values\n",
"dtypes: float64(1), int64(3)"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"returns = price.pct_change()\n",
"returns.tail()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AAPL</th>\n",
" <th>GOOG</th>\n",
" <th>IBM</th>\n",
" <th>MSFT</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2009-12-24</th>\n",
" <td> 0.034342</td>\n",
" <td> 0.011117</td>\n",
" <td> 0.004439</td>\n",
" <td> 0.002495</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-12-28</th>\n",
" <td> 0.012297</td>\n",
" <td> 0.007098</td>\n",
" <td> 0.013257</td>\n",
" <td> 0.005688</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-12-29</th>\n",
" <td>-0.011856</td>\n",
" <td>-0.005571</td>\n",
" <td>-0.003473</td>\n",
" <td> 0.007070</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-12-30</th>\n",
" <td> 0.012146</td>\n",
" <td> 0.005376</td>\n",
" <td> 0.005511</td>\n",
" <td>-0.013689</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-12-31</th>\n",
" <td>-0.004275</td>\n",
" <td>-0.004416</td>\n",
" <td>-0.012574</td>\n",
" <td>-0.015658</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 20,
"text": [
" AAPL GOOG IBM MSFT\n",
"Date \n",
"2009-12-24 0.034342 0.011117 0.004439 0.002495\n",
"2009-12-28 0.012297 0.007098 0.013257 0.005688\n",
"2009-12-29 -0.011856 -0.005571 -0.003473 0.007070\n",
"2009-12-30 0.012146 0.005376 0.005511 -0.013689\n",
"2009-12-31 -0.004275 -0.004416 -0.012574 -0.015658"
]
}
],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"returns.MSFT.corr(returns.IBM)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 21,
"text": [
"0.49607155409807629"
]
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"returns.MSFT.cov(returns.IBM)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 22,
"text": [
"0.00021598877352704989"
]
}
],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"returns.corr()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AAPL</th>\n",
" <th>GOOG</th>\n",
" <th>IBM</th>\n",
" <th>MSFT</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>AAPL</th>\n",
" <td> 1.000000</td>\n",
" <td> 0.470740</td>\n",
" <td> 0.409959</td>\n",
" <td> 0.424209</td>\n",
" </tr>\n",
" <tr>\n",
" <th>GOOG</th>\n",
" <td> 0.470740</td>\n",
" <td> 1.000000</td>\n",
" <td> 0.390578</td>\n",
" <td> 0.443412</td>\n",
" </tr>\n",
" <tr>\n",
" <th>IBM</th>\n",
" <td> 0.409959</td>\n",
" <td> 0.390578</td>\n",
" <td> 1.000000</td>\n",
" <td> 0.496072</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MSFT</th>\n",
" <td> 0.424209</td>\n",
" <td> 0.443412</td>\n",
" <td> 0.496072</td>\n",
" <td> 1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 23,
"text": [
" AAPL GOOG IBM MSFT\n",
"AAPL 1.000000 0.470740 0.409959 0.424209\n",
"GOOG 0.470740 1.000000 0.390578 0.443412\n",
"IBM 0.409959 0.390578 1.000000 0.496072\n",
"MSFT 0.424209 0.443412 0.496072 1.000000"
]
}
],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"returns.cov()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AAPL</th>\n",
" <th>GOOG</th>\n",
" <th>IBM</th>\n",
" <th>MSFT</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>AAPL</th>\n",
" <td> 0.001028</td>\n",
" <td> 0.000303</td>\n",
" <td> 0.000252</td>\n",
" <td> 0.000309</td>\n",
" </tr>\n",
" <tr>\n",
" <th>GOOG</th>\n",
" <td> 0.000303</td>\n",
" <td> 0.000580</td>\n",
" <td> 0.000142</td>\n",
" <td> 0.000205</td>\n",
" </tr>\n",
" <tr>\n",
" <th>IBM</th>\n",
" <td> 0.000252</td>\n",
" <td> 0.000142</td>\n",
" <td> 0.000367</td>\n",
" <td> 0.000216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MSFT</th>\n",
" <td> 0.000309</td>\n",
" <td> 0.000205</td>\n",
" <td> 0.000216</td>\n",
" <td> 0.000516</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 24,
"text": [
" AAPL GOOG IBM MSFT\n",
"AAPL 0.001028 0.000303 0.000252 0.000309\n",
"GOOG 0.000303 0.000580 0.000142 0.000205\n",
"IBM 0.000252 0.000142 0.000367 0.000216\n",
"MSFT 0.000309 0.000205 0.000216 0.000516"
]
}
],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"returns.corrwith(returns.IBM)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 25,
"text": [
"AAPL 0.409959\n",
"GOOG 0.390578\n",
"IBM 1.000000\n",
"MSFT 0.496072\n",
"dtype: float64"
]
}
],
"prompt_number": 25
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Unique Values, Value Counts, and Membership"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])\n",
"uniques = obj.unique()\n",
"uniques "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 26,
"text": [
"array([c, a, d, b], dtype=object)"
]
}
],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"obj.value_counts()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 27,
"text": [
"c 3\n",
"a 3\n",
"b 2\n",
"d 1\n",
"dtype: int64"
]
}
],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pd.value_counts(obj.values, sort=False)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 28,
"text": [
"a 3\n",
"c 3\n",
"b 2\n",
"d 1\n",
"dtype: int64"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pd.value_counts(obj.values, sort=True)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 29,
"text": [
"c 3\n",
"a 3\n",
"b 2\n",
"d 1\n",
"dtype: int64"
]
}
],
"prompt_number": 29
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mask = obj.isin(['b', 'c'])\n",
"mask"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 30,
"text": [
"0 True\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
"5 True\n",
"6 True\n",
"7 True\n",
"8 True\n",
"dtype: bool"
]
}
],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data = DataFrame({'Qu1': [1, 3, 4, 3, 4], \n",
" 'Qu2': [2, 3, 1, 2, 3],\n",
" 'Qu3': [1, 5, 2, 4, 4]})\n",
"data"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Qu1</th>\n",
" <th>Qu2</th>\n",
" <th>Qu3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 3</td>\n",
" <td> 3</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 4</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 3</td>\n",
" <td> 2</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 4</td>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 31,
"text": [
" Qu1 Qu2 Qu3\n",
"0 1 2 1\n",
"1 3 3 5\n",
"2 4 1 2\n",
"3 3 2 4\n",
"4 4 3 4"
]
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"result = data.apply(pd.value_counts)\n",
"result"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Qu1</th>\n",
" <th>Qu2</th>\n",
" <th>Qu3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td> 2</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 2</td>\n",
" <td>NaN</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 33,
"text": [
" Qu1 Qu2 Qu3\n",
"1 1 1 1\n",
"2 NaN 2 1\n",
"3 2 2 NaN\n",
"4 2 NaN 2\n",
"5 NaN NaN 1"
]
}
],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"result = data.apply(pd.value_counts).fillna(0)\n",
"result"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Qu1</th>\n",
" <th>Qu2</th>\n",
" <th>Qu3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 0</td>\n",
" <td> 2</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 2</td>\n",
" <td> 0</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 35,
"text": [
" Qu1 Qu2 Qu3\n",
"1 1 1 1\n",
"2 0 2 1\n",
"3 2 2 0\n",
"4 2 0 2\n",
"5 0 0 1"
]
}
],
"prompt_number": 35
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Handling Missing Data\n",
"\n",
"Missing data is common in most data analysis applications. One of the goals in de- signing pandas was to make working with missing data as painless as possible. For example, all of the descriptive statistics on pandas objects exclude missing data as you\u2019ve seen earlier in the chapter."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])\n",
"string_data"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 36,
"text": [
"0 aardvark\n",
"1 artichoke\n",
"2 NaN\n",
"3 avocado\n",
"dtype: object"
]
}
],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"string_data.isnull()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 37,
"text": [
"0 False\n",
"1 False\n",
"2 True\n",
"3 False\n",
"dtype: bool"
]
}
],
"prompt_number": 37
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"string_data[0] = None\n",
"string_data"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 39,
"text": [
"0 None\n",
"1 artichoke\n",
"2 NaN\n",
"3 avocado\n",
"dtype: object"
]
}
],
"prompt_number": 39
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Filtering Out Missing Data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from numpy import nan as NA\n",
"data = Series([1,NA,3.5,NA,7])\n",
"data.dropna()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 41,
"text": [
"0 1.0\n",
"2 3.5\n",
"4 7.0\n",
"dtype: float64"
]
}
],
"prompt_number": 41
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data[data.notnull()]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 42,
"text": [
"0 1.0\n",
"2 3.5\n",
"4 7.0\n",
"dtype: float64"
]
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])\n",
"cleaned = data.dropna()\n",
"cleaned"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 6.5</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 44,
"text": [
" 0 1 2\n",
"0 1 6.5 3"
]
}
],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.dropna(how='all')\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 6.5</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td> 6.5</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 46,
"text": [
" 0 1 2\n",
"0 1 6.5 3\n",
"1 1 NaN NaN\n",
"3 NaN 6.5 3"
]
}
],
"prompt_number": 46
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = DataFrame(np.random.randn(7,3))\n",
"df"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0.404052</td>\n",
" <td> 0.409225</td>\n",
" <td> 0.978779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.494410</td>\n",
" <td>-0.398292</td>\n",
" <td>-0.958605</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.197636</td>\n",
" <td> 1.250569</td>\n",
" <td> 0.626912</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 0.806719</td>\n",
" <td>-2.660904</td>\n",
" <td> 0.142528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.528672</td>\n",
" <td>-2.018280</td>\n",
" <td>-1.702336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>-0.594223</td>\n",
" <td> 1.046469</td>\n",
" <td>-0.478642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> 1.096200</td>\n",
" <td> 0.750286</td>\n",
" <td> 0.823056</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 48,
"text": [
" 0 1 2\n",
"0 0.404052 0.409225 0.978779\n",
"1 -0.494410 -0.398292 -0.958605\n",
"2 -0.197636 1.250569 0.626912\n",
"3 0.806719 -2.660904 0.142528\n",
"4 -0.528672 -2.018280 -1.702336\n",
"5 -0.594223 1.046469 -0.478642\n",
"6 1.096200 0.750286 0.823056"
]
}
],
"prompt_number": 48
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.ix[:4,1]=NA\n",
"df.ix[:2,2]=NA\n",
"df"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0.404052</td>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.494410</td>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.197636</td>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 0.806719</td>\n",
" <td> NaN</td>\n",
" <td> 0.142528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.528672</td>\n",
" <td> NaN</td>\n",
" <td>-1.702336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>-0.594223</td>\n",
" <td> 1.046469</td>\n",
" <td>-0.478642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> 1.096200</td>\n",
" <td> 0.750286</td>\n",
" <td> 0.823056</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 49,
"text": [
" 0 1 2\n",
"0 0.404052 NaN NaN\n",
"1 -0.494410 NaN NaN\n",
"2 -0.197636 NaN NaN\n",
"3 0.806719 NaN 0.142528\n",
"4 -0.528672 NaN -1.702336\n",
"5 -0.594223 1.046469 -0.478642\n",
"6 1.096200 0.750286 0.823056"
]
}
],
"prompt_number": 49
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# A related way to filter out DataFrame rows tends to concern time series data.\n",
"# Suppose you want to keep only rows containing a certain number of observations. \n",
"# You can indicate this with the thresh argument:\n",
"\n",
"df.dropna(thresh=3)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>-0.594223</td>\n",
" <td> 1.046469</td>\n",
" <td>-0.478642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> 1.096200</td>\n",
" <td> 0.750286</td>\n",
" <td> 0.823056</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 56,
"text": [
" 0 1 2\n",
"5 -0.594223 1.046469 -0.478642\n",
"6 1.096200 0.750286 0.823056"
]
}
],
"prompt_number": 56
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Filling in Missing Data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.fillna(0)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0.404052</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.494410</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.197636</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 0.806719</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.142528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.528672</td>\n",
" <td> 0.000000</td>\n",
" <td>-1.702336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>-0.594223</td>\n",
" <td> 1.046469</td>\n",
" <td>-0.478642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> 1.096200</td>\n",
" <td> 0.750286</td>\n",
" <td> 0.823056</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 57,
"text": [
" 0 1 2\n",
"0 0.404052 0.000000 0.000000\n",
"1 -0.494410 0.000000 0.000000\n",
"2 -0.197636 0.000000 0.000000\n",
"3 0.806719 0.000000 0.142528\n",
"4 -0.528672 0.000000 -1.702336\n",
"5 -0.594223 1.046469 -0.478642\n",
"6 1.096200 0.750286 0.823056"
]
}
],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.fillna({1:0.5, 3:-1, 2:0.1})"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0.404052</td>\n",
" <td> 0.500000</td>\n",
" <td> 0.100000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.494410</td>\n",
" <td> 0.500000</td>\n",
" <td> 0.100000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.197636</td>\n",
" <td> 0.500000</td>\n",
" <td> 0.100000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 0.806719</td>\n",
" <td> 0.500000</td>\n",
" <td> 0.142528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.528672</td>\n",
" <td> 0.500000</td>\n",
" <td>-1.702336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>-0.594223</td>\n",
" <td> 1.046469</td>\n",
" <td>-0.478642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> 1.096200</td>\n",
" <td> 0.750286</td>\n",
" <td> 0.823056</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 58,
"text": [
" 0 1 2\n",
"0 0.404052 0.500000 0.100000\n",
"1 -0.494410 0.500000 0.100000\n",
"2 -0.197636 0.500000 0.100000\n",
"3 0.806719 0.500000 0.142528\n",
"4 -0.528672 0.500000 -1.702336\n",
"5 -0.594223 1.046469 -0.478642\n",
"6 1.096200 0.750286 0.823056"
]
}
],
"prompt_number": 58
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"_ = df.fillna(0, inplace=True)\n",
"df"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0.404052</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.494410</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.197636</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 0.806719</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.142528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.528672</td>\n",
" <td> 0.000000</td>\n",
" <td>-1.702336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>-0.594223</td>\n",
" <td> 1.046469</td>\n",
" <td>-0.478642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> 1.096200</td>\n",
" <td> 0.750286</td>\n",
" <td> 0.823056</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 59,
"text": [
" 0 1 2\n",
"0 0.404052 0.000000 0.000000\n",
"1 -0.494410 0.000000 0.000000\n",
"2 -0.197636 0.000000 0.000000\n",
"3 0.806719 0.000000 0.142528\n",
"4 -0.528672 0.000000 -1.702336\n",
"5 -0.594223 1.046469 -0.478642\n",
"6 1.096200 0.750286 0.823056"
]
}
],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = DataFrame(np.random.randn(6,3))\n",
"df.ix[2:,1] = NA\n",
"df.ix[4:,2] = NA\n",
"df"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-1.371226</td>\n",
" <td> 0.397127</td>\n",
" <td>-1.457344</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.535057</td>\n",
" <td>-0.816477</td>\n",
" <td>-0.277766</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 1.041361</td>\n",
" <td> NaN</td>\n",
" <td>-0.165556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 2.231523</td>\n",
" <td> NaN</td>\n",
" <td>-0.005304</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.247808</td>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> 0.888134</td>\n",
" <td> NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 62,
"text": [
" 0 1 2\n",
"0 -1.371226 0.397127 -1.457344\n",
"1 -0.535057 -0.816477 -0.277766\n",
"2 1.041361 NaN -0.165556\n",
"3 2.231523 NaN -0.005304\n",
"4 -0.247808 NaN NaN\n",
"5 0.888134 NaN NaN"
]
}
],
"prompt_number": 62
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.fillna(method='ffill')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-1.371226</td>\n",
" <td> 0.397127</td>\n",
" <td>-1.457344</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.535057</td>\n",
" <td>-0.816477</td>\n",
" <td>-0.277766</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 1.041361</td>\n",
" <td>-0.816477</td>\n",
" <td>-0.165556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 2.231523</td>\n",
" <td>-0.816477</td>\n",
" <td>-0.005304</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.247808</td>\n",
" <td>-0.816477</td>\n",
" <td>-0.005304</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> 0.888134</td>\n",
" <td>-0.816477</td>\n",
" <td>-0.005304</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 63,
"text": [
" 0 1 2\n",
"0 -1.371226 0.397127 -1.457344\n",
"1 -0.535057 -0.816477 -0.277766\n",
"2 1.041361 -0.816477 -0.165556\n",
"3 2.231523 -0.816477 -0.005304\n",
"4 -0.247808 -0.816477 -0.005304\n",
"5 0.888134 -0.816477 -0.005304"
]
}
],
"prompt_number": 63
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data = Series([1,NA,3.5,NA,7,3.5])\n",
"data"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 72,
"text": [
"0 1.0\n",
"1 NaN\n",
"2 3.5\n",
"3 NaN\n",
"4 7.0\n",
"5 3.5\n",
"dtype: float64"
]
}
],
"prompt_number": 72
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.fillna(data.mean())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 73,
"text": [
"0 1.00\n",
"1 3.75\n",
"2 3.50\n",
"3 3.75\n",
"4 7.00\n",
"5 3.50\n",
"dtype: float64"
]
}
],
"prompt_number": 73
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.fillna(data.value_counts().index[0])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 75,
"text": [
"0 1.0\n",
"1 3.5\n",
"2 3.5\n",
"3 3.5\n",
"4 7.0\n",
"5 3.5\n",
"dtype: float64"
]
}
],
"prompt_number": 75
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hierarchical Indexing\n",
"\n",
"Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form. Let\u2019s start with a simple example; create a Series with a list of lists or arrays as the index:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data = Series(np.random.randn(10),\n",
" index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],\n",
" [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])\n",
"data"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 77,
"text": [
"a 1 -0.949916\n",
" 2 -0.812244\n",
" 3 0.840604\n",
"b 1 1.197306\n",
" 2 -0.204776\n",
" 3 -1.448452\n",
"c 1 0.202745\n",
" 2 0.938151\n",
"d 2 -1.122258\n",
" 3 0.214271\n",
"dtype: float64"
]
}
],
"prompt_number": 77
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.index"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 80,
"text": [
"MultiIndex\n",
"[a 1, 2, 3, b 1, 2, 3, c 1, 2, d 2, 3]"
]
}
],
"prompt_number": 80
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data['b']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 81,
"text": [
"1 1.197306\n",
"2 -0.204776\n",
"3 -1.448452\n",
"dtype: float64"
]
}
],
"prompt_number": 81
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.unstack()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>-0.949916</td>\n",
" <td>-0.812244</td>\n",
" <td> 0.840604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td> 1.197306</td>\n",
" <td>-0.204776</td>\n",
" <td>-1.448452</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td> 0.202745</td>\n",
" <td> 0.938151</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td> NaN</td>\n",
" <td>-1.122258</td>\n",
" <td> 0.214271</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 82,
"text": [
" 1 2 3\n",
"a -0.949916 -0.812244 0.840604\n",
"b 1.197306 -0.204776 -1.448452\n",
"c 0.202745 0.938151 NaN\n",
"d NaN -1.122258 0.214271"
]
}
],
"prompt_number": 82
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.unstack().stack()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 83,
"text": [
"a 1 -0.949916\n",
" 2 -0.812244\n",
" 3 0.840604\n",
"b 1 1.197306\n",
" 2 -0.204776\n",
" 3 -1.448452\n",
"c 1 0.202745\n",
" 2 0.938151\n",
"d 2 -1.122258\n",
" 3 0.214271\n",
"dtype: float64"
]
}
],
"prompt_number": 83
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame = DataFrame(np.arange(12).reshape((4, 3)),\n",
" index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], \n",
" columns=[['Ohio', 'Ohio', 'Colorado'],\n",
" ['Green', 'Red', 'Green']])\n",
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">Ohio</th>\n",
" <th>Colorado</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" <th>Green</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">a</th>\n",
" <th>1</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">b</th>\n",
" <th>1</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 84,
"text": [
" Ohio Colorado\n",
" Green Red Green\n",
"a 1 0 1 2\n",
" 2 3 4 5\n",
"b 1 6 7 8\n",
" 2 9 10 11"
]
}
],
"prompt_number": 84
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.index.names = ['key1', 'key2']\n",
"frame.columns.names = ['state', 'color']\n",
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>state</th>\n",
" <th colspan=\"2\" halign=\"left\">Ohio</th>\n",
" <th>Colorado</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>color</th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" <th>Green</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key1</th>\n",
" <th>key2</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">a</th>\n",
" <th>1</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">b</th>\n",
" <th>1</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 87,
"text": [
"state Ohio Colorado\n",
"color Green Red Green\n",
"key1 key2 \n",
"a 1 0 1 2\n",
" 2 3 4 5\n",
"b 1 6 7 8\n",
" 2 9 10 11"
]
}
],
"prompt_number": 87
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame['Ohio']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>color</th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key1</th>\n",
" <th>key2</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">a</th>\n",
" <th>1</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">b</th>\n",
" <th>1</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 88,
"text": [
"color Green Red\n",
"key1 key2 \n",
"a 1 0 1\n",
" 2 3 4\n",
"b 1 6 7\n",
" 2 9 10"
]
}
],
"prompt_number": 88
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame['Ohio'].ix['a']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>color</th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key2</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 99,
"text": [
"color Green Red\n",
"key2 \n",
"1 0 1\n",
"2 3 4"
]
}
],
"prompt_number": 99
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reordering and Sorting Levels\n",
"\n",
"At times you will need to rearrange the order of the levels on an axis or sort the data by the values in one specific level. The swaplevel takes two level numbers or names and returns a new object with the levels interchanged (but the data is otherwise unaltered):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.swaplevel('key1','key2')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>state</th>\n",
" <th colspan=\"2\" halign=\"left\">Ohio</th>\n",
" <th>Colorado</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>color</th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" <th>Green</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key2</th>\n",
" <th>key1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <th>a</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>a</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <th>b</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>b</th>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 100,
"text": [
"state Ohio Colorado\n",
"color Green Red Green\n",
"key2 key1 \n",
"1 a 0 1 2\n",
"2 a 3 4 5\n",
"1 b 6 7 8\n",
"2 b 9 10 11"
]
}
],
"prompt_number": 100
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.sortlevel(1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>state</th>\n",
" <th colspan=\"2\" halign=\"left\">Ohio</th>\n",
" <th>Colorado</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>color</th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" <th>Green</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key1</th>\n",
" <th>key2</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <th>1</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <th>1</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a</th>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <th>2</th>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 101,
"text": [
"state Ohio Colorado\n",
"color Green Red Green\n",
"key1 key2 \n",
"a 1 0 1 2\n",
"b 1 6 7 8\n",
"a 2 3 4 5\n",
"b 2 9 10 11"
]
}
],
"prompt_number": 101
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.swaplevel(0,1).sortlevel(0)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>state</th>\n",
" <th colspan=\"2\" halign=\"left\">Ohio</th>\n",
" <th>Colorado</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>color</th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" <th>Green</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key2</th>\n",
" <th>key1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">1</th>\n",
" <th>a</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">2</th>\n",
" <th>a</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 107,
"text": [
"state Ohio Colorado\n",
"color Green Red Green\n",
"key2 key1 \n",
"1 a 0 1 2\n",
" b 6 7 8\n",
"2 a 3 4 5\n",
" b 9 10 11"
]
}
],
"prompt_number": 107
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.swaplevel(0,1).sortlevel(1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>state</th>\n",
" <th colspan=\"2\" halign=\"left\">Ohio</th>\n",
" <th>Colorado</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>color</th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" <th>Green</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key2</th>\n",
" <th>key1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <th>a</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>a</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <th>b</th>\n",
" <td> 6</td>\n",
" <td> 7</td>\n",
" <td> 8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <th>b</th>\n",
" <td> 9</td>\n",
" <td> 10</td>\n",
" <td> 11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 108,
"text": [
"state Ohio Colorado\n",
"color Green Red Green\n",
"key2 key1 \n",
"1 a 0 1 2\n",
"2 a 3 4 5\n",
"1 b 6 7 8\n",
"2 b 9 10 11"
]
}
],
"prompt_number": 108
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Summary Statistics by Level"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.sum(level='key2')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th>state</th>\n",
" <th colspan=\"2\" halign=\"left\">Ohio</th>\n",
" <th>Colorado</th>\n",
" </tr>\n",
" <tr>\n",
" <th>color</th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" <th>Green</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key2</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 6</td>\n",
" <td> 8</td>\n",
" <td> 10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 12</td>\n",
" <td> 14</td>\n",
" <td> 16</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 109,
"text": [
"state Ohio Colorado\n",
"color Green Red Green\n",
"key2 \n",
"1 6 8 10\n",
"2 12 14 16"
]
}
],
"prompt_number": 109
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.sum(level='color', axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>color</th>\n",
" <th>Green</th>\n",
" <th>Red</th>\n",
" </tr>\n",
" <tr>\n",
" <th>key1</th>\n",
" <th>key2</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">a</th>\n",
" <th>1</th>\n",
" <td> 2</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 8</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">b</th>\n",
" <th>1</th>\n",
" <td> 14</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 20</td>\n",
" <td> 10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 110,
"text": [
"color Green Red\n",
"key1 key2 \n",
"a 1 2 1\n",
" 2 8 4\n",
"b 1 14 7\n",
" 2 20 10"
]
}
],
"prompt_number": 110
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using a DataFrame's Columns "
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),\n",
" 'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],\n",
" 'd': [0, 1, 2, 0, 1, 2, 3]})\n",
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td> 7</td>\n",
" <td> one</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 6</td>\n",
" <td> one</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 2</td>\n",
" <td> 5</td>\n",
" <td> one</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> two</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 4</td>\n",
" <td> 3</td>\n",
" <td> two</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> 5</td>\n",
" <td> 2</td>\n",
" <td> two</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> two</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 111,
"text": [
" a b c d\n",
"0 0 7 one 0\n",
"1 1 6 one 1\n",
"2 2 5 one 2\n",
"3 3 4 two 0\n",
"4 4 3 two 1\n",
"5 5 2 two 2\n",
"6 6 1 two 3"
]
}
],
"prompt_number": 111
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame2 = frame.set_index(['c','d'])\n",
"frame2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">one</th>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 2</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">two</th>\n",
" <th>0</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 4</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 5</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 112,
"text": [
" a b\n",
"c d \n",
"one 0 0 7\n",
" 1 1 6\n",
" 2 2 5\n",
"two 0 3 4\n",
" 1 4 3\n",
" 2 5 2\n",
" 3 6 1"
]
}
],
"prompt_number": 112
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.set_index(['c','d'], drop=False)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">one</th>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td> 7</td>\n",
" <td> one</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 6</td>\n",
" <td> one</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 2</td>\n",
" <td> 5</td>\n",
" <td> one</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">two</th>\n",
" <th>0</th>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" <td> two</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 4</td>\n",
" <td> 3</td>\n",
" <td> two</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 5</td>\n",
" <td> 2</td>\n",
" <td> two</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" <td> two</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 113,
"text": [
" a b c d\n",
"c d \n",
"one 0 0 7 one 0\n",
" 1 1 6 one 1\n",
" 2 2 5 one 2\n",
"two 0 3 4 two 0\n",
" 1 4 3 two 1\n",
" 2 5 2 two 2\n",
" 3 6 1 two 3"
]
}
],
"prompt_number": 113
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame2.reset_index()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> one</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> one</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> one</td>\n",
" <td> 2</td>\n",
" <td> 2</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> two</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> 4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> two</td>\n",
" <td> 1</td>\n",
" <td> 4</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> two</td>\n",
" <td> 2</td>\n",
" <td> 5</td>\n",
" <td> 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> two</td>\n",
" <td> 3</td>\n",
" <td> 6</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 115,
"text": [
" c d a b\n",
"0 one 0 0 7\n",
"1 one 1 1 6\n",
"2 one 2 2 5\n",
"3 two 0 3 4\n",
"4 two 1 4 3\n",
"5 two 2 5 2\n",
"6 two 3 6 1"
]
}
],
"prompt_number": 115
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Other pandas Topics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Integer Indexing"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ser = Series(np.arange(3))\n",
"ser"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 117,
"text": [
"0 0\n",
"1 1\n",
"2 2\n",
"dtype: int64"
]
}
],
"prompt_number": 117
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ser2 = Series(np.arange(3), index=['a','b','c'])\n",
"ser2[-1]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 119,
"text": [
"2"
]
}
],
"prompt_number": 119
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ser.ix[:1]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 122,
"text": [
"0 0\n",
"1 1\n",
"dtype: int64"
]
}
],
"prompt_number": 122
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ser3 = Series(range(3), index=[-5,1,3])\n",
"print ser3\n",
"print ser3.iget_value(0)\n",
"print ser3.iget_value(1)\n",
"print ser3.iget_value(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"-5 0\n",
" 1 1\n",
" 3 2\n",
"dtype: int64\n",
"0\n",
"1\n",
"2\n"
]
}
],
"prompt_number": 132
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame = DataFrame(np.arange(6).reshape(3,2), index=[2,0,1])\n",
"frame"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 2</td>\n",
" <td> 3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 4</td>\n",
" <td> 5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 126,
"text": [
" 0 1\n",
"2 0 1\n",
"0 2 3\n",
"1 4 5"
]
}
],
"prompt_number": 126
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"frame.irow(1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 128,
"text": [
"0 2\n",
"1 3\n",
"Name: 0, dtype: int64"
]
}
],
"prompt_number": 128
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Panel Data "
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas.io.data as web\n",
"pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk, '1/1/2009', '6/1/2012')) \n",
" for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))\n",
"pdata"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 135,
"text": [
"<class 'pandas.core.panel.Panel'>\n",
"Dimensions: 4 (items) x 861 (major_axis) x 6 (minor_axis)\n",
"Items axis: AAPL to MSFT\n",
"Major_axis axis: 2009-01-02 00:00:00 to 2012-06-01 00:00:00\n",
"Minor_axis axis: Open to Adj Close"
]
}
],
"prompt_number": 135
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pdata = pdata.swapaxes('items', 'minor')\n",
"pdata['Adj Close']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<pre>\n",
"&ltclass 'pandas.core.frame.DataFrame'&gt\n",
"DatetimeIndex: 861 entries, 2009-01-02 00:00:00 to 2012-06-01 00:00:00\n",
"Data columns (total 4 columns):\n",
"AAPL 861 non-null values\n",
"DELL 861 non-null values\n",
"GOOG 861 non-null values\n",
"MSFT 861 non-null values\n",
"dtypes: float64(4)\n",
"</pre>"
],
"output_type": "pyout",
"prompt_number": 136,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"DatetimeIndex: 861 entries, 2009-01-02 00:00:00 to 2012-06-01 00:00:00\n",
"Data columns (total 4 columns):\n",
"AAPL 861 non-null values\n",
"DELL 861 non-null values\n",
"GOOG 861 non-null values\n",
"MSFT 861 non-null values\n",
"dtypes: float64(4)"
]
}
],
"prompt_number": 136
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pdata.ix[:, '6/1/2012', :]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Open</th>\n",
" <th>High</th>\n",
" <th>Low</th>\n",
" <th>Close</th>\n",
" <th>Volume</th>\n",
" <th>Adj Close</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>AAPL</th>\n",
" <td> 569.16</td>\n",
" <td> 572.65</td>\n",
" <td> 560.52</td>\n",
" <td> 560.99</td>\n",
" <td> 18606700</td>\n",
" <td> 545.59</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DELL</th>\n",
" <td> 12.15</td>\n",
" <td> 12.30</td>\n",
" <td> 12.05</td>\n",
" <td> 12.07</td>\n",
" <td> 19396700</td>\n",
" <td> 11.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>GOOG</th>\n",
" <td> 571.79</td>\n",
" <td> 572.65</td>\n",
" <td> 568.35</td>\n",
" <td> 570.98</td>\n",
" <td> 3057900</td>\n",
" <td> 570.98</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MSFT</th>\n",
" <td> 28.76</td>\n",
" <td> 28.96</td>\n",
" <td> 28.44</td>\n",
" <td> 28.45</td>\n",
" <td> 56634300</td>\n",
" <td> 27.42</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 137,
"text": [
" Open High Low Close Volume Adj Close\n",
"AAPL 569.16 572.65 560.52 560.99 18606700 545.59\n",
"DELL 12.15 12.30 12.05 12.07 19396700 11.74\n",
"GOOG 571.79 572.65 568.35 570.98 3057900 570.98\n",
"MSFT 28.76 28.96 28.44 28.45 56634300 27.42"
]
}
],
"prompt_number": 137
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pdata.ix['Adj Close', '5/22/2012':, :]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AAPL</th>\n",
" <th>DELL</th>\n",
" <th>GOOG</th>\n",
" <th>MSFT</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2012-05-22</th>\n",
" <td> 541.68</td>\n",
" <td> 14.67</td>\n",
" <td> 600.80</td>\n",
" <td> 28.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-05-23</th>\n",
" <td> 554.90</td>\n",
" <td> 12.15</td>\n",
" <td> 609.46</td>\n",
" <td> 28.05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-05-24</th>\n",
" <td> 549.80</td>\n",
" <td> 12.11</td>\n",
" <td> 603.66</td>\n",
" <td> 28.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-05-25</th>\n",
" <td> 546.86</td>\n",
" <td> 12.12</td>\n",
" <td> 591.53</td>\n",
" <td> 28.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-05-29</th>\n",
" <td> 556.56</td>\n",
" <td> 12.32</td>\n",
" <td> 594.34</td>\n",
" <td> 28.49</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-05-30</th>\n",
" <td> 563.27</td>\n",
" <td> 12.22</td>\n",
" <td> 588.23</td>\n",
" <td> 28.27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-05-31</th>\n",
" <td> 561.87</td>\n",
" <td> 12.00</td>\n",
" <td> 580.86</td>\n",
" <td> 28.13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-06-01</th>\n",
" <td> 545.59</td>\n",
" <td> 11.74</td>\n",
" <td> 570.98</td>\n",
" <td> 27.42</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 138,
"text": [
" AAPL DELL GOOG MSFT\n",
"Date \n",
"2012-05-22 541.68 14.67 600.80 28.68\n",
"2012-05-23 554.90 12.15 609.46 28.05\n",
"2012-05-24 549.80 12.11 603.66 28.01\n",
"2012-05-25 546.86 12.12 591.53 28.00\n",
"2012-05-29 556.56 12.32 594.34 28.49\n",
"2012-05-30 563.27 12.22 588.23 28.27\n",
"2012-05-31 561.87 12.00 580.86 28.13\n",
"2012-06-01 545.59 11.74 570.98 27.42"
]
}
],
"prompt_number": 138
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stacked = pdata.ix[:, '5/30/2012':, :].to_frame()\n",
"stacked"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Open</th>\n",
" <th>High</th>\n",
" <th>Low</th>\n",
" <th>Close</th>\n",
" <th>Volume</th>\n",
" <th>Adj Close</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date</th>\n",
" <th>minor</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">2012-05-30</th>\n",
" <th>AAPL</th>\n",
" <td> 569.20</td>\n",
" <td> 579.99</td>\n",
" <td> 566.56</td>\n",
" <td> 579.17</td>\n",
" <td> 18908200</td>\n",
" <td> 563.27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DELL</th>\n",
" <td> 12.59</td>\n",
" <td> 12.70</td>\n",
" <td> 12.46</td>\n",
" <td> 12.56</td>\n",
" <td> 19787800</td>\n",
" <td> 12.22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>GOOG</th>\n",
" <td> 588.16</td>\n",
" <td> 591.90</td>\n",
" <td> 583.53</td>\n",
" <td> 588.23</td>\n",
" <td> 1906700</td>\n",
" <td> 588.23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MSFT</th>\n",
" <td> 29.35</td>\n",
" <td> 29.48</td>\n",
" <td> 29.12</td>\n",
" <td> 29.34</td>\n",
" <td> 41585500</td>\n",
" <td> 28.27</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">2012-05-31</th>\n",
" <th>AAPL</th>\n",
" <td> 580.74</td>\n",
" <td> 581.50</td>\n",
" <td> 571.46</td>\n",
" <td> 577.73</td>\n",
" <td> 17559800</td>\n",
" <td> 561.87</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DELL</th>\n",
" <td> 12.53</td>\n",
" <td> 12.54</td>\n",
" <td> 12.33</td>\n",
" <td> 12.33</td>\n",
" <td> 19955500</td>\n",
" <td> 12.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>GOOG</th>\n",
" <td> 588.72</td>\n",
" <td> 590.00</td>\n",
" <td> 579.00</td>\n",
" <td> 580.86</td>\n",
" <td> 2968300</td>\n",
" <td> 580.86</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MSFT</th>\n",
" <td> 29.30</td>\n",
" <td> 29.42</td>\n",
" <td> 28.94</td>\n",
" <td> 29.19</td>\n",
" <td> 39134000</td>\n",
" <td> 28.13</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">2012-06-01</th>\n",
" <th>AAPL</th>\n",
" <td> 569.16</td>\n",
" <td> 572.65</td>\n",
" <td> 560.52</td>\n",
" <td> 560.99</td>\n",
" <td> 18606700</td>\n",
" <td> 545.59</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DELL</th>\n",
" <td> 12.15</td>\n",
" <td> 12.30</td>\n",
" <td> 12.05</td>\n",
" <td> 12.07</td>\n",
" <td> 19396700</td>\n",
" <td> 11.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>GOOG</th>\n",
" <td> 571.79</td>\n",
" <td> 572.65</td>\n",
" <td> 568.35</td>\n",
" <td> 570.98</td>\n",
" <td> 3057900</td>\n",
" <td> 570.98</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MSFT</th>\n",
" <td> 28.76</td>\n",
" <td> 28.96</td>\n",
" <td> 28.44</td>\n",
" <td> 28.45</td>\n",
" <td> 56634300</td>\n",
" <td> 27.42</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 139,
"text": [
" Open High Low Close Volume Adj Close\n",
"Date minor \n",
"2012-05-30 AAPL 569.20 579.99 566.56 579.17 18908200 563.27\n",
" DELL 12.59 12.70 12.46 12.56 19787800 12.22\n",
" GOOG 588.16 591.90 583.53 588.23 1906700 588.23\n",
" MSFT 29.35 29.48 29.12 29.34 41585500 28.27\n",
"2012-05-31 AAPL 580.74 581.50 571.46 577.73 17559800 561.87\n",
" DELL 12.53 12.54 12.33 12.33 19955500 12.00\n",
" GOOG 588.72 590.00 579.00 580.86 2968300 580.86\n",
" MSFT 29.30 29.42 28.94 29.19 39134000 28.13\n",
"2012-06-01 AAPL 569.16 572.65 560.52 560.99 18606700 545.59\n",
" DELL 12.15 12.30 12.05 12.07 19396700 11.74\n",
" GOOG 571.79 572.65 568.35 570.98 3057900 570.98\n",
" MSFT 28.76 28.96 28.44 28.45 56634300 27.42"
]
}
],
"prompt_number": 139
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stacked.to_panel()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 140,
"text": [
"<class 'pandas.core.panel.Panel'>\n",
"Dimensions: 6 (items) x 3 (major_axis) x 4 (minor_axis)\n",
"Items axis: Open to Adj Close\n",
"Major_axis axis: 2012-05-30 00:00:00 to 2012-06-01 00:00:00\n",
"Minor_axis axis: AAPL to MSFT"
]
}
],
"prompt_number": 140
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment