Skip to content

Instantly share code, notes, and snippets.

@wesm
Created October 17, 2012 08:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wesm/3904544 to your computer and use it in GitHub Desktop.
Save wesm/3904544 to your computer and use it in GitHub Desktop.
Foo
{
"metadata": {
"name": "Basics"
},
"nbformat": 2,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": true,
"input": [
"# Explicit imports so the notebook does not depend on `ipython --pylab`",
"# having injected `plt` and `randn` into the namespace.",
"from pandas import *",
"import pandas",
"import numpy as np",
"import matplotlib.pyplot as plt",
"from numpy.random import randn",
"",
"def side_by_side(*objs, **kwds):",
"    \"\"\"Print the reprs of several objects next to each other.",
"",
"    objs  -- objects to show; each repr() is split into lines.",
"    space -- keyword only, forwarded to pandas' adjoin as the spacing",
"             between the adjoined blocks (default 4).",
"    \"\"\"",
"    from pandas.core.common import adjoin",
"    space = kwds.get('space', 4)",
"    reprs = [repr(obj).split('\\n') for obj in objs]",
"    # Single-argument parenthesised print: same output on Python 2,",
"    # and still parses on Python 3.",
"    print(adjoin(space, *reprs))",
"",
"plt.rc('figure', figsize=(10, 6))",
"pandas.set_printoptions(notebook_repr_html=False)"
],
"language": "python",
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "markdown",
"source": [
"Series",
"======"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"labels = ['a', 'b', 'c', 'd', 'e']",
"s = Series(randn(5), index=labels)",
"s"
],
"language": "python",
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"'b' in s"
],
"language": "python",
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"s['b']"
],
"language": "python",
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"s"
],
"language": "python",
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mapping = s.to_dict()",
"mapping"
],
"language": "python",
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"s = Series(mapping)",
"s"
],
"language": "python",
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"s[:3]"
],
"language": "python",
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"s.index"
],
"language": "python",
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "markdown",
"source": [
"DataFrame: 2D collection of Series",
"=================================="
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = DataFrame({'a': np.random.randn(6),",
" 'b': ['foo', 'bar'] * 3,",
" 'c': np.random.randn(6)})",
"df"
],
"language": "python",
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.index"
],
"language": "python",
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.columns"
],
"language": "python",
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = DataFrame({'a': np.random.randn(6),",
" 'b': ['foo', 'bar'] * 3,",
" 'c': np.random.randn(6)},",
" index=DateRange('1/1/2000', periods=6))",
"df"
],
"language": "python",
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = DataFrame({'a': np.random.randn(6),",
" 'b': ['foo', 'bar'] * 3,",
" 'c': np.random.randn(6)},",
" columns=['a', 'b', 'c', 'd'])",
"df"
],
"language": "python",
"outputs": [],
"prompt_number": 14
},
{
"cell_type": "markdown",
"source": [
"Creation from nested dicts",
"--------------------------",
"",
"These arise naturally in Python code"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data = {}",
"for col in ['foo', 'bar', 'baz']:",
" for row in ['a', 'b', 'c', 'd']:",
" data.setdefault(col, {})[row] = randn()",
"data"
],
"language": "python",
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"DataFrame(data)"
],
"language": "python",
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "markdown",
"source": [
"Data alignment",
"=============="
]
},
{
"cell_type": "code",
"collapsed": true,
"input": [
"close_px = read_csv('stock_data.csv', index_col=0, parse_dates=True)"
],
"language": "python",
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"close_px"
],
"language": "python",
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"s1 = close_px['AAPL'][-20:]",
"s2 = close_px['AAPL'][-25:-10]",
"side_by_side(s1, s2)"
],
"language": "python",
"outputs": [],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"s1 + s2"
],
"language": "python",
"outputs": [],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = close_px.ix[-10:, :3]",
"df"
],
"language": "python",
"outputs": [],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"side_by_side(s1.reindex(s2.index), s2)"
],
"language": "python",
"outputs": [],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"b, c = s1.align(s2, join='inner')",
"side_by_side(b, c)"
],
"language": "python",
"outputs": [],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"b, c = s1.align(s2, join='outer')",
"side_by_side(b, c)"
],
"language": "python",
"outputs": [],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"b, c = s1.align(s2, join='right')",
"side_by_side(b, c)"
],
"language": "python",
"outputs": [],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = close_px.ix[-10:, ['AAPL', 'IBM', 'MSFT']]",
"df"
],
"language": "python",
"outputs": [],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df2 = df.ix[::2, ['IBM', 'MSFT']]",
"side_by_side(df, df2)"
],
"language": "python",
"outputs": [],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df + df2"
],
"language": "python",
"outputs": [],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"b, c = df.align(df2, join='inner')",
"side_by_side(b, c) "
],
"language": "python",
"outputs": [],
"prompt_number": 29
},
{
"cell_type": "markdown",
"source": [
"Transposing: no copy if all columns are the same type",
"-----------------------------------------------------"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df[:5].T"
],
"language": "python",
"outputs": [],
"prompt_number": 30
},
{
"cell_type": "markdown",
"source": [
"Columns can be any type",
"-----------------------"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Build a frame whose columns each hold a different dtype.",
"n = 10",
"foo = DataFrame(index=range(n))",
"foo['floats'] = np.random.randn(n)",
"foo['ints'] = np.arange(n)",
"# Floor division keeps the repeat count an integer under true division too.",
"foo['strings'] = ['foo', 'bar'] * (n // 2)",
"foo['bools'] = foo['floats'] > 0",
"foo['objects'] = DateRange('1/1/2000', periods=n)",
"foo"
],
"language": "python",
"outputs": [],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"foo.dtypes"
],
"language": "python",
"outputs": [],
"prompt_number": 32
},
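{
"cell_type": "markdown",
"source": [
"N.B. Transposing does not round-trip in this case: a DataFrame stores its data by column, so `foo.T` has to coerce the mixed-type columns to a common dtype, and `foo.T.T` does not restore the original column dtypes."
]
},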
{
"cell_type": "code",
"collapsed": false,
"input": [
"foo.T.T"
],
"language": "python",
"outputs": [],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"foo.T.T.dtypes"
],
"language": "python",
"outputs": [],
"prompt_number": 34
},
{
"cell_type": "markdown",
"source": [
"Function application",
"====================",
"",
"You can apply arbitrary functions to the rows or columns of a DataFrame"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.apply(np.mean)"
],
"language": "python",
"outputs": [],
"prompt_number": 35
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.apply(np.mean, axis=1)"
],
"language": "python",
"outputs": [],
"prompt_number": 36
},
{
"cell_type": "markdown",
"source": [
"You can get as fancy as you want"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"close_px"
],
"language": "python",
"outputs": [],
"prompt_number": 37
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def peak_date(series):",
" return series.index[series.argmax()]",
"close_px.apply(peak_date)"
],
"language": "python",
"outputs": [],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.apply(lambda x: x.max() - x.min()) # np.ptp"
],
"language": "python",
"outputs": [],
"prompt_number": 39
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"np.log(close_px)"
],
"language": "python",
"outputs": [],
"prompt_number": 40
},
{
"cell_type": "markdown",
"source": [
"Plotting",
"========",
"",
"Some basic plotting integration with matplotlib in Series / DataFrame"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"close_px[['AAPL', 'IBM', 'MSFT', 'XOM']].plot()"
],
"language": "python",
"outputs": [],
"prompt_number": 41
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Daily simple returns. Computed here because this cell runs before the",
"# regression section, which defines the same `rets` with the same formula;",
"# without this line a fresh Restart & Run All fails with a NameError.",
"rets = close_px / close_px.shift(1) - 1",
"rets.ix[-1]"
],
"language": "python",
"outputs": [],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Bar chart of the last row of close_px. The pyplot functions are called",
"# through `plt` rather than as bare pylab names.",
"close_px.ix[-1].plot(kind='bar')",
"plt.title('Prices on %s' % close_px.index[-1])",
"plt.axhline(0)"
],
"language": "python",
"outputs": [],
"prompt_number": 43
},
{
"cell_type": "markdown",
"source": [
"Hierarchical indexing",
"---------------------"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],",
" ['one', 'two', 'three']],",
" labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],",
" [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])",
"hdf = DataFrame(np.random.randn(10, 3), index=index,",
" columns=['A', 'B', 'C'])",
"hdf"
],
"language": "python",
"outputs": [],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"hdf.ix['foo']"
],
"language": "python",
"outputs": [],
"prompt_number": 45
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"hdf.ix['foo'] = 0",
"hdf"
],
"language": "python",
"outputs": [],
"prompt_number": 46
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"hdf.ix['foo', 'three']"
],
"language": "python",
"outputs": [],
"prompt_number": 47
},
{
"cell_type": "markdown",
"source": [
"Stacking and unstacking",
"-----------------------"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"tuples = zip(*[['bar', 'bar', 'baz', 'baz',",
" 'foo', 'foo', 'qux', 'qux'],",
" ['one', 'two', 'one', 'two',",
" 'one', 'two', 'one', 'two']])",
"index = MultiIndex.from_tuples(tuples)",
"columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),",
" ('B', 'cat'), ('A', 'dog')])",
"df = DataFrame(randn(8, 4), index=index, columns=columns)",
"df"
],
"language": "python",
"outputs": [],
"prompt_number": 48
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df2 = df.ix[[0, 1, 2, 4, 5, 7]]",
"df2"
],
"language": "python",
"outputs": [],
"prompt_number": 49
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.unstack()['B']"
],
"language": "python",
"outputs": [],
"prompt_number": 50
},
{
"cell_type": "markdown",
"source": [
"GroupBy",
"======="
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',",
" 'foo', 'bar', 'foo', 'foo'],",
" 'B' : ['one', 'one', 'two', 'three',",
" 'two', 'two', 'one', 'three'],",
" 'C' : np.random.randn(8),",
" 'D' : np.random.randn(8)})",
"df"
],
"language": "python",
"outputs": [],
"prompt_number": 51
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for key, group in df.groupby('A'):",
" print key",
" print group"
],
"language": "python",
"outputs": [],
"prompt_number": 52
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.groupby('A')['C'].describe().T"
],
"language": "python",
"outputs": [],
"prompt_number": 53
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.groupby('A').mean()"
],
"language": "python",
"outputs": [],
"prompt_number": 54
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for key, group in df.groupby('A'):",
" print key",
" print group"
],
"language": "python",
"outputs": [],
"prompt_number": 55
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.groupby(['A', 'B']).mean()"
],
"language": "python",
"outputs": [],
"prompt_number": 56
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.groupby(['A', 'B'], as_index=False).mean()"
],
"language": "python",
"outputs": [],
"prompt_number": 57
},
{
"cell_type": "markdown",
"source": [
"GroupBy example: linear regression by group",
"-------------------------------------------"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import scikits.statsmodels.api as sm",
"rets = close_px / close_px.shift(1) - 1",
"",
"def get_beta(rets):",
" rets = rets.dropna()",
" rets['intercept'] = 1.",
" model = sm.OLS(rets['MSFT'], rets.ix[:, ['AAPL', 'intercept']]).fit()",
" return model.params",
"",
"get_beta(rets)"
],
"language": "python",
"outputs": [],
"prompt_number": 58
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"grouped = rets.groupby([lambda x: x.year, lambda x: x.month])",
"beta_by_ym = grouped.apply(get_beta)",
"beta_by_ym"
],
"language": "python",
"outputs": [],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"beta_by_ym.unstack(0)['AAPL']"
],
"language": "python",
"outputs": [],
"prompt_number": 60
},
{
"cell_type": "markdown",
"source": [
"GroupBy with hierarchical indexing",
"----------------------------------"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"tuples = zip(*[['bar', 'bar', 'baz', 'baz',",
" 'foo', 'foo', 'qux', 'qux'],",
" ['one', 'two', 'one', 'two',",
" 'one', 'two', 'one', 'two']])",
"index = MultiIndex.from_tuples(tuples)",
"columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),",
" ('B', 'cat'), ('A', 'dog')])",
"df = DataFrame(randn(8, 4), index=index, columns=columns)",
"df"
],
"language": "python",
"outputs": [],
"prompt_number": 61
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.groupby(level=0, axis=0).mean()"
],
"language": "python",
"outputs": [],
"prompt_number": 62
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.stack()"
],
"language": "python",
"outputs": [],
"prompt_number": 63
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.stack().mean(1).unstack()"
],
"language": "python",
"outputs": [],
"prompt_number": 64
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# could also have done",
"df.groupby(level=1, axis=1).mean()"
],
"language": "python",
"outputs": [],
"prompt_number": 65
},
{
"cell_type": "code",
"collapsed": true,
"input": [],
"language": "python",
"outputs": [],
"prompt_number": " "
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment