Created
December 3, 2013 23:28
-
-
Save deniszgonjanin/7779609 to your computer and use it in GitHub Desktop.
Pandas Basics - iPython Notebook
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "Pandas Basics" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from pandas import *", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 90 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Construct a sample DataFrame\ndf = DataFrame({'Integers' : [1,2,6,8,-1], 'Floats' : [0.1, 0.2,0.2,10.1,None], 'Strings' : ['a','b',None,'c','a']})", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 111 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n </tr>\n <tr>\n <th>4</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 112, | |
"text": " Floats Integers Strings\n0 0.1 1 a\n1 0.2 2 b\n2 0.2 6 None\n3 10.1 8 c\n4 NaN -1 a" | |
} | |
], | |
"prompt_number": 112 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Series is a one-dimentional array of data, that also supports an index which may be non-numerical (e.g. dates)\ndf[\"Floats\"]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 113, | |
"text": "0 0.1\n1 0.2\n2 0.2\n3 10.1\n4 NaN\nName: Floats, dtype: float64" | |
} | |
], | |
"prompt_number": 113 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "type(df[\"Floats\"])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 114, | |
"text": "pandas.core.series.Series" | |
} | |
], | |
"prompt_number": 114 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Filtering\ndf[df[\"Integers\"] > 3]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 115, | |
"text": " Floats Integers Strings\n2 0.2 6 None\n3 10.1 8 c" | |
} | |
], | |
"prompt_number": 115 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Multiple Conditions\n(df[\"Integers\"] < 3) & (df[\"Floats\"] != 0.2)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 107, | |
"text": "0 True\n1 False\n2 False\n3 False\n4 True\ndtype: bool" | |
} | |
], | |
"prompt_number": 107 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#We can also perform arithmetic on dataframe columns\ndf[\"Sums\"] = df[\"Floats\"] + df[\"Integers\"]\ndf", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n <th>Sums</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n <td> 1.1</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n <td> 2.2</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n <td> 6.2</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n <td> 18.1</td>\n </tr>\n <tr>\n <th>4</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n <td> NaN</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 97, | |
"text": " Floats Integers Strings Sums\n0 0.1 1 a 1.1\n1 0.2 2 b 2.2\n2 0.2 6 None 6.2\n3 10.1 8 c 18.1\n4 NaN -1 a NaN" | |
} | |
], | |
"prompt_number": 97 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Filtering out missing values\ndf.dropna()", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 117, | |
"text": " Floats Integers Strings\n0 0.1 1 a\n1 0.2 2 b\n3 10.1 8 c" | |
} | |
], | |
"prompt_number": 117 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Fill in empty values instead\ndf[\"Floats\"].fillna(0.0)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 99, | |
"text": "0 0.1\n1 0.2\n2 0.2\n3 10.1\n4 0.0\nName: Floats, dtype: float64" | |
} | |
], | |
"prompt_number": 99 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df[\"Integers\"]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 118, | |
"text": "0 1\n1 2\n2 6\n3 8\n4 -1\nName: Integers, dtype: int64" | |
} | |
], | |
"prompt_number": 118 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Map executes a function across every value in a Series\n\ndef square_integers(x):\n return x*x\ndf[\"Integers\"].map(square_integers)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 119, | |
"text": "0 1\n1 4\n2 36\n3 64\n4 1\nName: Integers, dtype: int64" | |
} | |
], | |
"prompt_number": 119 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df[[\"Integers\", \"Floats\"]]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Integers</th>\n <th>Floats</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 1</td>\n <td> 0.1</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 2</td>\n <td> 0.2</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 6</td>\n <td> 0.2</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 8</td>\n <td> 10.1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>-1</td>\n <td> NaN</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 120, | |
"text": " Integers Floats\n0 1 0.1\n1 2 0.2\n2 6 0.2\n3 8 10.1\n4 -1 NaN" | |
} | |
], | |
"prompt_number": 120 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Apply executes a function across a DataFrame, either across every row or every column (default)\ndf[[\"Integers\", \"Floats\"]].apply(np.sum, axis=0)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 101, | |
"text": "Integers 16.0\nFloats 10.6\ndtype: float64" | |
} | |
], | |
"prompt_number": 101 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#If we want to apply across rows instead:\ndf[[\"Integers\", \"Floats\"]].apply(np.sum, axis=1)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 102, | |
"text": "0 1.1\n1 2.2\n2 6.2\n3 18.1\n4 -1.0\ndtype: float64" | |
} | |
], | |
"prompt_number": 102 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df[[\"Integers\", \"Strings\"]]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 2</td>\n <td> b</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 6</td>\n <td> None</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 8</td>\n <td> c</td>\n </tr>\n <tr>\n <th>4</th>\n <td>-1</td>\n <td> a</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 122, | |
"text": " Integers Strings\n0 1 a\n1 2 b\n2 6 None\n3 8 c\n4 -1 a" | |
} | |
], | |
"prompt_number": 122 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Group By - SQL like\nfor x,y in df[\"Integers\"].groupby(df[\"Strings\"]):\n print y", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "0 1\n4 -1\nName: Integers, dtype: int64\n1 2\nName: Integers, dtype: int64\n3 8\nName: Integers, dtype: int64\n" | |
} | |
], | |
"prompt_number": 128 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "#Finally, merging two DataFrames\ndf2 = DataFrame({'Strings' : ['a','b'], 'More Numbers' : [1, 2]})\ndf2", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>More Numbers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 2</td>\n <td> b</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 104, | |
"text": " More Numbers Strings\n0 1 a\n1 2 b" | |
} | |
], | |
"prompt_number": 104 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n </tr>\n <tr>\n <th>4</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 129, | |
"text": " Floats Integers Strings\n0 0.1 1 a\n1 0.2 2 b\n2 0.2 6 None\n3 10.1 8 c\n4 NaN -1 a" | |
} | |
], | |
"prompt_number": 129 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "pandas.merge(df, df2, on='Strings',how='inner')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n <th>More Numbers</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>1</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n <td> 2</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 130, | |
"text": " Floats Integers Strings More Numbers\n0 0.1 1 a 1\n1 NaN -1 a 1\n2 0.2 2 b 2" | |
} | |
], | |
"prompt_number": 130 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "pandas.merge(df, df2, on='Strings',how='outer')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n <th>Sums</th>\n <th>More Numbers</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n <td> 1.1</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>1</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n <td> NaN</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n <td> 2.2</td>\n <td> 2</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n <td> 6.2</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>4</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n <td> 18.1</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n</div>", | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 106, | |
"text": " Floats Integers Strings Sums More Numbers\n0 0.1 1 a 1.1 1\n1 NaN -1 a NaN 1\n2 0.2 2 b 2.2 2\n3 0.2 6 None 6.2 NaN\n4 10.1 8 c 18.1 NaN" | |
} | |
], | |
"prompt_number": 106 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 106 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment