Skip to content

Instantly share code, notes, and snippets.

@deniszgonjanin
Created December 3, 2013 23:28
Show Gist options
  • Save deniszgonjanin/7779609 to your computer and use it in GitHub Desktop.
Save deniszgonjanin/7779609 to your computer and use it in GitHub Desktop.
Pandas Basics - iPython Notebook
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "Pandas Basics"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "from pandas import *",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 90
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Construct a sample DataFrame\ndf = DataFrame({'Integers' : [1,2,6,8,-1], 'Floats' : [0.1, 0.2,0.2,10.1,None], 'Strings' : ['a','b',None,'c','a']})",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 111
},
{
"cell_type": "code",
"collapsed": false,
"input": "df",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n </tr>\n <tr>\n <th>4</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 112,
"text": " Floats Integers Strings\n0 0.1 1 a\n1 0.2 2 b\n2 0.2 6 None\n3 10.1 8 c\n4 NaN -1 a"
}
],
"prompt_number": 112
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Series is a one-dimentional array of data, that also supports an index which may be non-numerical (e.g. dates)\ndf[\"Floats\"]",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 113,
"text": "0 0.1\n1 0.2\n2 0.2\n3 10.1\n4 NaN\nName: Floats, dtype: float64"
}
],
"prompt_number": 113
},
{
"cell_type": "code",
"collapsed": false,
"input": "type(df[\"Floats\"])",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 114,
"text": "pandas.core.series.Series"
}
],
"prompt_number": 114
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Filtering\ndf[df[\"Integers\"] > 3]",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 115,
"text": " Floats Integers Strings\n2 0.2 6 None\n3 10.1 8 c"
}
],
"prompt_number": 115
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Multiple Conditions\n(df[\"Integers\"] < 3) & (df[\"Floats\"] != 0.2)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 107,
"text": "0 True\n1 False\n2 False\n3 False\n4 True\ndtype: bool"
}
],
"prompt_number": 107
},
{
"cell_type": "code",
"collapsed": false,
"input": "#We can also perform arithmetic on dataframe columns\ndf[\"Sums\"] = df[\"Floats\"] + df[\"Integers\"]\ndf",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n <th>Sums</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n <td> 1.1</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n <td> 2.2</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n <td> 6.2</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n <td> 18.1</td>\n </tr>\n <tr>\n <th>4</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n <td> NaN</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 97,
"text": " Floats Integers Strings Sums\n0 0.1 1 a 1.1\n1 0.2 2 b 2.2\n2 0.2 6 None 6.2\n3 10.1 8 c 18.1\n4 NaN -1 a NaN"
}
],
"prompt_number": 97
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Filtering out missing values\ndf.dropna()",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 117,
"text": " Floats Integers Strings\n0 0.1 1 a\n1 0.2 2 b\n3 10.1 8 c"
}
],
"prompt_number": 117
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Fill in empty values instead\ndf[\"Floats\"].fillna(0.0)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 99,
"text": "0 0.1\n1 0.2\n2 0.2\n3 10.1\n4 0.0\nName: Floats, dtype: float64"
}
],
"prompt_number": 99
},
{
"cell_type": "code",
"collapsed": false,
"input": "df[\"Integers\"]",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 118,
"text": "0 1\n1 2\n2 6\n3 8\n4 -1\nName: Integers, dtype: int64"
}
],
"prompt_number": 118
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Map executes a function across every value in a Series\n\ndef square_integers(x):\n return x*x\ndf[\"Integers\"].map(square_integers)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 119,
"text": "0 1\n1 4\n2 36\n3 64\n4 1\nName: Integers, dtype: int64"
}
],
"prompt_number": 119
},
{
"cell_type": "code",
"collapsed": false,
"input": "df[[\"Integers\", \"Floats\"]]",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Integers</th>\n <th>Floats</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 1</td>\n <td> 0.1</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 2</td>\n <td> 0.2</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 6</td>\n <td> 0.2</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 8</td>\n <td> 10.1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>-1</td>\n <td> NaN</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 120,
"text": " Integers Floats\n0 1 0.1\n1 2 0.2\n2 6 0.2\n3 8 10.1\n4 -1 NaN"
}
],
"prompt_number": 120
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Apply executes a function across a DataFrame, either across every row or every column (default)\ndf[[\"Integers\", \"Floats\"]].apply(np.sum, axis=0)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 101,
"text": "Integers 16.0\nFloats 10.6\ndtype: float64"
}
],
"prompt_number": 101
},
{
"cell_type": "code",
"collapsed": false,
"input": "#If we want to apply across rows instead:\ndf[[\"Integers\", \"Floats\"]].apply(np.sum, axis=1)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 102,
"text": "0 1.1\n1 2.2\n2 6.2\n3 18.1\n4 -1.0\ndtype: float64"
}
],
"prompt_number": 102
},
{
"cell_type": "code",
"collapsed": false,
"input": "df[[\"Integers\", \"Strings\"]]",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 2</td>\n <td> b</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 6</td>\n <td> None</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 8</td>\n <td> c</td>\n </tr>\n <tr>\n <th>4</th>\n <td>-1</td>\n <td> a</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 122,
"text": " Integers Strings\n0 1 a\n1 2 b\n2 6 None\n3 8 c\n4 -1 a"
}
],
"prompt_number": 122
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Group By - SQL like\nfor x,y in df[\"Integers\"].groupby(df[\"Strings\"]):\n print y",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "0 1\n4 -1\nName: Integers, dtype: int64\n1 2\nName: Integers, dtype: int64\n3 8\nName: Integers, dtype: int64\n"
}
],
"prompt_number": 128
},
{
"cell_type": "code",
"collapsed": false,
"input": "#Finally, merging two DataFrames\ndf2 = DataFrame({'Strings' : ['a','b'], 'More Numbers' : [1, 2]})\ndf2",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>More Numbers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 2</td>\n <td> b</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 104,
"text": " More Numbers Strings\n0 1 a\n1 2 b"
}
],
"prompt_number": 104
},
{
"cell_type": "code",
"collapsed": false,
"input": "df",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n </tr>\n <tr>\n <th>1</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n </tr>\n <tr>\n <th>4</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 129,
"text": " Floats Integers Strings\n0 0.1 1 a\n1 0.2 2 b\n2 0.2 6 None\n3 10.1 8 c\n4 NaN -1 a"
}
],
"prompt_number": 129
},
{
"cell_type": "code",
"collapsed": false,
"input": "pandas.merge(df, df2, on='Strings',how='inner')",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n <th>More Numbers</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>1</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n <td> 2</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 130,
"text": " Floats Integers Strings More Numbers\n0 0.1 1 a 1\n1 NaN -1 a 1\n2 0.2 2 b 2"
}
],
"prompt_number": 130
},
{
"cell_type": "code",
"collapsed": false,
"input": "pandas.merge(df, df2, on='Strings',how='outer')",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Floats</th>\n <th>Integers</th>\n <th>Strings</th>\n <th>Sums</th>\n <th>More Numbers</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td> 0.1</td>\n <td> 1</td>\n <td> a</td>\n <td> 1.1</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>1</th>\n <td> NaN</td>\n <td>-1</td>\n <td> a</td>\n <td> NaN</td>\n <td> 1</td>\n </tr>\n <tr>\n <th>2</th>\n <td> 0.2</td>\n <td> 2</td>\n <td> b</td>\n <td> 2.2</td>\n <td> 2</td>\n </tr>\n <tr>\n <th>3</th>\n <td> 0.2</td>\n <td> 6</td>\n <td> None</td>\n <td> 6.2</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>4</th>\n <td> 10.1</td>\n <td> 8</td>\n <td> c</td>\n <td> 18.1</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 106,
"text": " Floats Integers Strings Sums More Numbers\n0 0.1 1 a 1.1 1\n1 NaN -1 a NaN 1\n2 0.2 2 b 2.2 2\n3 0.2 6 None 6.2 NaN\n4 10.1 8 c 18.1 NaN"
}
],
"prompt_number": 106
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 106
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment