Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save cchwala/568562d9f5eb54de682ac2686372279b to your computer and use it in GitHub Desktop.
Save cchwala/568562d9f5eb54de682ac2686372279b to your computer and use it in GitHub Desktop.
pandas_limit_nan_interpolation_to_max_gap_length.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Testing the performance of different implementations for limiting `DataFrame.interpolate` to only do something for NaN-gaps shorter then a defined limit\n",
"\n",
"The code stems from [this stackoverflow thread](https://stackoverflow.com/questions/48933165/pandas-dataframe-interpolating-in-sections-delimited-by-indexes)."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def make_data():\n",
" dictx = {'col1':[1,'nan','nan','nan',5,'nan',7,'nan',9,'nan','nan','nan',13]*1000,\n",
" 'col2':[20,'nan','nan','nan',22,'nan',25,'nan',30,'nan','nan','nan',25]*1000,\n",
" 'col3':[15,'nan','nan','nan',10,'nan',14,'nan',13,'nan','nan','nan',9]*1000}\n",
" return pd.DataFrame(dictx).astype(float)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Method 1 "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = make_data()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 624 ms, sys: 11.2 ms, total: 635 ms\n",
"Wall time: 651 ms\n"
]
}
],
"source": [
"%%time\n",
"limit = 2\n",
"notnull = pd.notnull(df).all(axis=1)\n",
"# assign group numbers to the rows of df. Each group starts with a non-null row,\n",
"# followed by null rows\n",
"group = notnull.cumsum()\n",
"# find the index of groups having length > limit\n",
"ignore = (df.groupby(group).filter(lambda grp: len(grp)>limit)).index\n",
"# only ignore rows which are null\n",
"ignore = df.loc[~notnull].index.intersection(ignore)\n",
"keep = df.index.difference(ignore)\n",
"# interpolate only the kept rows\n",
"df.loc[keep] = df.loc[keep].interpolate()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Method 2 (this is 4 times slower)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df2 = make_data()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.46 s, sys: 83.3 ms, total: 2.55 s\n",
"Wall time: 2.54 s\n"
]
}
],
"source": [
"%%time\n",
"def interp(df, limit):\n",
" d = df.notna().rolling(limit + 1).agg(any).fillna(1)\n",
" d = pd.concat({\n",
" i: d.shift(-i).fillna(1)\n",
" for i in range(limit + 1)\n",
" }).prod(level=1)\n",
"\n",
" return df.interpolate(limit=limit).where(d.astype(bool))\n",
"\n",
"df2 = df2.pipe(interp, 2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Check if both generate the same result "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"pd.testing.assert_frame_equal(df, df2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# The pure `DataFrame.interpolate()` is 50 times faster than method 1... "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = make_data()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 11 ms, sys: 1.89 ms, total: 12.9 ms\n",
"Wall time: 11.7 ms\n"
]
}
],
"source": [
"%%time\n",
"foo = df.interpolate(limit=2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# My new Method 3 \n",
"using `cumsum()` of NaNs and `diff()` to calculate the gap width"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def interpolate_nan_max_gap_series(s, maxgap): \n",
" df = pd.DataFrame(s)\n",
" \n",
" # Calculate NaN gap width\n",
" df['nan_gap_width'] = (s\n",
" .isnull()\n",
" .cumsum()\n",
" .loc[~s.isnull()]\n",
" .diff()\n",
" )\n",
" df['nan_gap_width'] = df['nan_gap_width'].fillna(method='bfill')\n",
" \n",
" # Interpolate all NaNs but fill back in those in the\n",
" # gaps that are longer than the allowed limite\n",
" nan_ix = s.isnull()\n",
" s = s.interpolate()\n",
" s[(df['nan_gap_width'] > maxgap) & nan_ix] = pd.np.nan\n",
" \n",
" return s\n",
"\n",
"def interpolate_nan_max_gap_frame(df, maxgap):\n",
" for column_name in df:\n",
" df[column_name] = interpolate_nan_max_gap_series(df[column_name], maxgap=maxgap)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df3 = make_data()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 9.42 ms, sys: 1.82 ms, total: 11.2 ms\n",
"Wall time: 10.1 ms\n"
]
}
],
"source": [
"%%time\n",
"res = interpolate_nan_max_gap_series(df3.col1, maxgap=2)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"df3 = make_data()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 26.1 ms, sys: 2.27 ms, total: 28.4 ms\n",
"Wall time: 27.1 ms\n"
]
}
],
"source": [
"%%time\n",
"res = interpolate_nan_max_gap_frame(df3, maxgap=2)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"df3 = make_data()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.05 ms, sys: 1.22 ms, total: 5.26 ms\n",
"Wall time: 4.32 ms\n"
]
}
],
"source": [
"%%time\n",
"res = df3.col1.interpolate(limit=3)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"df3 = make_data()\n",
"res = interpolate_nan_max_gap_frame(df3, maxgap=2)\n",
"pd.testing.assert_frame_equal(df2, res)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment