Skip to content

Instantly share code, notes, and snippets.

@vsbuffalo
Created June 6, 2024 21:04
Show Gist options
  • Save vsbuffalo/5ae3d39649a50a4113a51bb1b6cc45d8 to your computer and use it in GitHub Desktop.
Save vsbuffalo/5ae3d39649a50a4113a51bb1b6cc45d8 to your computer and use it in GitHub Desktop.
polars rolling bin without truncating end
import polars as pl
import numpy as np
def windowed_bin(df, window_size, column, binedges, precision=5, log_precision=False):
    """Histogram ``column`` inside consecutive windows over the ``pos`` column.

    The frame is grouped with ``group_by_dynamic`` on ``pos`` using
    left-closed integer windows whose step and length are both
    ``window_size``. Within each window the values of ``column`` are
    counted with ``numpy.histogram`` against ``binedges``, and each count
    is written to its own output column.

    Args:
        df: Polars DataFrame containing a ``pos`` column and ``column``.
            It is flagged as sorted on ``pos`` via ``set_sorted`` and is
            not sorted here, so callers are expected to pass it ordered.
        window_size: Window length, also used as the step, in index units.
        column: Name of the column whose values are binned.
        binedges: Bin edges handed to ``numpy.histogram``.
        precision: Number of digits after the decimal point in bin labels.
        log_precision: When truthy, labels use scientific (``e``) notation;
            otherwise fixed-point (``f``) notation.

    Returns:
        The aggregated frame with the window boundaries renamed to
        ``start`` and ``end`` and one ``bin_<upper edge>`` column per bin;
        the intermediate ``bins`` list column is dropped.
    """

    def _histogram_counts(series_list):
        # map_groups hands over the selected column(s) for one window.
        counts, _edges = np.histogram(series_list, bins=binedges)
        return counts.tolist()

    # Each bin is labelled by its upper edge, hence the first edge is skipped.
    number_format = "e" if log_precision else "f"
    column_names = [
        f"bin_{upper_edge:.{precision}{number_format}}" for upper_edge in binedges[1:]
    ]

    # Step and length share one spec; the "i" suffix means integer index units.
    span = f"{window_size}i"
    grouped = df.set_sorted("pos").group_by_dynamic(
        index_column="pos",
        every=span,
        period=span,
        offset="0i",
        closed="left",
        include_boundaries=True,
    )
    per_window = grouped.agg(
        [pl.map_groups(pl.col(column), _histogram_counts).alias("bins")]
    ).rename({"_lower_boundary": "start", "_upper_boundary": "end"})

    # Spread the per-window list of counts into one named column per bin.
    count_columns = [
        pl.col("bins").list.get(idx).alias(name)
        for idx, name in enumerate(column_names)
    ]
    return per_window.with_columns(count_columns).select(pl.exclude("bins"))
# Example input: positions 1..9, each paired with a value from 0.1 to 0.9.
df = pl.DataFrame(
    {
        "pos": list(range(1, 10)),
        "value": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    }
)

# Binning setup: which column to histogram, the bin edges, and the window length.
column = "value"
binedges = [0.0, 0.5, 1.0]
window_size = 3

# Windows are expected to start at 0, 3, 6 and 9; the last one holds only pos=9.
result = windowed_bin(df, window_size=window_size, column=column, binedges=binedges)
print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment