Skip to content

Instantly share code, notes, and snippets.

@lmmx
Created February 8, 2025 20:23
Show Gist options
  • Save lmmx/b0c5a09d6e2a4832b42778fe6abee1d4 to your computer and use it in GitHub Desktop.
Save lmmx/b0c5a09d6e2a4832b42778fe6abee1d4 to your computer and use it in GitHub Desktop.
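Demo: attach metadata to a Polars DataFrame, keyed by `id(df)` (the metadata stays attached only while you keep using the same DataFrame object, because a different object has a different `id`)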
import polars as pl
from polars.api import register_dataframe_namespace
import weakref
@register_dataframe_namespace("config_meta")
class ConfigMetaPlugin:
    """
    Attach arbitrary metadata to a Polars DataFrame, keyed by its ``id(df)``.

    The plugin is registered under the ``config_meta`` namespace, so it is
    reached as ``df.config_meta``. Metadata is stored in class-level
    registries rather than on the frame itself, which ties it to the identity
    of one specific DataFrame object: a different DataFrame object has a
    different ``id`` and therefore starts with empty metadata.

    A weak reference to every registered frame is kept so that, once the
    frame is garbage-collected, its metadata entry is removed automatically.
    """

    # Global registries (class-level, shared by every plugin instance):
    #   _df_id_to_meta: {id(df) -> dict of metadata}
    #   _df_id_to_ref:  {id(df) -> weakref to df}
    _df_id_to_meta = {}
    _df_id_to_ref = {}

    def __init__(self, df: pl.DataFrame):
        """
        Bind the plugin to ``df`` and register the frame if it is not known.

        Args:
            df: The DataFrame whose metadata this plugin instance manages.
        """
        self._df = df
        self._df_id = id(df)

        # An existing entry whose weakref does not point at *this* frame was
        # left behind by an earlier object that happened to have the same id;
        # it must be replaced rather than inherited by ``df``.
        known_ref = self._df_id_to_ref.get(self._df_id)
        is_stale = known_ref is not None and known_ref() is not df

        if is_stale or self._df_id not in self._df_id_to_meta:
            self._df_id_to_meta[self._df_id] = {}

            # The id is bound into the callback so cleanup is a direct lookup
            # instead of a scan. The class and id are passed as defaults on
            # purpose: the callback must not close over ``self`` or ``df``,
            # otherwise the registry would keep the frame alive forever.
            def _on_collect(dead_ref, _cls=type(self), _df_id=self._df_id):
                _cls._cleanup(dead_ref, _df_id)

            self._df_id_to_ref[self._df_id] = weakref.ref(df, _on_collect)

    @classmethod
    def _cleanup(cls, df_weakref, df_id=None):
        """
        Remove the registry entries of a frame that has been garbage-collected.

        Args:
            df_weakref: The weak reference object whose referent was collected.
            df_id: The id the frame was registered under. When omitted, the
                registry is searched for ``df_weakref`` (O(n) in the number of
                registered frames).
        """
        if df_id is None:
            # Fallback for callers that only have the weakref: find its key.
            df_id = next(
                (
                    key
                    for key, ref in cls._df_id_to_ref.items()
                    if ref is df_weakref
                ),
                None,
            )
            if df_id is None:
                return
        elif cls._df_id_to_ref.get(df_id) is not df_weakref:
            # The slot is empty or now belongs to another weakref; leave it.
            return

        cls._df_id_to_ref.pop(df_id, None)
        cls._df_id_to_meta.pop(df_id, None)

    def set(self, **kwargs) -> None:
        """
        Set (overwrite) multiple key/value pairs in this DataFrame's metadata.

        Args:
            **kwargs: Metadata entries; each keyword becomes a key.
        """
        self._df_id_to_meta[self._df_id].update(kwargs)

    def update(self, mapping: dict) -> None:
        """
        Update this DataFrame's metadata from a dict.

        Args:
            mapping: Entries to add; existing keys are overwritten.
        """
        self._df_id_to_meta[self._df_id].update(mapping)

    def merge(self, *dfs: pl.DataFrame) -> None:
        """
        Merge (dict.update) metadata from other DataFrames into the current one.

        Frames are applied in the order given, so for a key present in several
        of them the value from the last frame wins.

        Args:
            *dfs: DataFrames whose metadata is copied into this frame's.
        """
        own_meta = self._df_id_to_meta[self._df_id]
        for other_df in dfs:
            # Instantiating the plugin registers ``other_df`` if it has never
            # been seen, so it definitely has an entry to read from.
            ConfigMetaPlugin(other_df)
            own_meta.update(self._df_id_to_meta.get(id(other_df), {}))

    def get_metadata(self) -> dict:
        """
        Return the metadata dictionary for this DataFrame.

        The stored dictionary itself is returned, not a copy, so mutating the
        result changes the frame's metadata.
        """
        return self._df_id_to_meta[self._df_id]
import polars as pl
import config_plugin # Must import so the plugin is registered
# df1 and df2 each get metadata of their own, through the two setter styles.
df1 = pl.DataFrame({"a": [1]})
df1.config_meta.set(name="foo")

df2 = pl.DataFrame({"a": [1, 2], "b": [1, 2]})
df2.config_meta.update({"confidence": 0.95})

# df3 starts without metadata and receives the entries of df1 and df2.
df3 = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
df3.config_meta.merge(df1, df2)

# Show what each frame ended up with.
for label, frame in (
    ("df1 metadata:", df1),
    ("df2 metadata:", df2),
    ("df3 metadata:", df3),
):
    print(label, frame.config_meta.get_metadata())
@lmmx
Copy link
Author

lmmx commented Feb 8, 2025

Output:

df1 metadata: {'name': 'foo'}
df2 metadata: {'confidence': 0.95}
df3 metadata: {'name': 'foo', 'confidence': 0.95}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment