Created
February 8, 2025 20:23
-
-
Save lmmx/b0c5a09d6e2a4832b42778fe6abee1d4 to your computer and use it in GitHub Desktop.
Demo: attach metadata to a Polars DataFrame, keyed by `id(df)` (the metadata belongs to that specific DataFrame object and is only reachable while its `id` stays the same)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import polars as pl | |
from polars.api import register_dataframe_namespace | |
import weakref | |
@register_dataframe_namespace("config_meta")
class ConfigMetaPlugin:
    """
    Attach arbitrary metadata to a Polars DataFrame, keyed by its ``id(df)``.

    A weak reference to each registered DataFrame is kept alongside its
    metadata; when the DataFrame is garbage-collected, the weakref callback
    removes both registry entries.

    The metadata is tied to the DataFrame *object*, not its contents: a
    different DataFrame object has a different ``id`` and therefore its own
    (initially empty) metadata dict.
    """

    # Class-level registries, shared by every plugin instance:
    #   - _df_id_to_meta: {id(df) -> dict_of_metadata}
    #   - _df_id_to_ref:  {id(df) -> weakref_to_df}
    _df_id_to_meta = {}
    _df_id_to_ref = {}

    def __init__(self, df: pl.DataFrame):
        """
        Bind the plugin to ``df`` and register ``df`` if it is not yet known.

        Args:
            df: The DataFrame whose metadata this plugin instance exposes.
        """
        self._df = df
        self._df_id = df_id = id(df)

        # An id only identifies an object while that object is alive, so a
        # registry hit is trusted only if the stored weakref still resolves
        # to *this* df. Otherwise the entry is stale (left over from a dead
        # object whose id was reused) and must not leak its metadata.
        known_ref = self._df_id_to_ref.get(df_id)
        if (
            known_ref is not None
            and known_ref() is df
            and df_id in self._df_id_to_meta
        ):
            return

        owner = type(self)
        self._df_id_to_meta[df_id] = {}
        # The callback closes over `owner` and `df_id` only -- never `self`
        # or `df` -- so it cannot keep the DataFrame alive.
        self._df_id_to_ref[df_id] = weakref.ref(
            df, lambda dead_ref: owner._cleanup(dead_ref, df_id)
        )

    @classmethod
    def _cleanup(cls, df_weakref, df_id=None):
        """
        Remove the registry entries belonging to a collected DataFrame.

        Args:
            df_weakref: The weak reference whose referent has been collected.
            df_id: The id the referent was registered under. When omitted,
                the registry is searched for ``df_weakref`` instead (O(n)).
        """
        if df_id is None:
            # Search a snapshot so that a nested callback mutating the
            # registry cannot invalidate the iteration.
            for candidate_id, wref in tuple(cls._df_id_to_ref.items()):
                if wref is df_weakref:
                    df_id = candidate_id
                    break
            else:
                return

        # Only drop the slot if it still belongs to this weakref; it may
        # already have been re-registered for a newer DataFrame.
        if cls._df_id_to_ref.get(df_id) is not df_weakref:
            return
        cls._df_id_to_ref.pop(df_id, None)
        cls._df_id_to_meta.pop(df_id, None)

    def set(self, **kwargs) -> None:
        """
        Set (overwrite) multiple key/value pairs in this DataFrame's metadata.

        Args:
            **kwargs: Metadata entries, given as keyword arguments.
        """
        self._df_id_to_meta[self._df_id].update(kwargs)

    def update(self, mapping: dict) -> None:
        """
        Update this DataFrame's metadata from a dict.

        Args:
            mapping: Entries to add; existing keys are overwritten.
        """
        self._df_id_to_meta[self._df_id].update(mapping)

    def merge(self, *dfs: pl.DataFrame) -> None:
        """
        Merge (dict.update) metadata from other DataFrames into the current one.

        Args:
            *dfs: DataFrames whose metadata is copied in, in the order given;
                on key collisions, later DataFrames win.
        """
        own_meta = self._df_id_to_meta[self._df_id]
        for other_df in dfs:
            # Instantiating the plugin registers other_df if it is unknown,
            # so it is guaranteed to have a (possibly empty) metadata dict.
            own_meta.update(type(self)(other_df).get_metadata())

    def get_metadata(self) -> dict:
        """
        Return the metadata dictionary for this DataFrame.

        Returns:
            The stored dict itself (not a copy), so mutating it changes the
            DataFrame's metadata.
        """
        return self._df_id_to_meta[self._df_id]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import polars as pl | |
import config_plugin # Must import so the plugin is registered | |
# Build three separate frames of increasing width/height.
df1 = pl.DataFrame({"a": [1]})
df2 = pl.DataFrame({"a": [1, 2], "b": [1, 2]})
df3 = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})

# Tag the first two frames, one via keyword arguments and one via a dict.
df1.config_meta.set(name="foo")
df2.config_meta.update({"confidence": 0.95})

# Fold df1's metadata, then df2's, into df3.
df3.config_meta.merge(df1, df2)

# Show what each frame ended up with.
for label, frame in (
    ("df1 metadata:", df1),
    ("df2 metadata:", df2),
    ("df3 metadata:", df3),
):
    print(label, frame.config_meta.get_metadata())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output: