khaeru/pandas_parquet_in_blob.py

## pandas_parquet_in_blob.py
"""Store pandas data in an SQL(ite) BLOB field using the Parquet format.

© 2024 Paul Natsuo Kishimoto <mail@paul.kishimoto.name>.
Licensed under the GNU GPL v3 https://www.gnu.org/licenses/gpl-3.0.en.html.
"""

import sqlite3
from dataclasses import dataclass
from io import BytesIO

import pandas as pd
from pandas.testing import assert_frame_equal


@dataclass
class Item:
    """Structure information."""

    name: str

    dims: list[str]
    values: list[str]
    attributes: list[str]


@dataclass
class DataObject:
    """Data for a particular structure."""

    item: Item
    data: pd.DataFrame

    def __post_init__(self):
        self.data.columns = self.item.dims + self.item.values + self.item.attributes
        self.data.info()  # Show memory usage

    @property
    def data_for_sql(self) -> tuple[str, bytes]:
        return (self.item.name, self.data.to_parquet(index=False))

    @classmethod
    def from_db(cls, item: Item, connection) -> "DataObject":
        # Retrieve the blob; store in a buffer
        with connection:
            result = connection.execute(
                "SELECT data FROM item_data WHERE name = ?", (item.name,)
            )
        buffer = BytesIO(result.fetchone()[0])

        # Decode and instantiate
        return cls(item, pd.read_parquet(buffer))


if __name__ == "__main__":
    # A 'parameter' with 3 dimensions, 1 numeric value per key, and 1 attribute
    i = Item("my_par", ["X", "Y", "Z"], ["value"], ["unit"])

    # Similarly, a 'variable' with 2 dims and 2 numeric values. DataObject doesn't care.
    # i = Item("my_var",  ["X", "Y"], ["level", "marginal"], ["unit"])

    # Create an object with some data. The following exaggerate the size reduction,
    # because of repeated values.
    # k = 1  #           4  rows; memory usage 292     B; SQLite file size about 12 kB
    k = 1_000  #         4k                    156.6+ kB;                        12 kB
    # k = 1_000_000  #   4m                    152.6+ MB;                       180 kB
    # k = 10_000_000  # 40m                      1.5+ GB;                       1.7 MB
    DATA = [
        ["x1", "y1", "z11", 2.1, "kg"],
        ["x2", "y2", "z32", 3.1, "kg"],
        ["x3", "y1", "z56", 4.1, "kg"],
        ["x4", "y2", "z89", 5.1, "kg"],
    ] * k
    do1 = DataObject(i, pd.DataFrame(DATA))

    with sqlite3.connect("demo.db") as c1:
        # Create a table
        c1.execute("CREATE TABLE IF NOT EXISTS item_data (name TEXT UNIQUE, data BLOB)")

        # Store `do1`, commit, close the connection
        c1.execute("INSERT OR REPLACE INTO item_data VALUES (?, ?)", do1.data_for_sql)

    # Connect again
    with sqlite3.connect("demo.db") as c2:
        # Create `do2` by reading and deserializing the data
        do2 = DataObject.from_db(i, c2)

    # Object data are equal
    assert_frame_equal(do1.data, do2.data)  # Slow for k >= 1_000_000
	"""Store pandas data in an SQL(ite) BLOB field using the Parquet format.

	© 2024 Paul Natsuo Kishimoto <mail@paul.kishimoto.name>.
	Licensed under the GNU GPL v3 https://www.gnu.org/licenses/gpl-3.0.en.html.
	"""

	import sqlite3
	from dataclasses import dataclass
	from io import BytesIO

	import pandas as pd
	from pandas.testing import assert_frame_equal


	@dataclass
	class Item:
	"""Structure information."""

	name: str

	dims: list[str]
	values: list[str]
	attributes: list[str]


	@dataclass
	class DataObject:
	"""Data for a particular structure."""

	item: Item
	data: pd.DataFrame

	def __post_init__(self):
	self.data.columns = self.item.dims + self.item.values + self.item.attributes
	self.data.info() # Show memory usage

	@property
	def data_for_sql(self) -> tuple[str, bytes]:
	return (self.item.name, self.data.to_parquet(index=False))

	@classmethod
	def from_db(cls, item: Item, connection) -> "DataObject":
	# Retrieve the blob; store in a buffer
	with connection:
	result = connection.execute(
	"SELECT data FROM item_data WHERE name = ?", (item.name,)
	)
	buffer = BytesIO(result.fetchone()[0])

	# Decode and instantiate
	return cls(item, pd.read_parquet(buffer))


	if __name__ == "__main__":
	# A 'parameter' with 3 dimensions, 1 numeric value per key, and 1 attribute
	i = Item("my_par", ["X", "Y", "Z"], ["value"], ["unit"])

	# Similarly, a 'variable' with 2 dims and 2 numeric values. DataObject doesn't care.
	# i = Item("my_var", ["X", "Y"], ["level", "marginal"], ["unit"])

	# Create an object with some data. The following exaggerate the size reduction,
	# because of repeated values.
	# k = 1 # 4 rows; memory usage 292 B; SQLite file size about 12 kB
	k = 1_000 # 4k 156.6+ kB; 12 kB
	# k = 1_000_000 # 4m 152.6+ MB; 180 kB
	# k = 10_000_000 # 40m 1.5+ GB; 1.7 MB
	DATA = [
	["x1", "y1", "z11", 2.1, "kg"],
	["x2", "y2", "z32", 3.1, "kg"],
	["x3", "y1", "z56", 4.1, "kg"],
	["x4", "y2", "z89", 5.1, "kg"],
	] * k
	do1 = DataObject(i, pd.DataFrame(DATA))

	with sqlite3.connect("demo.db") as c1:
	# Create a table
	c1.execute("CREATE TABLE IF NOT EXISTS item_data (name TEXT UNIQUE, data BLOB)")

	# Store `do1`, commit, close the connection
	c1.execute("INSERT OR REPLACE INTO item_data VALUES (?, ?)", do1.data_for_sql)

	# Connect again
	with sqlite3.connect("demo.db") as c2:
	# Create `do2` by reading and deserializing the data
	do2 = DataObject.from_db(i, c2)

	# Object data are equal
	assert_frame_equal(do1.data, do2.data) # Slow for k >= 1_000_000