Skip to content

Instantly share code, notes, and snippets.

@khaeru
Last active March 13, 2024 15:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save khaeru/6548166e9d3f928b6b0041c464d9e05d to your computer and use it in GitHub Desktop.
Save khaeru/6548166e9d3f928b6b0041c464d9e05d to your computer and use it in GitHub Desktop.
Store pandas data in an SQL(ite) BLOB field using the Parquet format
"""Store pandas data in an SQL(ite) BLOB field using the Parquet format.
© 2024 Paul Natsuo Kishimoto <mail@paul.kishimoto.name>.
Licensed under the GNU GPL v3 https://www.gnu.org/licenses/gpl-3.0.en.html.
"""
import sqlite3
from dataclasses import dataclass
from io import BytesIO
import pandas as pd
from pandas.testing import assert_frame_equal
@dataclass
class Item:
"""Structure information."""
name: str
dims: list[str]
values: list[str]
attributes: list[str]
@dataclass
class DataObject:
"""Data for a particular structure."""
item: Item
data: pd.DataFrame
def __post_init__(self):
self.data.columns = self.item.dims + self.item.values + self.item.attributes
self.data.info() # Show memory usage
@property
def data_for_sql(self) -> tuple[str, bytes]:
return (self.item.name, self.data.to_parquet(index=False))
@classmethod
def from_db(cls, item: Item, connection) -> "DataObject":
# Retrieve the blob; store in a buffer
with connection:
result = connection.execute(
"SELECT data FROM item_data WHERE name = ?", (item.name,)
)
buffer = BytesIO(result.fetchone()[0])
# Decode and instantiate
return cls(item, pd.read_parquet(buffer))
if __name__ == "__main__":
# A 'parameter' with 3 dimensions, 1 numeric value per key, and 1 attribute
i = Item("my_par", ["X", "Y", "Z"], ["value"], ["unit"])
# Similarly, a 'variable' with 2 dims and 2 numeric values. DataObject doesn't care.
# i = Item("my_var", ["X", "Y"], ["level", "marginal"], ["unit"])
# Create an object with some data. The following exaggerate the size reduction,
# because of repeated values.
# k = 1 # 4 rows; memory usage 292 B; SQLite file size about 12 kB
k = 1_000 # 4k 156.6+ kB; 12 kB
# k = 1_000_000 # 4m 152.6+ MB; 180 kB
# k = 10_000_000 # 40m 1.5+ GB; 1.7 MB
DATA = [
["x1", "y1", "z11", 2.1, "kg"],
["x2", "y2", "z32", 3.1, "kg"],
["x3", "y1", "z56", 4.1, "kg"],
["x4", "y2", "z89", 5.1, "kg"],
] * k
do1 = DataObject(i, pd.DataFrame(DATA))
with sqlite3.connect("demo.db") as c1:
# Create a table
c1.execute("CREATE TABLE IF NOT EXISTS item_data (name TEXT UNIQUE, data BLOB)")
# Store `do1`, commit, close the connection
c1.execute("INSERT OR REPLACE INTO item_data VALUES (?, ?)", do1.data_for_sql)
# Connect again
with sqlite3.connect("demo.db") as c2:
# Create `do2` by reading and deserializing the data
do2 = DataObject.from_db(i, c2)
# Object data are equal
assert_frame_equal(do1.data, do2.data) # Slow for k >= 1_000_000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment