Last active
March 13, 2024 15:36
-
-
Save khaeru/6548166e9d3f928b6b0041c464d9e05d to your computer and use it in GitHub Desktop.
Store pandas data in an SQL(ite) BLOB field using the Parquet format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Store pandas data in an SQL(ite) BLOB field using the Parquet format. | |
© 2024 Paul Natsuo Kishimoto <mail@paul.kishimoto.name>. | |
Licensed under the GNU GPL v3 https://www.gnu.org/licenses/gpl-3.0.en.html. | |
""" | |
import sqlite3 | |
from dataclasses import dataclass | |
from io import BytesIO | |
import pandas as pd | |
from pandas.testing import assert_frame_equal | |
@dataclass | |
class Item: | |
"""Structure information.""" | |
name: str | |
dims: list[str] | |
values: list[str] | |
attributes: list[str] | |
@dataclass | |
class DataObject: | |
"""Data for a particular structure.""" | |
item: Item | |
data: pd.DataFrame | |
def __post_init__(self): | |
self.data.columns = self.item.dims + self.item.values + self.item.attributes | |
self.data.info() # Show memory usage | |
@property | |
def data_for_sql(self) -> tuple[str, bytes]: | |
return (self.item.name, self.data.to_parquet(index=False)) | |
@classmethod | |
def from_db(cls, item: Item, connection) -> "DataObject": | |
# Retrieve the blob; store in a buffer | |
with connection: | |
result = connection.execute( | |
"SELECT data FROM item_data WHERE name = ?", (item.name,) | |
) | |
buffer = BytesIO(result.fetchone()[0]) | |
# Decode and instantiate | |
return cls(item, pd.read_parquet(buffer)) | |
if __name__ == "__main__": | |
# A 'parameter' with 3 dimensions, 1 numeric value per key, and 1 attribute | |
i = Item("my_par", ["X", "Y", "Z"], ["value"], ["unit"]) | |
# Similarly, a 'variable' with 2 dims and 2 numeric values. DataObject doesn't care. | |
# i = Item("my_var", ["X", "Y"], ["level", "marginal"], ["unit"]) | |
# Create an object with some data. The following exaggerate the size reduction, | |
# because of repeated values. | |
# k = 1 # 4 rows; memory usage 292 B; SQLite file size about 12 kB | |
k = 1_000 # 4k 156.6+ kB; 12 kB | |
# k = 1_000_000 # 4m 152.6+ MB; 180 kB | |
# k = 10_000_000 # 40m 1.5+ GB; 1.7 MB | |
DATA = [ | |
["x1", "y1", "z11", 2.1, "kg"], | |
["x2", "y2", "z32", 3.1, "kg"], | |
["x3", "y1", "z56", 4.1, "kg"], | |
["x4", "y2", "z89", 5.1, "kg"], | |
] * k | |
do1 = DataObject(i, pd.DataFrame(DATA)) | |
with sqlite3.connect("demo.db") as c1: | |
# Create a table | |
c1.execute("CREATE TABLE IF NOT EXISTS item_data (name TEXT UNIQUE, data BLOB)") | |
# Store `do1`, commit, close the connection | |
c1.execute("INSERT OR REPLACE INTO item_data VALUES (?, ?)", do1.data_for_sql) | |
# Connect again | |
with sqlite3.connect("demo.db") as c2: | |
# Create `do2` by reading and deserializing the data | |
do2 = DataObject.from_db(i, c2) | |
# Object data are equal | |
assert_frame_equal(do1.data, do2.data) # Slow for k >= 1_000_000 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment