Skip to content

Instantly share code, notes, and snippets.

@xLaszlo
Last active January 6, 2023 17:53
Show Gist options
  • Save xLaszlo/f09c46429239e78fd982e792f4adde67 to your computer and use it in GitHub Desktop.
Save xLaszlo/f09c46429239e78fd982e792f4adde67 to your computer and use it in GitHub Desktop.
File-like class to store dataclasses or pydantic models as gzipped JSONL files
import gzip
import json
from dataclasses import asdict
from pydantic import BaseModel
class DataFile:
    """File-like reader/writer for gzipped JSON Lines files of typed records.

    Every line of the file is one JSON object. On write, a record is
    serialized from a dataclass instance or a pydantic ``BaseModel``; on read,
    each line is parsed and turned back into an object, either through
    ``loader`` or by calling ``data_type(**parsed_json)``.

    The object can be used as a context manager (``with DataFile(...) as f``)
    or iterated directly, in which case the file is opened and closed
    automatically around the iteration.
    """

    def __init__(self, data_type, filename, mode, loader=None):
        """Store the configuration; the file itself is not opened here.

        Args:
            data_type: Class of the records stored in the file. ``write``
                accepts only instances of exactly this class, and reading
                without a ``loader`` builds instances via ``data_type(**row)``.
            filename: Path of the gzipped JSONL file.
            mode: One of ``'r'``, ``'w'``, ``'rt'``, ``'wt'``. The file is
                always opened in text mode, so only the first letter is kept.
            loader: Optional callable that receives the parsed JSON of one
                line and returns the record; replaces ``data_type(**row)``.

        Raises:
            ValueError: If ``mode`` is not one of the accepted values.
        """
        if mode not in ('r', 'w', 'rt', 'wt'):
            raise ValueError(f"{mode} must be 'r', 'w', 'rt', 'wt'")
        self.data_type = data_type
        self.filename = filename
        self.mode = mode[0]
        self.file_handler = None
        self.loader = loader

    def open(self):
        """Open the gzip file in text mode for reading or writing."""
        # JSON is UTF-8 by specification; do not depend on the locale default.
        self.file_handler = gzip.open(
            self.filename, f'{self.mode}t', encoding='utf-8'
        )

    def close(self):
        """Close the underlying file; a no-op when it is not open."""
        if self.file_handler is None:
            return
        try:
            self.file_handler.close()
        finally:
            # Drop the handle even if flushing failed, so the object never
            # keeps pointing at a half-closed stream.
            self.file_handler = None

    def __enter__(self):
        """Open the file and return this object."""
        self.open()
        return self

    def __exit__(self, *args):
        """Close the file; exceptions from the ``with`` body propagate."""
        self.close()

    def __iter__(self):
        """Yield one record per non-blank line of the file.

        When the file is not open yet, it is opened for the duration of the
        iteration and closed afterwards.
        """
        if self.file_handler is None:
            with self:
                yield from self
            return
        for row in self.file_handler:
            if not row.strip():
                # Tolerate blank lines (e.g. a trailing newline added by hand).
                continue
            record = json.loads(row)
            if self.loader is not None:
                yield self.loader(record)
            else:
                yield self.data_type(**record)

    def write(self, data):
        """Serialize ``data`` to JSON and append it as one line.

        Dataclass instances are converted with ``asdict``; values that JSON
        cannot encode natively are serialized through their ``isoformat()``
        method. Pydantic models use their own JSON serializer.

        Args:
            data: A dataclass instance or pydantic ``BaseModel`` whose class
                is exactly ``data_type``.

        Raises:
            ValueError: If the file is not open, if the class of ``data`` is
                not ``data_type``, or if ``data`` is neither a dataclass
                instance nor a pydantic ``BaseModel``.
        """
        if self.file_handler is None:
            raise ValueError(
                f'{self.filename} is not open; call open() or use a with block'
            )
        # Strict class match (subclasses are rejected on purpose): reading
        # rebuilds every row as ``data_type``.
        if data.__class__ != self.data_type:
            raise ValueError(
                f"Type of input data {data.__class__} doesn't match file's {self.data_type}"
            )
        if hasattr(data, '__dataclass_fields__'):
            row = json.dumps(asdict(data), default=lambda dt: dt.isoformat())
        elif isinstance(data, BaseModel):
            # pydantic v2 deprecates ``.json()`` in favour of
            # ``model_dump_json()``; fall back to ``.json()`` on v1.
            dump_json = getattr(data, 'model_dump_json', None)
            row = dump_json() if callable(dump_json) else data.json()
        else:
            raise ValueError(
                'Only pydantic BaseModel and python dataclass types are allowed, '
                f'got: {type(data)}'
            )
        self.file_handler.write(f'{row}\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment