Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Save JPEG images as compressed binary data instead of as dense (C, H, W) uint8 tensors.
import torch
import io
from PIL import Image
import numpy as np
# Dataset class that encodes each image as JPEG bytes so the compressed data can be stored
class ImageDataset:
    """Map-style dataset whose items are the JPEG-encoded bytes of its images.

    ``self.images`` holds the source images (a list of PIL images);
    indexing the dataset re-encodes the selected image in memory and
    returns the encoded byte string.
    """

    def __init__(self):
        super().__init__()
        # Placeholder: populate with the PIL images to be encoded.
        self.images = []

    def __len__(self):
        """Return the number of source images."""
        return len(self.images)

    def __getitem__(self, index):
        """Return the image at ``index`` encoded as JPEG ``bytes``."""
        # Encode into an in-memory buffer; nothing is written to disk here.
        # NOTE(review): Pillow's JPEG writer rejects some modes (e.g. RGBA);
        # presumably the images are RGB/L already -- confirm with the caller.
        with io.BytesIO() as buffer:
            self.images[index].save(buffer, 'jpeg')
            return buffer.getvalue()
import h5py
def make_h5():
    """Write the JPEG bytes of every ``ImageDataset`` item to ``images.h5``.

    The encoded images are stored in a one-dimensional, variable-length
    uint8 dataset named ``'crops'``, one entry per image, in dataset order
    (the loader is created with ``shuffle=False``).
    """
    # The progress bar below needs tqdm, which the file never imported;
    # importing it here keeps the function self-contained.
    import tqdm

    source = ImageDataset()
    loader = torch.utils.data.DataLoader(source, batch_size=256, shuffle=False, num_workers=16)
    vlen_uint8 = h5py.special_dtype(vlen=np.dtype('uint8'))

    # The context manager closes the file even if encoding fails midway.
    with h5py.File('images.h5', 'w') as hf:
        # Separate name for the HDF5 dataset so it does not shadow ``source``.
        crops_dset = hf.create_dataset('crops', (len(source), ), dtype=vlen_uint8)
        count = 0
        for crops in tqdm.tqdm(loader, total=len(loader)):
            # Each batch is iterated item by item; every item is expected to be
            # a bytes object (assumes the default collate leaves bytes
            # un-stacked -- TODO confirm for the installed torch version).
            for crop in crops:
                crops_dset[count] = np.frombuffer(crop, dtype='uint8')
                count += 1
# Use the HDF5 file written above to load images in the actual dataset class
class H5Dataset:
    """Dataset that decodes images from the ``'crops'`` dataset of ``images.h5``.

    The file handle is opened on first item access rather than in
    ``__init__`` (presumably so each DataLoader worker process opens its own
    handle -- confirm against how the dataset is used).
    """

    def __init__(self):
        super().__init__()
        # Opened lazily by ``__getitem__``.
        self.hf = None
        # Hard-coded image count ("stored metadata"); it must match the number
        # of entries actually written to the file.
        self.num_images = 10000

    def __getitem__(self, index):
        """Return the image at ``index`` as a PIL image decoded from its stored bytes."""
        if self.hf is None:
            self.hf = h5py.File('images.h5', 'r')
        # The encoded bytes live in the 'crops' dataset; an h5py File is keyed
        # by object name, so indexing the File itself with an integer fails.
        encoded = self.hf['crops'][index]
        return Image.open(io.BytesIO(encoded.tobytes()))

    def __len__(self):
        """Return the number of images recorded for this dataset."""
        return self.num_images
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.