# Tushar-N/jpeg_to_h5.py

## jpeg_to_h5.py
import torch
import io
from PIL import Image
import numpy as np

# Dataset class for extracting binary data from images to store
class ImageDataset:
    """Map-style dataset that serves each image as JPEG-encoded bytes.

    Items are returned as raw ``bytes`` (the encoded JPEG stream), not as
    decoded pixels.

    Args:
        images: Optional iterable of image objects exposing a PIL-style
            ``save(fp, format)`` method. When omitted the dataset is empty,
            which matches the original no-argument constructor.
    """

    def __init__(self, images=None):
        super().__init__()
        # Copy into a list so indexing and len() work for any iterable.
        self.images = [] if images is None else list(images)

    def __getitem__(self, index):
        """Return the image at ``index`` encoded as JPEG bytes."""
        # getvalue() copies the payload out, so the in-memory buffer can be
        # released as soon as the encoded bytes have been taken.
        with io.BytesIO() as buffer:
            self.images[index].save(buffer, 'jpeg')
            return buffer.getvalue()

    def __len__(self):
        """Return the number of images held by the dataset."""
        return len(self.images)

import h5py
def make_h5(dataset=None, path='images.h5', batch_size=256, num_workers=16):
    """Write every item of a dataset into a variable-length uint8 HDF5 dataset.

    Each item is expected to be a bytes-like object (``ImageDataset`` yields
    JPEG-encoded bytes). Because the loader does not shuffle, item ``i`` of
    the source dataset is stored at index ``i`` of the HDF5 dataset named
    ``'crops'``.

    Args:
        dataset: Map-style dataset to export; defaults to ``ImageDataset()``.
        path: Destination HDF5 file, opened in ``'w'`` mode.
        batch_size: Number of items fetched per ``DataLoader`` batch.
        num_workers: Number of ``DataLoader`` worker processes.
    """
    if dataset is None:
        dataset = ImageDataset()
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # The progress bar is cosmetic. tqdm was referenced without ever being
    # imported, so import it here and fall back to plain iteration when the
    # package is not installed.
    try:
        import tqdm
    except ImportError:
        batches = loader
    else:
        batches = tqdm.tqdm(loader, total=len(loader))

    # The context manager closes the file even if a write fails part-way.
    with h5py.File(path, 'w') as hf:
        # Kept under a separate name so the source dataset is not shadowed.
        crops_dset = hf.create_dataset(
            'crops', (len(dataset),),
            dtype=h5py.special_dtype(vlen=np.dtype('uint8')))
        count = 0
        for crops in batches:
            for crop in crops:
                crops_dset[count] = np.frombuffer(crop, dtype='uint8')
                count += 1


# Use the h5 to load images in the actual dataset class
class H5Dataset:
    """Dataset that decodes images stored as encoded bytes in an HDF5 file.

    The file is opened on the first ``__getitem__`` call rather than in
    ``__init__`` (presumably so that each DataLoader worker process opens its
    own handle -- confirm against how the dataset is consumed).

    Args:
        path: HDF5 file to read from.
        num_images: Value reported by ``len()``. It is stored metadata and is
            not read back from the file.
        key: Name of the HDF5 dataset inside the file that holds the encoded
            images; ``'crops'`` is the name ``make_h5`` writes to.
    """

    def __init__(self, path='images.h5', num_images=10000, key='crops'):
        super().__init__()
        self.path = path
        self.key = key
        self.hf = None  # opened lazily on first item access
        self.num_images = num_images  # stored metadata

    def __getitem__(self, index):
        """Return the image at ``index`` as an object from ``Image.open``."""
        if self.hf is None:
            self.hf = h5py.File(self.path, 'r')
        # Index the named dataset, not the file itself: an h5py.File is keyed
        # by object name, so ``self.hf[index]`` with an integer cannot reach
        # the stored crops.
        encoded = self.hf[self.key][index]
        return Image.open(io.BytesIO(encoded))

    def __len__(self):
        """Return the configured number of images."""
        return self.num_images
	import torch
	import io
	from PIL import Image
	import numpy as np

	# Dataset class for extracting binary data from images to store
	class ImageDataset:
		"""Map-style dataset that serves each image as JPEG-encoded bytes.

		Items are returned as raw ``bytes`` (the encoded JPEG stream), not as
		decoded pixels.

		Args:
			images: Optional iterable of image objects exposing a PIL-style
				``save(fp, format)`` method. When omitted the dataset is empty,
				which matches the original no-argument constructor.
		"""

		def __init__(self, images=None):
			super().__init__()
			# Copy into a list so indexing and len() work for any iterable.
			self.images = [] if images is None else list(images)

		def __getitem__(self, index):
			"""Return the image at ``index`` encoded as JPEG bytes."""
			# getvalue() copies the payload out, so the in-memory buffer can be
			# released as soon as the encoded bytes have been taken.
			with io.BytesIO() as buffer:
				self.images[index].save(buffer, 'jpeg')
				return buffer.getvalue()

		def __len__(self):
			"""Return the number of images held by the dataset."""
			return len(self.images)

	import h5py
	def make_h5(dataset=None, path='images.h5', batch_size=256, num_workers=16):
		"""Write every item of a dataset into a variable-length uint8 HDF5 dataset.

		Each item is expected to be a bytes-like object (``ImageDataset`` yields
		JPEG-encoded bytes). Because the loader does not shuffle, item ``i`` of
		the source dataset is stored at index ``i`` of the HDF5 dataset named
		``'crops'``.

		Args:
			dataset: Map-style dataset to export; defaults to ``ImageDataset()``.
			path: Destination HDF5 file, opened in ``'w'`` mode.
			batch_size: Number of items fetched per ``DataLoader`` batch.
			num_workers: Number of ``DataLoader`` worker processes.
		"""
		if dataset is None:
			dataset = ImageDataset()
		loader = torch.utils.data.DataLoader(
			dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

		# The progress bar is cosmetic. tqdm was referenced without ever being
		# imported, so import it here and fall back to plain iteration when the
		# package is not installed.
		try:
			import tqdm
		except ImportError:
			batches = loader
		else:
			batches = tqdm.tqdm(loader, total=len(loader))

		# The context manager closes the file even if a write fails part-way.
		with h5py.File(path, 'w') as hf:
			# Kept under a separate name so the source dataset is not shadowed.
			crops_dset = hf.create_dataset(
				'crops', (len(dataset),),
				dtype=h5py.special_dtype(vlen=np.dtype('uint8')))
			count = 0
			for crops in batches:
				for crop in crops:
					crops_dset[count] = np.frombuffer(crop, dtype='uint8')
					count += 1


	# Use the h5 to load images in the actual dataset class
	class H5Dataset:
		"""Dataset that decodes images stored as encoded bytes in an HDF5 file.

		The file is opened on the first ``__getitem__`` call rather than in
		``__init__`` (presumably so that each DataLoader worker process opens its
		own handle -- confirm against how the dataset is consumed).

		Args:
			path: HDF5 file to read from.
			num_images: Value reported by ``len()``. It is stored metadata and is
				not read back from the file.
			key: Name of the HDF5 dataset inside the file that holds the encoded
				images; ``'crops'`` is the name ``make_h5`` writes to.
		"""

		def __init__(self, path='images.h5', num_images=10000, key='crops'):
			super().__init__()
			self.path = path
			self.key = key
			self.hf = None  # opened lazily on first item access
			self.num_images = num_images  # stored metadata

		def __getitem__(self, index):
			"""Return the image at ``index`` as an object from ``Image.open``."""
			if self.hf is None:
				self.hf = h5py.File(self.path, 'r')
			# Index the named dataset, not the file itself: an h5py.File is keyed
			# by object name, so ``self.hf[index]`` with an integer cannot reach
			# the stored crops.
			encoded = self.hf[self.key][index]
			return Image.open(io.BytesIO(encoded))

		def __len__(self):
			"""Return the configured number of images."""
			return self.num_images