Skip to content

Instantly share code, notes, and snippets.

@derricw
Created August 20, 2015 16:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save derricw/24f1c9ef154b12eca8f7 to your computer and use it in GitHub Desktop.
Pack data into hdf5 1GB at a time.
import logging
import numpy as np
import h5py
def get_fileobj_size(file_obj):
    """Return the size in bytes of an open file object.

    The file's current read position is saved and restored, so calling
    this has no visible effect on subsequent reads.

    Args:
        file_obj (FileObject): an open file object.

    Returns:
        int: size of the file in bytes
    """
    saved_position = file_obj.tell()
    file_obj.seek(0, 2)  # 2 == os.SEEK_END: jump to end of file
    byte_count = file_obj.tell()
    file_obj.seek(saved_position)  # put the cursor back where it was
    return byte_count
def chunk2hdf5(h5_file,
               data_file,
               dtype,
               data_shape=(-1,),
               data_name="data",
               chunk_size=10**9):
    """
    Loads the data into the hdf5 file in chunks. Useful for really big
    binary files.

    Args:
        h5_file (h5py.File or str): An hdf5 file to add the data to.
        data_file (FileObject or str): A file containing a binary data set
        dtype (numpy.dtype): intended data type of binary data
        data_shape (Optional[tuple]): intended shape of binary data; the
            first axis is the row axis and may be -1 (it is recomputed
            from the file size anyway)
        data_name (Optional[str]): name of destination dataset
        chunk_size (Optional[int]): maximum bytes per chunk. Default:10^9

    Returns:
        int: total items added to the hdf5 dataset.

    Raises:
        IOError: data file doesn't exist, or chunk shape is not a
            multiple of row size.
        NameError: Dataset already has data by that name.
        ValueError: chunk_size is smaller than a single row.
    """
    # get file objects if they gave us strings
    if isinstance(h5_file, str):
        h5_file = h5py.File(h5_file, 'a')
    if isinstance(data_file, str):
        data_file = open(data_file, 'rb')
    data_file.seek(0)
    if isinstance(data_shape, int):
        data_shape = (data_shape,)

    total_bytes = get_fileobj_size(data_file)
    itemsize = np.dtype(dtype).itemsize
    # bytes per row = itemsize * product of the non-row dimensions
    row_size = itemsize
    for dim in data_shape[1:]:
        row_size *= dim

    # BUG FIX: use floor division. In Python 3, '/' yields floats, and
    # h5py rejects a float in `shape` while np.fromfile rejects a float
    # `count`.
    total_rows = total_bytes // row_size
    rows_per_chunk = chunk_size // row_size
    if rows_per_chunk == 0:
        # Previously this silently wrote nothing (count=0 reads return
        # an empty array and the loop exits immediately).
        raise ValueError("chunk_size is smaller than a single row; "
                         "increase chunk_size or reduce row dimensions.")
    rounded_chunk_size = rows_per_chunk * row_size
    rounded_chunk_items = rounded_chunk_size // itemsize

    # rows grow along axis 0; trailing dims are fixed
    maxshape = [None] + list(data_shape[1:])
    data_shape = [total_rows] + list(data_shape[1:])

    # create the dataset if it doesn't exist
    if data_name not in h5_file.keys():
        dset = h5_file.create_dataset(data_name,
                                      shape=data_shape,
                                      dtype=dtype,
                                      maxshape=maxshape)
    else:
        raise NameError("Dataset already exists! Choose a new name.")

    # add the data to the dataset, one chunk at a time
    chunk_count = 0
    sample_count = 0
    reshape = [-1] + list(data_shape[1:])
    while True:
        # read a chunk (may be short or empty at end of file)
        chunk = np.fromfile(data_file,
                            dtype=dtype,
                            count=rounded_chunk_items,
                            )
        try:
            chunk = chunk.reshape(reshape)
        except ValueError:
            raise IOError("Chunk shape doesn't match desired shape.")
        # if chunk has rows, write it. if not break
        if chunk.shape[0] > 0:
            start = chunk_count * rows_per_chunk
            stop = start + chunk.shape[0]
            dset[start:stop] = chunk
            chunk_count += 1
            sample_count += chunk.size
        else:
            logging.info("chunk2hdf5 Finished!")
            break
    logging.info(" - moved {} samples in {} chunks.".format(sample_count,
                                                            chunk_count))
    return sample_count
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment