编译:设定setup.py中HDF_INSTALL,然后使用make命令进行编译。
使用:python3 hdf5_test.py
#include "hdf5.h" | |
#include "c_h5read.h" | |
/* | |
* 从H5文件中读取矩阵数据,以column order存储数据 | |
* | |
* filename: H5文件名 | |
* dataset_name: 数据集名 | |
* buffer: 数据内存存储处,大小为 num_rows * num_cols | |
* num_rows: 待读取的矩阵的行数 | |
* num_cols: 待读取的矩阵的列数 | |
* offset_x: 读取的数据在H5文件中的矩阵的x方向上的偏移 (第几行) | |
* offset_y: 读取的数据在H5文件中的矩阵的y方向上的偏移 (第几列) | |
* | |
*/ | |
int read_direct_f(char *file_name, char *dataset_name, double *buffer, | |
int num_rows, int num_cols, int offset_x, int offset_y) | |
{ | |
int i; | |
hid_t file_id, dataset_id; | |
hid_t dataspace_id, memspace_id; | |
herr_t status; | |
hsize_t dims_mem[1]; | |
hsize_t offset[2]; | |
hsize_t count[2]; | |
hsize_t stride[2]; | |
hsize_t block[2]; | |
hsize_t offset_mem[1]; | |
hsize_t count_mem[1]; | |
hsize_t stride_mem[1]; | |
hsize_t block_mem[1]; | |
// open the hdf5 file and the dataset | |
file_id = H5Fopen(file_name, H5F_ACC_RDONLY, H5P_DEFAULT); | |
dataset_id = H5Dopen2(file_id, dataset_name, H5P_DEFAULT); | |
// get file dataspace | |
dataspace_id = H5Dget_space(dataset_id); | |
// create memory space | |
dims_mem[0] = num_rows; // avoid numeric overflow | |
dims_mem[0] = dims_mem[0] * num_cols; | |
memspace_id = H5Screate_simple(1, dims_mem, NULL); | |
// specify size and shape of subset to read | |
offset[0] = offset_x; | |
offset[1] = offset_y; | |
count[0] = 1; | |
count[1] = num_cols; | |
stride[0] = 1; | |
stride[1] = 1; | |
block[0] = 1; | |
block[1] = 1; | |
offset_mem[0] = 0; | |
count_mem[0] = num_cols; | |
stride_mem[0] = num_rows; | |
block_mem[0] = 1; | |
for (i=0; i<num_rows; i++) { | |
offset[0] = offset_x + i; | |
offset_mem[0] = i; | |
// select subset from dataspace | |
status = H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, | |
offset, stride, count, block); | |
if (status < 0) { | |
H5Eprint2(H5E_DEFAULT, NULL); | |
return -1; | |
} | |
status = H5Sselect_hyperslab(memspace_id, H5S_SELECT_SET, | |
offset_mem, stride_mem, | |
count_mem, block_mem); | |
if (status < 0) { | |
H5Eprint2(H5E_DEFAULT, NULL); | |
return -1; | |
} | |
// read data | |
status = H5Dread(dataset_id, H5T_NATIVE_DOUBLE, | |
memspace_id, dataspace_id, | |
H5P_DEFAULT, buffer); | |
if (status < 0) { | |
H5Eprint2(H5E_DEFAULT, NULL); | |
return -1; | |
} | |
} | |
// close the resources | |
H5Sclose(memspace_id); | |
H5Sclose(dataspace_id); | |
H5Dclose(dataset_id); | |
H5Fclose(file_id); | |
return 0; | |
} | |
/* | |
* 从H5文件中读取矩阵数据,以column order存储数据 | |
* | |
* filename: H5文件名 | |
* dataset_name: 数据集名 | |
* buffer: 数据内存存储处,大小为 num_rows * num_cols | |
* row_idx: 要读取的行的下标数组,大小为 num_rows | |
* num_rows: 待读取的矩阵的行数 | |
* num_cols: 待读取的矩阵的列数 | |
* offset_y: 读取的数据在H5文件中的矩阵的y方向上的偏移 (第几列) | |
* | |
*/ | |
int read_direct_discrete_f(char *file_name, char *dataset_name, | |
double *buffer, int *row_idx, | |
int num_rows, int num_cols, int offset_y) | |
{ | |
int i; | |
hid_t file_id, dataset_id; | |
hid_t dataspace_id, memspace_id; | |
herr_t status; | |
hsize_t dims_mem[1]; | |
hsize_t offset[2]; | |
hsize_t count[2]; | |
hsize_t stride[2]; | |
hsize_t block[2]; | |
hsize_t offset_mem[1]; | |
hsize_t count_mem[1]; | |
hsize_t stride_mem[1]; | |
hsize_t block_mem[1]; | |
// open the hdf5 file and the dataset | |
file_id = H5Fopen(file_name, H5F_ACC_RDONLY, H5P_DEFAULT); | |
dataset_id = H5Dopen2(file_id, dataset_name, H5P_DEFAULT); | |
// get file dataspace | |
dataspace_id = H5Dget_space(dataset_id); | |
// create memory space | |
dims_mem[0] = num_rows; // avoid numeric overflow | |
dims_mem[0] = dims_mem[0] * num_cols; | |
memspace_id = H5Screate_simple(1, dims_mem, NULL); | |
// specify size and shape of subset to read | |
offset[0] = 0; | |
offset[1] = offset_y; | |
count[0] = 1; | |
count[1] = num_cols; | |
stride[0] = 1; | |
stride[1] = 1; | |
block[0] = 1; | |
block[1] = 1; | |
offset_mem[0] = 0; | |
count_mem[0] = num_cols; | |
stride_mem[0] = num_rows; | |
block_mem[0] = 1; | |
for (i=0; i<num_rows; i++) { | |
offset[0] = row_idx[i]; | |
offset_mem[0] = i; | |
// select subset from dataspace | |
status = H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, | |
offset, stride, count, block); | |
if (status < 0) { | |
H5Eprint2(H5E_DEFAULT, NULL); | |
return -1; | |
} | |
status = H5Sselect_hyperslab(memspace_id, H5S_SELECT_SET, | |
offset_mem, stride_mem, | |
count_mem, block_mem); | |
if (status < 0) { | |
H5Eprint2(H5E_DEFAULT, NULL); | |
return -1; | |
} | |
// read data | |
status = H5Dread(dataset_id, H5T_NATIVE_DOUBLE, | |
memspace_id, dataspace_id, | |
H5P_DEFAULT, buffer); | |
if (status < 0) { | |
H5Eprint2(H5E_DEFAULT, NULL); | |
return -1; | |
} | |
} | |
// close the resources | |
H5Sclose(memspace_id); | |
H5Sclose(dataspace_id); | |
H5Dclose(dataset_id); | |
H5Fclose(file_id); | |
return 0; | |
} |
#ifndef _H5_READ_ | |
#define _H5_READ_ | |
int read_direct_f(char *file_name, char *dataset_name, double *buffer, | |
int num_rows, int num_cols, int offset_x, int offset_y); | |
int read_direct_discrete_f(char *file_name, char *dataset_name, | |
double *buffer, int *row_idx, | |
int num_rows, int num_cols, int offset_y); | |
#endif |
cdef extern from "c_h5read.h": | |
int read_direct_f(char *file_name, char *dataset_name, double *buffer, | |
int num_rows, int num_cols, int offset_x, int offset_y) | |
int read_direct_discrete_f(char *file_name, char *dataset_name, | |
double *buffer, int *row_idx, | |
int num_rows, int num_cols, int offset_y) | |
def read_direct_fortran(str file_name, str dataset_name, | |
double[::1, :] data, int offset_x, int offset_y): | |
cdef int num_rows | |
cdef int num_cols | |
cdef bytes file_name_bytes = bytes(file_name, encoding='utf-8') | |
cdef bytes dataset_name_bytes = bytes(dataset_name, encoding='utf-8') | |
cdef int ret; | |
num_rows = data.shape[0] | |
num_cols = data.shape[1] | |
ret = read_direct_f(file_name_bytes, dataset_name_bytes, &data[0, 0], | |
num_rows, num_cols, offset_x, offset_y) | |
def read_direct_discrete_fortran(str file_name, str dataset_name, | |
double[::1, :] data, int[:] row_idx, int offset_y): | |
cdef int num_rows | |
cdef int num_cols | |
cdef bytes file_name_bytes = bytes(file_name, encoding='utf-8') | |
cdef bytes dataset_name_bytes = bytes(dataset_name, encoding='utf-8') | |
cdef int ret | |
num_rows = data.shape[0] | |
num_cols = data.shape[1] | |
if num_rows != row_idx.shape[0]: | |
raise RuntimeError('Dimension of row_idx not equal num_rows') | |
ret = read_direct_discrete_f(file_name_bytes, dataset_name_bytes, | |
&data[0, 0], &row_idx[0], | |
num_rows, num_cols, offset_y) |
# Test h5read | |
import unittest | |
import os | |
import numpy as np | |
import h5py | |
import tempfile | |
import h5read | |
class TestHDF5(unittest.TestCase): | |
def test_read_direct_fortran(self): | |
num_rows, num_cols = 500, 1000 | |
subset_nrows, subset_ncols = 200, 100 | |
offset_x = 20 | |
offset_y = 30 | |
data = np.random.randn(num_rows, num_cols) | |
subset_data = np.empty((subset_nrows, subset_ncols), | |
dtype=np.float64, order='F') | |
# write data to hdf5 file | |
dataset_name = r'/data/test' | |
file_path = os.path.join(tempfile.gettempdir(), 'test.h5') | |
if os.path.isfile(file_path): | |
os.remove(file_path) | |
with h5py.File(file_path) as f: | |
f.create_dataset(dataset_name, data=data, dtype=np.float32) | |
# read data from hdf5 file in column order | |
h5read.read_direct_fortran(file_path, dataset_name, | |
subset_data, offset_x, offset_y) | |
orig_subset = data[offset_x:offset_x+subset_nrows, | |
offset_y:offset_y+subset_ncols] | |
np.testing.assert_allclose(subset_data, orig_subset) | |
def test_read_direct_discrete_fortran(self): | |
num_rows, num_cols = 500, 1000 | |
subset_nrows, subset_ncols = 200, 100 | |
offset_y = 30 | |
row_idx = np.random.choice(num_rows, size=subset_nrows, | |
replace=True) | |
row_idx = row_idx.astype(np.intc) | |
data = np.random.randn(num_rows, num_cols) | |
subset_data = np.empty((subset_nrows, subset_ncols), | |
dtype=np.float64, order='F') | |
# write data to hdf5 file | |
dataset_name = r'/data/test' | |
file_path = os.path.join(tempfile.gettempdir(), 'test.h5') | |
if os.path.isfile(file_path): | |
os.remove(file_path) | |
with h5py.File(file_path) as f: | |
f.create_dataset(dataset_name, data=data, dtype=np.float32) | |
if os.path.isfile(file_path): | |
os.remove(file_path) | |
with h5py.File(file_path) as f: | |
f.create_dataset(dataset_name, data=data, dtype=np.float32) | |
# read data from hdf5 file in column order | |
h5read.read_direct_discrete_fortran(file_path, dataset_name, | |
subset_data, row_idx, offset_y) | |
if __name__ == '__main__': | |
unittest.main() |
all: | |
python3 setup.py build_ext -i | |
clean: | |
rm -rf build/ | |
rm -f h5read.c | |
rm -f *.so |
from distutils.core import setup, Extension | |
from Cython.Build import cythonize | |
import os | |
# the path where HDF5 library installed | |
HDF_INSTALL = '/usr/local/Cellar/hdf5/1.10.2/' | |
ext = Extension("h5read", | |
include_dirs=[os.path.join(HDF_INSTALL, 'include')], | |
libraries=['hdf5'], | |
library_dirs=[os.path.join(HDF_INSTALL, 'lib')], | |
sources=['h5read.pyx', 'c_h5read.c']) | |
setup(name="h5utils", ext_modules=cythonize([ext])) |