Skip to content

Instantly share code, notes, and snippets.

@luowanqian
Last active May 3, 2019 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save luowanqian/b924987be33d51f50a30e741fe249526 to your computer and use it in GitHub Desktop.
文章:以Column-major Order读取HDF5文件数据,文章链接:http://www.scutmath.com/hdf5_column_major.html

编译:设定setup.py中HDF_INSTALL,然后使用make命令进行编译。

使用:python3 hdf5_test.py

#include "hdf5.h"
#include "c_h5read.h"
/*
* 从H5文件中读取矩阵数据,以column order存储数据
*
* filename: H5文件名
* dataset_name: 数据集名
* buffer: 数据内存存储处,大小为 num_rows * num_cols
* num_rows: 待读取的矩阵的行数
* num_cols: 待读取的矩阵的列数
* offset_x: 读取的数据在H5文件中的矩阵的x方向上的偏移 (第几行)
* offset_y: 读取的数据在H5文件中的矩阵的y方向上的偏移 (第几列)
*
*/
int read_direct_f(char *file_name, char *dataset_name, double *buffer,
int num_rows, int num_cols, int offset_x, int offset_y)
{
int i;
hid_t file_id, dataset_id;
hid_t dataspace_id, memspace_id;
herr_t status;
hsize_t dims_mem[1];
hsize_t offset[2];
hsize_t count[2];
hsize_t stride[2];
hsize_t block[2];
hsize_t offset_mem[1];
hsize_t count_mem[1];
hsize_t stride_mem[1];
hsize_t block_mem[1];
// open the hdf5 file and the dataset
file_id = H5Fopen(file_name, H5F_ACC_RDONLY, H5P_DEFAULT);
dataset_id = H5Dopen2(file_id, dataset_name, H5P_DEFAULT);
// get file dataspace
dataspace_id = H5Dget_space(dataset_id);
// create memory space
dims_mem[0] = num_rows; // avoid numeric overflow
dims_mem[0] = dims_mem[0] * num_cols;
memspace_id = H5Screate_simple(1, dims_mem, NULL);
// specify size and shape of subset to read
offset[0] = offset_x;
offset[1] = offset_y;
count[0] = 1;
count[1] = num_cols;
stride[0] = 1;
stride[1] = 1;
block[0] = 1;
block[1] = 1;
offset_mem[0] = 0;
count_mem[0] = num_cols;
stride_mem[0] = num_rows;
block_mem[0] = 1;
for (i=0; i<num_rows; i++) {
offset[0] = offset_x + i;
offset_mem[0] = i;
// select subset from dataspace
status = H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET,
offset, stride, count, block);
if (status < 0) {
H5Eprint2(H5E_DEFAULT, NULL);
return -1;
}
status = H5Sselect_hyperslab(memspace_id, H5S_SELECT_SET,
offset_mem, stride_mem,
count_mem, block_mem);
if (status < 0) {
H5Eprint2(H5E_DEFAULT, NULL);
return -1;
}
// read data
status = H5Dread(dataset_id, H5T_NATIVE_DOUBLE,
memspace_id, dataspace_id,
H5P_DEFAULT, buffer);
if (status < 0) {
H5Eprint2(H5E_DEFAULT, NULL);
return -1;
}
}
// close the resources
H5Sclose(memspace_id);
H5Sclose(dataspace_id);
H5Dclose(dataset_id);
H5Fclose(file_id);
return 0;
}
/*
* 从H5文件中读取矩阵数据,以column order存储数据
*
* filename: H5文件名
* dataset_name: 数据集名
* buffer: 数据内存存储处,大小为 num_rows * num_cols
* row_idx: 要读取的行的下标数组,大小为 num_rows
* num_rows: 待读取的矩阵的行数
* num_cols: 待读取的矩阵的列数
* offset_y: 读取的数据在H5文件中的矩阵的y方向上的偏移 (第几列)
*
*/
int read_direct_discrete_f(char *file_name, char *dataset_name,
double *buffer, int *row_idx,
int num_rows, int num_cols, int offset_y)
{
int i;
hid_t file_id, dataset_id;
hid_t dataspace_id, memspace_id;
herr_t status;
hsize_t dims_mem[1];
hsize_t offset[2];
hsize_t count[2];
hsize_t stride[2];
hsize_t block[2];
hsize_t offset_mem[1];
hsize_t count_mem[1];
hsize_t stride_mem[1];
hsize_t block_mem[1];
// open the hdf5 file and the dataset
file_id = H5Fopen(file_name, H5F_ACC_RDONLY, H5P_DEFAULT);
dataset_id = H5Dopen2(file_id, dataset_name, H5P_DEFAULT);
// get file dataspace
dataspace_id = H5Dget_space(dataset_id);
// create memory space
dims_mem[0] = num_rows; // avoid numeric overflow
dims_mem[0] = dims_mem[0] * num_cols;
memspace_id = H5Screate_simple(1, dims_mem, NULL);
// specify size and shape of subset to read
offset[0] = 0;
offset[1] = offset_y;
count[0] = 1;
count[1] = num_cols;
stride[0] = 1;
stride[1] = 1;
block[0] = 1;
block[1] = 1;
offset_mem[0] = 0;
count_mem[0] = num_cols;
stride_mem[0] = num_rows;
block_mem[0] = 1;
for (i=0; i<num_rows; i++) {
offset[0] = row_idx[i];
offset_mem[0] = i;
// select subset from dataspace
status = H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET,
offset, stride, count, block);
if (status < 0) {
H5Eprint2(H5E_DEFAULT, NULL);
return -1;
}
status = H5Sselect_hyperslab(memspace_id, H5S_SELECT_SET,
offset_mem, stride_mem,
count_mem, block_mem);
if (status < 0) {
H5Eprint2(H5E_DEFAULT, NULL);
return -1;
}
// read data
status = H5Dread(dataset_id, H5T_NATIVE_DOUBLE,
memspace_id, dataspace_id,
H5P_DEFAULT, buffer);
if (status < 0) {
H5Eprint2(H5E_DEFAULT, NULL);
return -1;
}
}
// close the resources
H5Sclose(memspace_id);
H5Sclose(dataspace_id);
H5Dclose(dataset_id);
H5Fclose(file_id);
return 0;
}
#ifndef _H5_READ_
#define _H5_READ_
int read_direct_f(char *file_name, char *dataset_name, double *buffer,
int num_rows, int num_cols, int offset_x, int offset_y);
int read_direct_discrete_f(char *file_name, char *dataset_name,
double *buffer, int *row_idx,
int num_rows, int num_cols, int offset_y);
#endif
cdef extern from "c_h5read.h":
int read_direct_f(char *file_name, char *dataset_name, double *buffer,
int num_rows, int num_cols, int offset_x, int offset_y)
int read_direct_discrete_f(char *file_name, char *dataset_name,
double *buffer, int *row_idx,
int num_rows, int num_cols, int offset_y)
def read_direct_fortran(str file_name, str dataset_name,
double[::1, :] data, int offset_x, int offset_y):
cdef int num_rows
cdef int num_cols
cdef bytes file_name_bytes = bytes(file_name, encoding='utf-8')
cdef bytes dataset_name_bytes = bytes(dataset_name, encoding='utf-8')
cdef int ret;
num_rows = data.shape[0]
num_cols = data.shape[1]
ret = read_direct_f(file_name_bytes, dataset_name_bytes, &data[0, 0],
num_rows, num_cols, offset_x, offset_y)
def read_direct_discrete_fortran(str file_name, str dataset_name,
double[::1, :] data, int[:] row_idx, int offset_y):
cdef int num_rows
cdef int num_cols
cdef bytes file_name_bytes = bytes(file_name, encoding='utf-8')
cdef bytes dataset_name_bytes = bytes(dataset_name, encoding='utf-8')
cdef int ret
num_rows = data.shape[0]
num_cols = data.shape[1]
if num_rows != row_idx.shape[0]:
raise RuntimeError('Dimension of row_idx not equal num_rows')
ret = read_direct_discrete_f(file_name_bytes, dataset_name_bytes,
&data[0, 0], &row_idx[0],
num_rows, num_cols, offset_y)
# Test h5read
import unittest
import os
import numpy as np
import h5py
import tempfile
import h5read
class TestHDF5(unittest.TestCase):
def test_read_direct_fortran(self):
num_rows, num_cols = 500, 1000
subset_nrows, subset_ncols = 200, 100
offset_x = 20
offset_y = 30
data = np.random.randn(num_rows, num_cols)
subset_data = np.empty((subset_nrows, subset_ncols),
dtype=np.float64, order='F')
# write data to hdf5 file
dataset_name = r'/data/test'
file_path = os.path.join(tempfile.gettempdir(), 'test.h5')
if os.path.isfile(file_path):
os.remove(file_path)
with h5py.File(file_path) as f:
f.create_dataset(dataset_name, data=data, dtype=np.float32)
# read data from hdf5 file in column order
h5read.read_direct_fortran(file_path, dataset_name,
subset_data, offset_x, offset_y)
orig_subset = data[offset_x:offset_x+subset_nrows,
offset_y:offset_y+subset_ncols]
np.testing.assert_allclose(subset_data, orig_subset)
def test_read_direct_discrete_fortran(self):
num_rows, num_cols = 500, 1000
subset_nrows, subset_ncols = 200, 100
offset_y = 30
row_idx = np.random.choice(num_rows, size=subset_nrows,
replace=True)
row_idx = row_idx.astype(np.intc)
data = np.random.randn(num_rows, num_cols)
subset_data = np.empty((subset_nrows, subset_ncols),
dtype=np.float64, order='F')
# write data to hdf5 file
dataset_name = r'/data/test'
file_path = os.path.join(tempfile.gettempdir(), 'test.h5')
if os.path.isfile(file_path):
os.remove(file_path)
with h5py.File(file_path) as f:
f.create_dataset(dataset_name, data=data, dtype=np.float32)
if os.path.isfile(file_path):
os.remove(file_path)
with h5py.File(file_path) as f:
f.create_dataset(dataset_name, data=data, dtype=np.float32)
# read data from hdf5 file in column order
h5read.read_direct_discrete_fortran(file_path, dataset_name,
subset_data, row_idx, offset_y)
if __name__ == '__main__':
unittest.main()
all:
python3 setup.py build_ext -i
clean:
rm -rf build/
rm -f h5read.c
rm -f *.so
from distutils.core import setup, Extension
from Cython.Build import cythonize
import os
# the path where HDF5 library installed
HDF_INSTALL = '/usr/local/Cellar/hdf5/1.10.2/'
ext = Extension("h5read",
include_dirs=[os.path.join(HDF_INSTALL, 'include')],
libraries=['hdf5'],
library_dirs=[os.path.join(HDF_INSTALL, 'lib')],
sources=['h5read.pyx', 'c_h5read.c'])
setup(name="h5utils", ext_modules=cythonize([ext]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment