@manodeep
Created June 27, 2019 22:58
OpenMP parallel reads of 1-D arrays from HDF5 files
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>    /* int32_t, int64_t */
#include <inttypes.h>  /* PRId64 */
#include <omp.h>
#include "hdf5.h"
#include "sglib.h"
/*
This is a small code that demonstrates how to read an HDF5 array (i.e., a 1-D array)
in an OpenMP parallel manner.
Note that parallel reads are only directly supported by the threadsafe build of the
hdf5 library. The code automatically detects whether the library is threadsafe; if not,
it serialises all the reads via an OpenMP lock.
*/
/*Global variable, OpenMP lock*/
omp_lock_t lock;
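/*
 * The lock is only initialised (in main) when H5is_library_threadsafe reports a
 * non-threadsafe build; read_partial_array() then brackets the HDF5 read with
 * omp_set_lock()/omp_unset_lock() so that only one thread calls into the
 * library at a time.
 */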
#define CHECK_STATUS_AND_RETURN_ON_FAIL(status, return_value, ...) \
    do {                                                           \
        if(status < 0) {                                           \
            fprintf(stderr, __VA_ARGS__);                          \
            return return_value;                                   \
        }                                                          \
    } while (0)
#define CREATE_AND_WRITE_DATASET(file_id, field_name, dims, h5_dtype, data) { \
    hid_t macro_dataspace_id = H5Screate_simple(1, dims, NULL); \
    CHECK_STATUS_AND_RETURN_ON_FAIL(macro_dataspace_id, (int32_t) macro_dataspace_id, \
                                    "Could not create a dataspace for field " #field_name ".\n" \
                                    "The dimension of the dataspace was %d\n", (int32_t) dims[0]); \
    hid_t macro_dataset_id = H5Dcreate2(file_id, field_name, h5_dtype, macro_dataspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \
    CHECK_STATUS_AND_RETURN_ON_FAIL(macro_dataset_id, (int32_t) macro_dataset_id, \
                                    "Could not create a dataset for field " #field_name ".\n" \
                                    "The dimension of the dataset was %d\nThe file ID was %d.\n", \
                                    (int32_t) dims[0], (int32_t) file_id); \
    status = H5Dwrite(macro_dataset_id, h5_dtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, data); \
    CHECK_STATUS_AND_RETURN_ON_FAIL(status, (int32_t) status, \
                                    "Failed to write a dataset for field " #field_name ".\n" \
                                    "The dimension of the dataset was %d\nThe file ID was %d.\n" \
                                    "The dataset ID was %d.", (int32_t) dims[0], (int32_t) file_id, \
                                    (int32_t) macro_dataset_id); \
    status = H5Dclose(macro_dataset_id); \
    CHECK_STATUS_AND_RETURN_ON_FAIL(status, (int32_t) status, \
                                    "Failed to close the dataset for field " #field_name ".\n" \
                                    "The dimension of the dataset was %d\nThe file ID was %d.\n" \
                                    "The dataset ID was %d.", (int32_t) dims[0], (int32_t) file_id, \
                                    (int32_t) macro_dataset_id); \
    status = H5Sclose(macro_dataspace_id); \
    CHECK_STATUS_AND_RETURN_ON_FAIL(status, (int32_t) status, \
                                    "Failed to close the dataspace for field " #field_name ".\n" \
                                    "The dimension of the dataset was %d\nThe file ID was %d.\n" \
                                    "The dataspace ID was %d.", (int32_t) dims[0], (int32_t) file_id, \
                                    (int32_t) macro_dataspace_id); \
}
#define READ_PARTIAL_1D_ARRAY(fd, dataset_name, offset, stride, count, block, h5_dtype, buffer) \
{ \
    hid_t dataset_id = H5Dopen(fd, dataset_name, H5P_DEFAULT); \
    if (dataset_id < 0) { \
        fprintf(stderr, "Error encountered when trying to open up dataset %s\n", dataset_name); \
        H5Eprint(H5E_DEFAULT, stderr); \
        return EXIT_FAILURE; \
    } \
    hid_t filespace = H5Dget_space(dataset_id); \
    if(filespace < 0) { \
        fprintf(stderr, "Error encountered when trying to get space for dataset %s\n", dataset_name); \
        return EXIT_FAILURE; \
    } \
    herr_t status = H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, stride, count, block); \
    CHECK_STATUS_AND_RETURN_ON_FAIL(status, (int32_t) status, \
                                    "Failed to select hyperslab for " #dataset_name ".\n" \
                                    "The number of requested elements was %d\nThe file ID was %d.\n" \
                                    "The dataset ID was %d.", (int32_t) *count, (int32_t) fd, \
                                    (int32_t) dataset_id); \
    hid_t memspace = H5Screate_simple(1, count, NULL); \
    status = H5Dread(dataset_id, h5_dtype, memspace, filespace, H5P_DEFAULT, buffer); \
    CHECK_STATUS_AND_RETURN_ON_FAIL(status, (int32_t) status, \
                                    "Failed to read array for " #dataset_name ".\n" \
                                    "The number of requested elements was %d\nThe file ID was %d.\n" \
                                    "The dataset ID was %d.", (int32_t) *count, (int32_t) fd, \
                                    (int32_t) dataset_id); \
    H5Dclose(dataset_id); \
    H5Sclose(filespace); \
    H5Sclose(memspace); \
}
#define GENERATE_VALUE_IN_ARRAY(i) ((i)*(i))
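/*
 * Per-thread bookkeeping: each OpenMP thread reads `nelements` consecutive
 * int64_t values, starting at `start_offset` within the on-disk dataset,
 * into its own `partial_data` buffer.
 */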
typedef struct halo_info {
    int32_t rank;
    int32_t size;
    int64_t start_offset;
    int64_t nelements;
    int64_t *partial_data;
} halo_info;
int read_partial_array(struct halo_info *info, hid_t file, const int is_threadsafe)
{
    hsize_t offset[1];
    offset[0] = info->start_offset;
    hsize_t count[1];
    count[0] = info->nelements;
    if(is_threadsafe != 1) {
        /*Serialize HDF5 dataset reading using OpenMP lock*/
        omp_set_lock(&lock);
    }
    READ_PARTIAL_1D_ARRAY(file, "array", &offset[0], NULL, &count[0], NULL, H5T_NATIVE_INT64, info->partial_data);
    if(is_threadsafe != 1) {
        /*Release lock*/
        omp_unset_lock(&lock);
    }
    return EXIT_SUCCESS;
}
int32_t return_random_integer_within_range(const int32_t range_min, const int32_t range_max)
{
    /* Assumes the random number generator has already been seeded appropriately */
    return (rand() % (range_max - range_min + 1)) + range_min;
}
// A reasonable solution to randomly dividing "totN" elements over "size" partitions
int split_array_over_tasks(const int64_t totN, const unsigned int seed, const int size, int32_t *nchunks)
{
    srand(seed);
    if(nchunks == NULL) {
        fprintf(stderr, "Error: `nchunks` can not be NULL. Needs to hold size=%d elements of int32_t each\n", size);
        return EXIT_FAILURE;
    }
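    /*
     * Strategy: draw (size-1) random cut points in [1, totN], sort them, and take
     * successive differences (with 0 prepended and totN appended) so that the
     * resulting chunk sizes are non-negative and sum exactly to totN.
     */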
    for(int i = 0; i < size-1; i++) {
        nchunks[i] = return_random_integer_within_range(1, totN);
    }
    /* sort the nchunks array */
    SGLIB_ARRAY_SINGLE_QUICK_SORT(int32_t, nchunks, size-1, SGLIB_NUMERIC_COMPARATOR);
    nchunks[size-1] = totN;
    int32_t prev_val = 0;
    for(int i=0;i<size;i++) {
        int32_t tmp_chunk = nchunks[i] - prev_val;
        prev_val = nchunks[i];
        nchunks[i] = tmp_chunk;
    }
    int64_t sum = 0;
    for(int i=0;i<size;i++) {
        /* fprintf(stderr,"rank = %d nchunks[%d] = %d sum = %"PRId64"\n", i, i, nchunks[i], sum); */
        sum += nchunks[i];
    }
    if(sum != totN) {
        fprintf(stderr,"Error: The sum of chunks does not equal the requested range\n");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
int main (void)
{
    herr_t status;
    /*Disable dynamic allocation of threads*/
    omp_set_dynamic(0);
    hbool_t is_ts;
    H5is_library_threadsafe(&is_ts);
    fprintf(stderr, "HDF5 library threadsafe = %d\n", (int) is_ts);
    if((int) is_ts != 1) {
        fprintf(stderr,"Warning: The HDF5 library is not thread-safe -- all reads will be serialized\n");
        /*Initialize (global) lock*/
        omp_init_lock(&lock);
    }

    /*
     * Create a new file using the default properties.
     */
    const char partial_filename[] = {"test_partial.h5"};
    hid_t file = H5Fcreate (partial_filename, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
    const int64_t N = 100000000;
    int64_t *new_array = malloc(N*sizeof(*new_array));
    for(int64_t i=0;i<N;i++) new_array[i] = GENERATE_VALUE_IN_ARRAY(i);

    /* Write the full 1-D array as a single dataset called "array" */
    hsize_t dims[] = {N};
    CREATE_AND_WRITE_DATASET(file, "array", dims, H5T_NATIVE_INT64, new_array);
    status = H5Fclose (file);

    //Now open the file for reading
    file = H5Fopen (partial_filename, H5F_ACC_RDONLY, H5P_DEFAULT);
    const int nthreadmax = 32;
    const unsigned int seed = 237046530;
    status = EXIT_FAILURE;
    for(int nthreads=1;nthreads<nthreadmax;nthreads++) {
        double start_time = omp_get_wtime();
        omp_set_num_threads(nthreads);
        int32_t *nchunks = calloc(nthreads, sizeof(*nchunks));
        fprintf(stderr,"Running with %d threads................", nthreads);
        status = split_array_over_tasks(N, seed, nthreads, nchunks);
        if(status != EXIT_SUCCESS) {
            goto fail;
        }

#pragma omp parallel
        {
            const int tid = omp_get_thread_num();
            if(nthreads != omp_get_num_threads()) {
                fprintf(stderr,"Error: The total number of threads as seen by OpenMP is not the same as intended...exiting on rank = %d\n", tid);
                exit(EXIT_FAILURE);
            }
            struct halo_info info;
            info.rank = tid;
            info.size = nthreads;
            info.nelements = nchunks[tid];
            int64_t sum = 0;
            for(int i=0;i<nthreads;i++) {
                if(i == tid) {
                    info.start_offset = sum;
                }
                sum += nchunks[i];
            }
            info.partial_data = malloc(info.nelements*sizeof(info.partial_data[0]));
            if(info.partial_data == NULL) {
                fprintf(stderr,"Error: on tid = %d could not allocate memory for partial_data with nelements = %"PRId64"\n", tid, info.nelements);
                exit(EXIT_FAILURE);
            }
            status = read_partial_array(&info, file, (int) is_ts);//last argument specifies whether the library is threadsafe.
            if(status != EXIT_SUCCESS) {
                exit(status);
            }

            /* Now test that the data was read in correctly */
            for(int i=0;i<nthreads;i++) {
                if(tid == i) {
                    /* fprintf(stderr,"On tid = %d start = %"PRId64" nelements = %"PRId64"\n", tid, info.start_offset, info.nelements); */
                    for(int64_t j=0;j<info.nelements;j++) {
                        const int64_t idx = info.start_offset + j;
                        if(idx >= N) {
                            fprintf(stderr,"Error: on tid = %d trying to access idx = %"PRId64" that is invalid since the max. is N = %"PRId64"\n",
                                    tid, idx, N);
                            exit(EXIT_FAILURE);
                        }
                        /* fprintf(stderr,"On tid = %d, accessing idx = %"PRId64" and comparing to partial_data[%"PRId64"]\n", tid, idx, j); */
                        if (info.partial_data[j] != new_array[idx]) {
                            fprintf(stderr,"Error: on tid = %d partial_data[%"PRId64"] = %"PRId64" does NOT equal new_array[%"PRId64"] = %"PRId64"\n",
                                    tid, j, info.partial_data[j], idx, new_array[idx]);
                            exit(EXIT_FAILURE);
                        }
                    }
                }
            }
            free(info.partial_data);
        }
        free(nchunks);
        fprintf(stderr,"...done. Time taken = %0.3g seconds\n", omp_get_wtime() - start_time);
    }

    if((int) is_ts != 1) {
        //Finished lock mechanism, destroy it
        omp_destroy_lock(&lock);
    }
    free(new_array);
    H5Fclose (file);
    return EXIT_SUCCESS;

fail:
    free(new_array);
    H5Fclose(file);
    return status;
}
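The stride and block arguments of READ_PARTIAL_1D_ARRAY are forwarded directly to H5Sselect_hyperslab, so the same macro also handles non-contiguous selections. Here is a minimal sketch (the offsets, counts and function name are hypothetical, purely for illustration) that would read every second element of the "array" dataset created above; note that the macro contains return statements, so it has to sit inside a function returning an int status:

/* Hedged sketch: read 1000 single-element blocks, every 2nd element,
 * starting at index 10 (i.e. indices 10, 12, 14, ...). Assumes `file` is
 * an open hid_t for test_partial.h5, as in main(). Error paths inside the
 * macro return early, so this is illustrative rather than leak-free. */
int read_every_other_element(hid_t file)
{
    hsize_t offset[1] = {10};
    hsize_t stride[1] = {2};
    hsize_t count[1]  = {1000};
    hsize_t block[1]  = {1};
    int64_t *buf = malloc(count[0] * sizeof(*buf));
    if(buf == NULL) return EXIT_FAILURE;

    READ_PARTIAL_1D_ARRAY(file, "array", offset, stride, count, block,
                          H5T_NATIVE_INT64, buf);

    /* buf[j] now holds new_array[10 + 2*j] */
    free(buf);
    return EXIT_SUCCESS;
}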
You will need "sglib.h" (http://sglib.sourceforge.net/)

On my OSX laptop, I compile with gcc -Wall -Wextra -I/opt/local/include -std=c99 test_hdf5_partial_access.c -o test_hdf5_partial_access -L/opt/local/lib -lhdf5 -fopenmp -g. Adapt as necessary for your hdf5 library paths.
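If you are not sure how your hdf5 was built, the H5is_library_threadsafe call used in the code above can also be wrapped in a tiny standalone check. A minimal sketch (the file name check_hdf5_threadsafe.c is just for illustration; compile it the same way as the main example, minus -fopenmp and sglib):

/* check_hdf5_threadsafe.c -- minimal sketch that reports whether the linked
 * hdf5 library was built threadsafe. */
#include <stdio.h>
#include "hdf5.h"

int main(void)
{
    hbool_t is_ts = 0;
    if(H5is_library_threadsafe(&is_ts) < 0) {
        fprintf(stderr, "H5is_library_threadsafe() failed\n");
        return 1;
    }
    printf("HDF5 library threadsafe = %d\n", (int) is_ts);
    return 0;
}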

The default output from my laptop (with 2 cores) is:

$  ./test_hdf5_partial_access
HDF5 library threadsafe = 0
Warning: The HDF5 library is not thread-safe -- all reads will be serialized
Running with 1 threads...................done. Time taken = 1.41 seconds
Running with 2 threads...................done. Time taken = 0.579 seconds
Running with 3 threads...................done. Time taken = 0.572 seconds
Running with 4 threads...................done. Time taken = 0.532 seconds
Running with 5 threads...................done. Time taken = 0.597 seconds
Running with 6 threads...................done. Time taken = 0.521 seconds
Running with 7 threads...................done. Time taken = 0.455 seconds
Running with 8 threads...................done. Time taken = 0.404 seconds
Running with 9 threads...................done. Time taken = 0.419 seconds
Running with 10 threads...................done. Time taken = 0.393 seconds
Running with 11 threads...................done. Time taken = 0.419 seconds
Running with 12 threads...................done. Time taken = 0.395 seconds
Running with 13 threads...................done. Time taken = 0.378 seconds
Running with 14 threads...................done. Time taken = 0.378 seconds
Running with 15 threads...................done. Time taken = 0.378 seconds
Running with 16 threads...................done. Time taken = 0.372 seconds
Running with 17 threads...................done. Time taken = 0.396 seconds
Running with 18 threads...................done. Time taken = 0.409 seconds
Running with 19 threads...................done. Time taken = 0.376 seconds
Running with 20 threads...................done. Time taken = 0.389 seconds
Running with 21 threads...................done. Time taken = 0.39 seconds
Running with 22 threads...................done. Time taken = 0.394 seconds
Running with 23 threads...................done. Time taken = 0.422 seconds
Running with 24 threads...................done. Time taken = 0.377 seconds
Running with 25 threads...................done. Time taken = 0.376 seconds
Running with 26 threads...................done. Time taken = 0.396 seconds
Running with 27 threads...................done. Time taken = 0.395 seconds
Running with 28 threads...................done. Time taken = 0.433 seconds
Running with 29 threads...................done. Time taken = 0.385 seconds
Running with 30 threads...................done. Time taken = 0.404 seconds
Running with 31 threads...................done. Time taken = 0.418 seconds

Default output on our local supercomputer, with a threadsafe hdf5 build:

$  ./test_hdf5_partial_access 
HDF5 library threadsafe = 1
Running with 1 threads...................done. Time taken = 0.868 seconds
Running with 2 threads...................done. Time taken = 0.878 seconds
Running with 3 threads...................done. Time taken = 0.873 seconds
Running with 4 threads...................done. Time taken = 1.24 seconds
Running with 5 threads...................done. Time taken = 0.947 seconds
Running with 6 threads...................done. Time taken = 0.751 seconds
Running with 7 threads...................done. Time taken = 0.635 seconds
Running with 8 threads...................done. Time taken = 0.658 seconds
Running with 9 threads...................done. Time taken = 0.62 seconds
Running with 10 threads...................done. Time taken = 0.587 seconds
Running with 11 threads...................done. Time taken = 0.702 seconds
Running with 12 threads...................done. Time taken = 0.677 seconds
Running with 13 threads...................done. Time taken = 0.607 seconds
Running with 14 threads...................done. Time taken = 0.632 seconds
Running with 15 threads...................done. Time taken = 0.621 seconds
Running with 16 threads...................done. Time taken = 0.585 seconds
Running with 17 threads...................done. Time taken = 0.577 seconds
Running with 18 threads...................done. Time taken = 0.552 seconds
Running with 19 threads...................done. Time taken = 0.584 seconds
Running with 20 threads...................done. Time taken = 0.571 seconds
Running with 21 threads...................done. Time taken = 0.596 seconds
Running with 22 threads...................done. Time taken = 0.545 seconds
Running with 23 threads...................done. Time taken = 0.549 seconds
Running with 24 threads...................done. Time taken = 0.653 seconds
Running with 25 threads...................done. Time taken = 0.605 seconds
Running with 26 threads...................done. Time taken = 0.597 seconds
Running with 27 threads...................done. Time taken = 0.631 seconds
Running with 28 threads...................done. Time taken = 0.577 seconds
Running with 29 threads...................done. Time taken = 0.542 seconds
Running with 30 threads...................done. Time taken = 0.583 seconds
Running with 31 threads...................done. Time taken = 0.532 seconds

On the same supercomputer, but without a threadsafe hdf5 build:

$ ./test_hdf5_partial_access 
HDF5 library threadsafe = 0
Warning: The HDF5 library is not thread-safe -- all reads will be serialized
Running with 1 threads...................done. Time taken = 0.888 seconds
Running with 2 threads...................done. Time taken = 0.845 seconds
Running with 3 threads...................done. Time taken = 0.907 seconds
Running with 4 threads...................done. Time taken = 0.903 seconds
Running with 5 threads...................done. Time taken = 0.713 seconds
Running with 6 threads...................done. Time taken = 0.813 seconds
Running with 7 threads...................done. Time taken = 0.736 seconds
Running with 8 threads...................done. Time taken = 1.11 seconds
Running with 9 threads...................done. Time taken = 0.688 seconds
Running with 10 threads...................done. Time taken = 0.724 seconds
Running with 11 threads...................done. Time taken = 0.728 seconds
Running with 12 threads...................done. Time taken = 0.811 seconds
Running with 13 threads...................done. Time taken = 0.649 seconds
Running with 14 threads...................done. Time taken = 0.75 seconds
Running with 15 threads...................done. Time taken = 0.764 seconds
Running with 16 threads...................done. Time taken = 0.734 seconds
Running with 17 threads...................done. Time taken = 0.673 seconds
Running with 18 threads...................done. Time taken = 0.653 seconds
Running with 19 threads...................done. Time taken = 0.719 seconds
Running with 20 threads...................done. Time taken = 0.622 seconds
Running with 21 threads...................done. Time taken = 0.789 seconds
Running with 22 threads...................done. Time taken = 0.681 seconds
Running with 23 threads...................done. Time taken = 0.62 seconds
Running with 24 threads...................done. Time taken = 0.629 seconds
Running with 25 threads...................done. Time taken = 0.683 seconds
Running with 26 threads...................done. Time taken = 0.716 seconds
Running with 27 threads...................done. Time taken = 0.671 seconds
Running with 28 threads...................done. Time taken = 0.605 seconds
Running with 29 threads...................done. Time taken = 0.653 seconds
Running with 30 threads...................done. Time taken = 0.622 seconds
Running with 31 threads...................done. Time taken = 0.561 seconds

With a non-threadsafe, MPI-enabled build of the hdf5 library:

$ ./test_hdf5_partial_access 
HDF5 library threadsafe = 0
Warning: The HDF5 library is not thread-safe -- all reads will be serialized
Running with 1 threads...................done. Time taken = 0.972 seconds
Running with 2 threads...................done. Time taken = 0.66 seconds
Running with 3 threads...................done. Time taken = 0.695 seconds
Running with 4 threads...................done. Time taken = 0.743 seconds
Running with 5 threads...................done. Time taken = 0.676 seconds
Running with 6 threads...................done. Time taken = 0.63 seconds
Running with 7 threads...................done. Time taken = 0.693 seconds
Running with 8 threads...................done. Time taken = 0.594 seconds
Running with 9 threads...................done. Time taken = 0.557 seconds
Running with 10 threads...................done. Time taken = 0.577 seconds
Running with 11 threads...................done. Time taken = 0.599 seconds
Running with 12 threads...................done. Time taken = 0.629 seconds
Running with 13 threads...................done. Time taken = 0.609 seconds
Running with 14 threads...................done. Time taken = 0.564 seconds
Running with 15 threads...................done. Time taken = 0.703 seconds
Running with 16 threads...................done. Time taken = 0.628 seconds
Running with 17 threads...................done. Time taken = 0.655 seconds
Running with 18 threads...................done. Time taken = 0.603 seconds
Running with 19 threads...................done. Time taken = 0.708 seconds
Running with 20 threads...................done. Time taken = 0.614 seconds
Running with 21 threads...................done. Time taken = 0.689 seconds
Running with 22 threads...................done. Time taken = 0.591 seconds
Running with 23 threads...................done. Time taken = 0.528 seconds
Running with 24 threads...................done. Time taken = 0.521 seconds
Running with 25 threads...................done. Time taken = 0.601 seconds
Running with 26 threads...................done. Time taken = 0.65 seconds
Running with 27 threads...................done. Time taken = 0.733 seconds
Running with 28 threads...................done. Time taken = 0.588 seconds
Running with 29 threads...................done. Time taken = 0.504 seconds
Running with 30 threads...................done. Time taken = 0.576 seconds
Running with 31 threads...................done. Time taken = 0.54 seconds
