Skip to content

Instantly share code, notes, and snippets.

@wrathematics
Last active November 30, 2021 12:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wrathematics/4821828b52fcbe2a49d0ea8f5d9378d5 to your computer and use it in GitHub Desktop.
Save wrathematics/4821828b52fcbe2a49d0ea8f5d9378d5 to your computer and use it in GitHub Desktop.
Use the hdfmat package to split an HDF5 matrix generated by armadillo into row chunks
// h5c++ generate.cpp -o generate
#include <iostream>
#include <stdexcept>
#include <string>
#define ARMA_USE_HDF5
#include <armadillo>
/**
 * Build an m-by-n single-precision matrix whose column j (0-based) holds the
 * m evenly spaced values from j*m + 1 to j*m + m, print it to stdout, and
 * save it to an HDF5 file.
 *
 * @param m     Number of rows.
 * @param n     Number of columns.
 * @param fname Path of the HDF5 file to write; the matrix is stored under the
 *              dataset name "mydata".
 * @throws std::runtime_error if Armadillo reports that the save failed.
 */
static inline void gen(const int m, const int n, const std::string &fname)
{
  arma::fmat x(m, n);
  for (int j=0; j<n; j++)
  {
    // Fill one column at a time so the values run consecutively down each
    // column and continue in the next one.
    x.col(j) = arma::linspace<arma::fvec>(j*m + 1, j*m + m, m);
  }
  
  std::cout << x << std::endl;
  
  // save() signals failure through its boolean result; ignoring it would let
  // the program exit successfully without having produced the file.
  if (!x.save(arma::hdf5_name(fname, "mydata")))
    throw std::runtime_error("gen: could not write dataset \"mydata\" to " + fname);
}
// Generator entry point: asks gen() for a 10 x 5 test matrix stored at
// /tmp/test/test_mat.h5.
int main()
{
  constexpr int n_rows = 10;
  constexpr int n_cols = 5;
  const std::string out_path{"/tmp/test/test_mat.h5"};
  
  gen(n_rows, n_cols, out_path);
  
  return 0;
}
// h5c++ read.cpp -o read
#include <string>
#define ARMA_USE_HDF5
#include <armadillo>
// Reader entry point: loads the "mydata" dataset from /tmp/test/test_mat1.h5
// (presumably the first chunk produced by the split step - confirm against
// the workflow) and prints it to stdout.
//
// Returns 0 on success and 1 if the matrix could not be loaded.
int main()
{
  const std::string fname = "/tmp/test/test_mat1.h5";
  
  arma::fmat x;
  // load() signals failure through its boolean result; without this check a
  // missing file or dataset would be reported as success.
  if (!x.load(arma::hdf5_name(fname, "mydata")))
  {
    std::cerr << "read: could not load dataset \"mydata\" from " << fname << std::endl;
    return 1;
  }
  
  std::cout << x << std::endl;
  return 0;
}
# Attach hdfmat without its startup messages.
suppressMessages(library(hdfmat))

# --- Settings -----------------------------------------------------------------
# Input file is <storage_path>/<fname_preface><post>; the matrix lives in the
# dataset `varname` and is split into `nchunks` pieces.
storage_path <- "/tmp/test"
fname_preface <- "test_mat"
post <- ".h5"
varname <- "mydata"
nchunks <- 3

# --- Open the source matrix ---------------------------------------------------
h <- hdfmat_open(
  file.path(storage_path, paste0(fname_preface, post)),
  varname
)
# Second entry of the dimension vector, used below as the column count.
ncols <- h$dim()[2]
# Split n items into k chunk sizes that differ by at most one.
#
# Arguments:
#   n  total number of items to distribute.
#   k  number of chunks.
#
# Returns a vector of k sizes summing to n; the first n %% k chunks are one
# larger than the rest. When n == k every chunk has size 1.
#
# Raises an error if k exceeds n (some chunk would be empty) or if k < 1 while
# n differs from k.
split_n_by_k = function(n, k){
  if (k > n)
    stop("split_n_by_k: cannot split n = ", n, " items into k = ", k,
         " non-empty chunks (k must not exceed n)")
  if (n == k)
    return(rep(1, n))
  if (k < 1)
    stop("split_n_by_k: k must be at least 1, got k = ", k)
  
  t = n %/% k   # base size of every chunk
  r = n %% k    # leftover items, handed out one each to the first r chunks
  ret = rep(t, k)
  if (r > 0)
    ret[1:r] = ret[1:r] + 1
  ret
}
# Number of columns per chunk, and the (inclusive) last column of each chunk.
chunk_lens = split_n_by_k(ncols, nchunks)
stops = cumsum(chunk_lens)

# Copy each run of consecutive columns into its own file named
# <storage_path>/<fname_preface><chunk><post>.
# seq_along() rather than 1:nchunks so a zero-length split skips the loop
# instead of iterating over c(1, 0).
for (chunk in seq_along(chunk_lens)){
  col_stop = stops[chunk]
  col_start = col_stop - chunk_lens[chunk] + 1
  x = h$read(col_start=col_start, col_stop=col_stop)
  
  fname_chunk = file.path(storage_path, paste0(fname_preface, chunk, post))
  h_chunk = hdfmat(file=fname_chunk, varname, nrow(x), ncol(x), type="float")
  h_chunk$fill(x)
  h_chunk$close()
  
  # Collect after every chunk so the previous block of columns can be
  # released before the next one is read.
  invisible(gc())
}

# Every chunk handle is closed above; close the source handle as well.
# NOTE(review): assumes the object returned by hdfmat_open() offers the same
# close() method as the one returned by hdfmat() - confirm.
h$close()
@wrathematics
Copy link
Author

The workflow is:

  • generate
  • split
  • read

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment