Skip to content

Instantly share code, notes, and snippets.

@aaronwolen
Created April 7, 2023 14:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aaronwolen/5167b391876b4a016baffd8a59631f35 to your computer and use it in GitHub Desktop.
Save aaronwolen/5167b391876b4a016baffd8a59631f35 to your computer and use it in GitHub Desktop.
Quick examples of querying dense arrays with TileDB-R

TileDB Dense Array Queries

Setup

Create a dense matrix of random integers.

library(tiledb)
# Seed the RNG so the sampled values (and the printed matrix) are reproducible.
set.seed(123)

# Matrix dimensions: number of rows and number of columns.
nr <- 50
nc <- 8

# Draw nr * nc integers from 1..nr with replacement, then shape the vector
# into an nr x nc matrix (values fill column by column, as matrix() would).
m <- sample(seq_len(nr), size = nr * nc, replace = TRUE)
dim(m) <- c(nr, nc)
m

##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
##  [1,]   31   13   23   26   10   30    6    3
##  [2,]   15   18   15    9   25   31   42   18
##  [3,]   14   33   21    7    8   29   42   32
##  [4,]    3   27   37   34   18   17   38   17
##  [5,]   42   25    8   48    9   37   48   38
##  [6,]   50   38   10   13    7   20   16   42
##  [7,]   43   21   50   19    7   35   35   40
##  [8,]   37   15   42   47   10   25   36   42
##  [9,]   14   41   44   39   24   46   27   32
## [10,]   25   47   34    4   23   33   27   26
## [11,]   26   26   10    1   26    2   35   13
## [12,]   27   31   22   40   43    4   23   20
## [13,]    5   16   12   30   33   10    3   38
## [14,]   27   30   20   30   29   33   31   20
## [15,]   28    6   46   25   10    5   48   29
## [16,]    9   43   17   16   13   25    2   20
## [17,]   29    8   46   24   43    8   47   30
## [18,]   35   22   35   11   11   25   23   41
## [19,]    8   22   40   48   25   21    3   48
## [20,]   26   39   46   20   26   45    9    8
## [21,]    7   31   30   40    7   18   39   39
## [22,]   42   48   15    3   25   42   30   10
## [23,]    9   17   24   29   23   31   31   17
## [24,]   19   50   49   36   26    6   30   18
## [25,]   36   49   23   44   32    7    9   40
## [26,]   14   34   43   22   20   41   36   42
## [27,]   17    4    7   49   24   48   34   11
## [28,]   43   13   29   42    9   17   22   49
## [29,]   39    5   15   20   41   45   44   44
## [30,]   12   25   23   11   37   28   41    2
## [31,]   15   22   26    8   23   40   28   19
## [32,]   32   25   38   46   14    7   29   13
## [33,]   42   32   46   21   46   20   43   13
## [34,]   45   46   32   45    6   18   39   31
## [35,]    7   25    7    2   27   29   45   30
## [36,]    9   23   27   43    1   17   37   22
## [37,]   41   35   42   13   26   33   43   30
## [38,]   10   40    5   46   42    2   33   10
## [39,]   23   48    6    6   49   49   19   16
## [40,]   27   30   16    8   17    2   12    5
## [41,]    7   12   24   44   29   13   34   36
## [42,]   27   31   32   32   26   24   21   50
## [43,]   32   46   21   36   27   49   17   33
## [44,]   38   30   11   45   21    3   12   16
## [45,]   25   35   36   14    7   18   30   28
## [46,]   34   14   44   16   26    2   35   49
## [47,]   29   29   46   23   41   37   30   12
## [48,]    5   32   19   33   20   12   25   28
## [49,]    8    7   25   40    6   13   50    2
## [50,]   12    3   39   40   50   42   47   50

Ingest into a dense TileDB array:

# Location of the on-disk array inside this session's temporary directory.
uri <- file.path(tempdir(), "dense-array")

# Start from a clean slate: drop any array left behind by an earlier run.
if (dir.exists(uri)) {
  unlink(uri, recursive = TRUE)
}

# Write the matrix to a TileDB array at `uri`, then print the resulting schema.
tiledb::fromMatrix(obj = m, uri = uri)
tiledb::schema(uri)

## tiledb_array_schema(
##     domain=tiledb_domain(c(tiledb_dim(name="rows", domain=c(1L,50L), tile=50L, type="INT32"), tiledb_dim(name="cols", domain=c(1L,8L), tile=8L, type="INT32"))),
##     attrs=c(tiledb_attr(name="x", type="INT32", ncells=1, nullable=FALSE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))))),
##     cell_order="COL_MAJOR", tile_order="COL_MAJOR", capacity=10000, sparse=FALSE, allows_dups=FALSE,
##     coords_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
##     offsets_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
##     validity_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("RLE"),"COMPRESSION_LEVEL",-1)))
## )

Range queries

Slicing either dimension by a range produces a matrix identical to what you would get from R:

# Reference slice taken directly from the in-memory matrix:
# the first 10 rows and the first 5 columns.
m1 <- m[seq_len(10), seq_len(5)]
m1

##       [,1] [,2] [,3] [,4] [,5]
##  [1,]   31   13   23   26   10
##  [2,]   15   18   15    9   25
##  [3,]   14   33   21    7    8
##  [4,]    3   27   37   34   18
##  [5,]   42   25    8   48    9
##  [6,]   50   38   10   13    7
##  [7,]   43   21   50   19    7
##  [8,]   37   15   42   47   10
##  [9,]   14   41   44   39   24
## [10,]   25   47   34    4   23

This is true whether indexing via `[`-notation:

# Open the array and slice it with `[`. With contiguous index ranges this
# fetches the 10x5 matrix, matching m[1:10, 1:5].
# (No trailing comma after the last argument: a dangling comma passes an
# extra, empty positional argument to tiledb_array().)
r1a <- tiledb_array(
  uri = uri,
  return_as = "matrix"
)[1:10, 1:5]

r1a

##       [,1] [,2] [,3] [,4] [,5]
##  [1,]   31   13   23   26   10
##  [2,]   15   18   15    9   25
##  [3,]   14   33   21    7    8
##  [4,]    3   27   37   34   18
##  [5,]   42   25    8   48    9
##  [6,]   50   38   10   13    7
##  [7,]   43   21   50   19    7
##  [8,]   37   15   42   47   10
##  [9,]   14   41   44   39   24
## [10,]   25   47   34    4   23
## attr(,"query_status")
## [1] "COMPLETE"

# Element-wise check that every value read back from TileDB matches the
# in-memory slice. `==` compares cell values only, so the extra
# "query_status" attribute carried by r1a does not affect the result.
all(m1 == r1a)
## [1] TRUE

Or using the selected_ranges argument:

# `selected_ranges` takes one matrix per dimension whose rows are
# (start, end) pairs. A single row per dimension expresses the whole slice:
# rows 1 through 10 and columns 1 through 5.
r1b <- tiledb_array(
  uri = uri,
  return_as = "matrix",
  selected_ranges = list(cbind(1L, 10L), cbind(1L, 5L))
)[]
r1b

##       [,1] [,2] [,3] [,4] [,5]
##  [1,]   31   13   23   26   10
##  [2,]   15   18   15    9   25
##  [3,]   14   33   21    7    8
##  [4,]    3   27   37   34   18
##  [5,]   42   25    8   48    9
##  [6,]   50   38   10   13    7
##  [7,]   43   21   50   19    7
##  [8,]   37   15   42   47   10
##  [9,]   14   41   44   39   24
## [10,]   25   47   34    4   23
## attr(,"query_status")
## [1] "COMPLETE"

# Element-wise check that the selected_ranges query returned the same values
# as the in-memory slice m1.
all(m1 == r1b)
## [1] TRUE

Point queries

Expected output for point queries.

# Reference result from the in-memory matrix: rows 3 and 5 crossed with
# columns 3 and 5, i.e. a 2x2 matrix of four individual cells.
m2 <- m[c(3, 5), c(3, 5)]
m2

##      [,1] [,2]
## [1,]   21    8
## [2,]    8    9

Point queries performed with `[`-notation don't produce the expected output. Instead, they return the full block spanning the minimum to the maximum of the selected points in each dimension:

# `[` indexing on the array treats c(3, 5) as the span 3..5 in each dimension
# rather than as two separate points, so this
# returns a matrix equivalent to m[3:5, 3:5]
r2a <- tiledb_array(
  uri = uri,
  return_as = "matrix"
)[c(3, 5), c(3, 5)]
r2a

##      [,1] [,2] [,3]
## [1,]   21    7    8
## [2,]   37   34   18
## [3,]    8   48    9
## attr(,"query_status")
## [1] "COMPLETE"

# all(m2 == r2a)
# ERROR: Non-conformable arrays

Point queries performed with selected_points do produce the expected output:

# Request individual coordinates via `selected_points`: one vector of points
# per dimension (rows 3 and 5, columns 3 and 5). The empty `[]` runs the query
# using the selections configured on the array object.
r2b <- tiledb_array(
  uri = uri,
  return_as = "matrix",
  selected_points = list(c(3, 5), c(3, 5))
)[]
r2b

##      [,1] [,2]
## [1,]   21    8
## [2,]    8    9
## attr(,"query_status")
## [1] "COMPLETE"

# Element-wise check that the selected_points query matches the in-memory
# point selection m2.
all(m2 == r2b)
## [1] TRUE

Attribute filters

Attribute filters result in NAs for values that don't meet the filter criteria:

# Point query on rows/columns 3 and 5, with an added condition on the array's
# attribute `x`: only cells with x > 13 keep their value, and cells that fail
# the condition are returned as NA in the resulting matrix.
r3b <- tiledb_array(
  uri = uri,
  return_as = "matrix",
  selected_points = list(c(3, 5), c(3, 5)),
  query_condition = parse_query_condition(x > 13)
)[]

r3b

##      [,1] [,2]
## [1,]   21   NA
## [2,]   NA   NA
## attr(,"query_status")
## [1] "COMPLETE"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment