Skip to content

Instantly share code, notes, and snippets.

@teunbrand
Last active October 20, 2023 13:34
Show Gist options
  • Save teunbrand/28f989168d8d801cdd5777f167e63dfd to your computer and use it in GitHub Desktop.
Save teunbrand/28f989168d8d801cdd5777f167e63dfd to your computer and use it in GitHub Desktop.
First attempt at 2D run-length encoding
rle_2d <- function(matrix) {
# If we have an empty matrix, we return 0-row data.frame
if (prod(dim(matrix)) < 1) {
ans <- data.frame(
col.start = integer(),
col.end = integer(),
row.start = integer(),
row.end = integer(),
value = as.vector(matrix)
)
return(ans)
}
# Simplified case when there is only one column
if (ncol(matrix) == 1) {
rle <- rle(matrix[, 1])
end <- cumsum(rle$lengths)
ans <- data.frame(
col.start = 1L,
col.end = 1L,
row.start = end - rle$lengths + 1,
row.end = end,
value = rle$values
)
return(ans)
}
# We first run-length encode every row of the matrix
runs <- lapply(seq_len(nrow(matrix)), function(row) {
rle <- rle(matrix[row,])
end <- cumsum(rle$lengths)
data.frame(
col.start = end - rle$lengths + 1,
col.end = end,
row.start = row,
row.end = row,
value = rle$values
)
})
# Simplified case when there is only one row
if (nrow(matrix) == 1) {
return(runs[[1]])
}
# Looping over every row that isn't the last row
for (row in seq_len(length(runs) - 1)) {
current <- runs[[row]]
next_rows <- (row + 1):length(runs)
# For every run in current row
for (run in seq_len(nrow(current))) {
# Try to find match in subsequent rows
for (following in next_rows) {
follow <- runs[[following]]
# Does any value in the next row match? (Cheap test)
value_match <- follow$value == current$value[run]
if (!any(value_match)) {
# No value match for this run, restart for next run in same row
break
}
# Do also the positions match? (More expensive)
matched <- which(
follow$col.start == current$col.start[run] &
follow$col.end == current$col.end[run] &
value_match
)
if (length(matched) == 0) {
# No position match for this run, restart for next run in same row
break
}
# Merge match into current run
current$row.end[run] <- follow$row.end[matched[1]]
runs[[following]] <- follow[-matched[1], , drop = FALSE]
}
}
# Update row when finished
runs[[row]] <- current
}
do.call(rbind, runs)
}
# Version 2.0 inspired by Tim Taylor's hash table approach
rle_2d2 <- function(m, byrow = FALSE) {
n <- length(m)
if (n == 0L) {
ans <- data.frame(
col_start = integer(),
col_end = integer(),
row_start = integer(),
row_end = integer(),
value = as.vector(matrix)
)
return(ans)
}
if (isTRUE(byrow)) {
m <- t(m)
}
dim <- dim(m)
# Treat matrix content as levels, so we can deal with NAs
levels <- unique(m)
m <- matrix(match(m, levels), nrow(m), ncol(m))
# Simplified case when m has only a single row
if (dim[1] == 1L) {
rle <- rle(as.vector(m))
ends <- cumsum(rle$lengths)
ans <- data.frame(
col_start = ends - rle$lengths + 1,
col_end = ends,
row_start = 1L,
row_end = 1L,
value = levels[rle$values]
)
}
# Run length encoding by column
# 'By column' just means adding columns as change points
ends <- c(which(m[-1] != m[-n] | (row(m) == nrow(m))[-n]), n)
lengths <- diff(c(0L, ends))
values <- m[ends]
starts <- ends - lengths + 1L
# Simplified case when m has only a single column
if (dim[2] == 1L) {
ans <- data.frame(
col_start = 1L,
col_end = 1L,
row_start = starts,
row_end = ends,
value = levels[values]
)
}
# Translate to indices
row_start <- arrayInd(starts, dim)[, 1]
row_end <- row_start + lengths - 1L
col_start <- col_end <- arrayInd(ends, dim)[, 2]
# Initialise hash table no longer than number of runs
# Inspiration for using hash tables for this problem taken from TimTaylor:
# https://fosstodon.org/@_TimTaylor/111266682218212785
htab <- hashtab(size = length(values))
for (i in seq_along(values)) {
# Lookup if there has been a similar column
key <- c(row_start[i], row_end[i], values[i])
hsh <- gethash(htab, key)
if (!is.null(hsh) && col_start[i] == col_end[hsh] + 1L) {
# Matches run in previous column, merge by updating column end and
# deleting current run (NA value will be filtered out later)
col_end[hsh] <- col_start[i]
values[i] <- NA_integer_
} else {
# Add run-index to the table
sethash(htab, key, i)
}
}
# For small matrices, this is the expensive step
ans <- data.frame(
col_start = col_start,
col_end = col_end,
row_start = row_start,
row_end = row_end,
value = levels[values]
)[!is.na(values), , drop = FALSE]
# Simply rename columns
if (isTRUE(byrow)) {
names(ans) <- c("row_start", "row_end", "col_start", "col_end", "value")
}
ans
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment