teunbrand/rle_2d.R

## rle_2d.R
rle_2d <- function(matrix) {

  # If we have an empty matrix, we return 0-row data.frame
  if (prod(dim(matrix)) < 1) {
    ans <- data.frame(
      col.start = integer(),
      col.end   = integer(),
      row.start = integer(),
      row.end   = integer(),
      value     = as.vector(matrix)
    )
    return(ans)
  }

  # Simplified case when there is only one column
  if (ncol(matrix) == 1) {
    rle <- rle(matrix[, 1])
    end <- cumsum(rle$lengths)
    ans <- data.frame(
      col.start = 1L,
      col.end   = 1L,
      row.start = end - rle$lengths + 1,
      row.end   = end,
      value     = rle$values
    )
    return(ans)
  }

  # We first run-length encode every row of the matrix
  runs <- lapply(seq_len(nrow(matrix)), function(row) {
    rle <- rle(matrix[row,])
    end <- cumsum(rle$lengths)
    data.frame(
      col.start = end - rle$lengths + 1,
      col.end   = end,
      row.start = row,
      row.end   = row,
      value = rle$values
    )
  })

  # Simplified case when there is only one row
  if (nrow(matrix) == 1) {
    return(runs[[1]])
  }

  # Looping over every row that isn't the last row
  for (row in seq_len(length(runs) - 1)) {
    current   <- runs[[row]]
    next_rows <- (row + 1):length(runs)

    # For every run in current row
    for (run in seq_len(nrow(current))) {

      # Try to find match in subsequent rows
      for (following in next_rows) {
        follow <- runs[[following]]
        # Does any value in the next row match? (Cheap test)
        value_match <- follow$value == current$value[run]
        if (!any(value_match)) {
          # No value match for this run, restart for next run in same row
          break
        }
        # Do also the positions match? (More expensive)
        matched <- which(
          follow$col.start == current$col.start[run] &
          follow$col.end   == current$col.end[run] &
          value_match
        )
        if (length(matched) == 0) {
          # No position match for this run, restart for next run in same row
          break
        }
        # Merge match into current run
        current$row.end[run] <- follow$row.end[matched[1]]
        runs[[following]] <- follow[-matched[1], , drop = FALSE]
      }
    }
    # Update row when finished
    runs[[row]] <- current
  }
  do.call(rbind, runs)
}

# Version 2.0 inspired by Tim Taylor's hash table approach
rle_2d2 <- function(m, byrow = FALSE) {

  n <- length(m)

  if (n == 0L) {
    ans <- data.frame(
      col_start = integer(),
      col_end   = integer(),
      row_start = integer(),
      row_end   = integer(),
      value     = as.vector(matrix)
    )
    return(ans)
  }

  if (isTRUE(byrow)) {
    m <- t(m)
  }
  dim <- dim(m)

  # Treat matrix content as levels, so we can deal with NAs
  levels <- unique(m)
  m <- matrix(match(m, levels), nrow(m), ncol(m))

  # Simplified case when m has only a single row
  if (dim[1] == 1L) {
    rle  <- rle(as.vector(m))
    ends <- cumsum(rle$lengths)
    ans <- data.frame(
      col_start = ends - rle$lengths + 1,
      col_end   = ends,
      row_start = 1L,
      row_end   = 1L,
      value     = levels[rle$values]
    )
  }

  # Run length encoding by column
  # 'By column' just means adding columns as change points
  ends    <- c(which(m[-1] != m[-n] | (row(m) == nrow(m))[-n]), n)
  lengths <- diff(c(0L, ends))
  values  <- m[ends]
  starts  <- ends - lengths + 1L

  # Simplified case when m has only a single column
  if (dim[2] == 1L) {
    ans <- data.frame(
      col_start = 1L,
      col_end   = 1L,
      row_start = starts,
      row_end   = ends,
      value     = levels[values]
    )
  }

  # Translate to indices
  row_start <- arrayInd(starts, dim)[, 1]
  row_end   <- row_start + lengths - 1L
  col_start <- col_end <- arrayInd(ends, dim)[, 2]

  # Initialise hash table no longer than number of runs
  # Inspiration for using hash tables for this problem taken from TimTaylor:
  # https://fosstodon.org/@_TimTaylor/111266682218212785
  htab <- hashtab(size = length(values))

  for (i in seq_along(values)) {

    # Lookup if there has been a similar column
    key <- c(row_start[i], row_end[i], values[i])
    hsh <- gethash(htab, key)


    if (!is.null(hsh) && col_start[i] == col_end[hsh] + 1L) {
      # Matches run in previous column, merge by updating column end and
      # deleting current run (NA value will be filtered out later)
      col_end[hsh] <- col_start[i]
      values[i] <- NA_integer_
    } else {
      # Add run-index to the table
      sethash(htab, key, i)
    }
  }

  # For small matrices, this is the expensive step
  ans <- data.frame(
    col_start = col_start,
    col_end   = col_end,
    row_start = row_start,
    row_end   = row_end,
    value     = levels[values]
  )[!is.na(values), , drop = FALSE]

  # Simply rename columns
  if (isTRUE(byrow)) {
    names(ans) <- c("row_start", "row_end", "col_start", "col_end", "value")
  }

  ans
}
	rle_2d <- function(matrix) {

	# If we have an empty matrix, we return 0-row data.frame
	if (prod(dim(matrix)) < 1) {
	ans <- data.frame(
	col.start = integer(),
	col.end = integer(),
	row.start = integer(),
	row.end = integer(),
	value = as.vector(matrix)
	)
	return(ans)
	}

	# Simplified case when there is only one column
	if (ncol(matrix) == 1) {
	rle <- rle(matrix[, 1])
	end <- cumsum(rle$lengths)
	ans <- data.frame(
	col.start = 1L,
	col.end = 1L,
	row.start = end - rle$lengths + 1,
	row.end = end,
	value = rle$values
	)
	return(ans)
	}

	# We first run-length encode every row of the matrix
	runs <- lapply(seq_len(nrow(matrix)), function(row) {
	rle <- rle(matrix[row,])
	end <- cumsum(rle$lengths)
	data.frame(
	col.start = end - rle$lengths + 1,
	col.end = end,
	row.start = row,
	row.end = row,
	value = rle$values
	)
	})

	# Simplified case when there is only one row
	if (nrow(matrix) == 1) {
	return(runs[[1]])
	}

	# Looping over every row that isn't the last row
	for (row in seq_len(length(runs) - 1)) {
	current <- runs[[row]]
	next_rows <- (row + 1):length(runs)

	# For every run in current row
	for (run in seq_len(nrow(current))) {

	# Try to find match in subsequent rows
	for (following in next_rows) {
	follow <- runs[[following]]
	# Does any value in the next row match? (Cheap test)
	value_match <- follow$value == current$value[run]
	if (!any(value_match)) {
	# No value match for this run, restart for next run in same row
	break
	}
	# Do also the positions match? (More expensive)
	matched <- which(
	follow$col.start == current$col.start[run] &
	follow$col.end == current$col.end[run] &
	value_match
	)
	if (length(matched) == 0) {
	# No position match for this run, restart for next run in same row
	break
	}
	# Merge match into current run
	current$row.end[run] <- follow$row.end[matched[1]]
	runs[[following]] <- follow[-matched[1], , drop = FALSE]
	}
	}
	# Update row when finished
	runs[[row]] <- current
	}
	do.call(rbind, runs)
	}

	# Version 2.0 inspired by Tim Taylor's hash table approach
	rle_2d2 <- function(m, byrow = FALSE) {

	n <- length(m)

	if (n == 0L) {
	ans <- data.frame(
	col_start = integer(),
	col_end = integer(),
	row_start = integer(),
	row_end = integer(),
	value = as.vector(matrix)
	)
	return(ans)
	}

	if (isTRUE(byrow)) {
	m <- t(m)
	}
	dim <- dim(m)

	# Treat matrix content as levels, so we can deal with NAs
	levels <- unique(m)
	m <- matrix(match(m, levels), nrow(m), ncol(m))

	# Simplified case when m has only a single row
	if (dim[1] == 1L) {
	rle <- rle(as.vector(m))
	ends <- cumsum(rle$lengths)
	ans <- data.frame(
	col_start = ends - rle$lengths + 1,
	col_end = ends,
	row_start = 1L,
	row_end = 1L,
	value = levels[rle$values]
	)
	}

	# Run length encoding by column
	# 'By column' just means adding columns as change points
	ends <- c(which(m[-1] != m[-n] \| (row(m) == nrow(m))[-n]), n)
	lengths <- diff(c(0L, ends))
	values <- m[ends]
	starts <- ends - lengths + 1L

	# Simplified case when m has only a single column
	if (dim[2] == 1L) {
	ans <- data.frame(
	col_start = 1L,
	col_end = 1L,
	row_start = starts,
	row_end = ends,
	value = levels[values]
	)
	}

	# Translate to indices
	row_start <- arrayInd(starts, dim)[, 1]
	row_end <- row_start + lengths - 1L
	col_start <- col_end <- arrayInd(ends, dim)[, 2]

	# Initialise hash table no longer than number of runs
	# Inspiration for using hash tables for this problem taken from TimTaylor:
	# https://fosstodon.org/@_TimTaylor/111266682218212785
	htab <- hashtab(size = length(values))

	for (i in seq_along(values)) {

	# Lookup if there has been a similar column
	key <- c(row_start[i], row_end[i], values[i])
	hsh <- gethash(htab, key)


	if (!is.null(hsh) && col_start[i] == col_end[hsh] + 1L) {
	# Matches run in previous column, merge by updating column end and
	# deleting current run (NA value will be filtered out later)
	col_end[hsh] <- col_start[i]
	values[i] <- NA_integer_
	} else {
	# Add run-index to the table
	sethash(htab, key, i)
	}
	}

	# For small matrices, this is the expensive step
	ans <- data.frame(
	col_start = col_start,
	col_end = col_end,
	row_start = row_start,
	row_end = row_end,
	value = levels[values]
	)[!is.na(values), , drop = FALSE]

	# Simply rename columns
	if (isTRUE(byrow)) {
	names(ans) <- c("row_start", "row_end", "col_start", "col_end", "value")
	}

	ans
	}