ivyleavedtoadflax/max_rle_df.R

## max_rle_df.R
# Have a look at rle in the console. This function is largely bastardised from
# it. I have used a testing framework testthat to ensure that the outcome is
# what we expect it to be. You can add more tests as required!

# Define the max_rl function ----

max_rl <- function(x, val) {

  # x is the target vector, val is the value for which we want to know the longest
  # run length.

  # Test that we are passing a vector a list to the function

  if (!is.vector(x) && !is.list(x)) stop("'x' must be a vector of an atomic type")

  # Caclulate length of vector

  n <- length(x)

  # Offset the vector with itself minus the last value

  y <- x[-1L] != x[-n]

  # Calculate index

  i <- c(which(y | is.na(y)), n)

  # Calculate length and values as in rle

  lengths <- diff(c(0L, i))
  values <-  x[i]

  # Determine the max which equals val. Need to first check that val is present
  # in values, if not return 0.

  if (val %in% values) {

    max_val <- max(lengths[values == val])

  } else {

    max_val <- 0

  }

  # This step appears to be necessary to avoid issues when calling map_int later

  max_val <- as.integer(max_val)

  return(max_val)

}

# Test max_rl function ----

# Now lets run a suite of tests to check that this function works in the cases
# that we expect it to.

library(testthat)

test_that(
  "max_rl behaves as expected",
  {

    # Case when we have a run of 5 1s, and we want to count the max run of 1s

    expect_equal(
      5,
      max_rl(c(1,1,1,1,1), 1)
    )

    # Case when we have fewer than the length of the vector

    expect_equal(
      2,
      max_rl(c(1,0,1,1,0), 1)
    )

    # Case when there are no values that we are interested in at all.

    expect_equal(
      0,
      max_rl(c(1,1,1,1,1), 0)
    )

  })

# Define the max_rl_df function ----

# This seems to work. Now lets wrap this up into a function that can handle dataframes.

max_rl_df <- function(x, val) {

  # First check that x is a dataframe or a matrix (note this encompasses data_frames/
  # tibbles).

  if (!is.data.frame(x) && !is.matrix(x)) stop("'x' must be a data.frame or matrix")

  # Takes a df an val as an argument to pass to max_rl

  x <- apply(x, MARGIN = 1, FUN = function(x) max_rl(x, val))

  return(x)

}

# Test the max_rl function ----

# Now run obligatory tests on max_rl_df()

test_that(
  'Test that max_rl_df works as expected',
  {

    # Create the dataframe

    gap <- data.frame(
      year1 = c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE),
      year2 = c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE),
      year3 = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE),
      year4 = c(TRUE, FALSE, FALSE, TRUE, FALSE, FALSE),
      year5 = c(TRUE, FALSE, TRUE, FALSE, FALSE, FALSE)
    )

    # Do a manual count and check that function returns expected.

    expect_equal(
      c(5,1,1,4,1,0),
      max_rl_df(gap, TRUE)
    )

    # Do the same for FALSE cases

    expect_equal(
      c(0,3,1,1,2,5),
      max_rl_df(gap, FALSE)
    )

    # Test that it also works on matrices

    expect_equal(
      c(0,3,1,1,2,5),
      max_rl_df(as.matrix(gap), FALSE)
    )

  }
)

# Apply the max_rl function ----

# All good, so how might you use this in a pipeline?
# Create the dataframe again

gap <- data.frame(
  year1 = c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE),
  year2 = c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE),
  year3 = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE),
  year4 = c(TRUE, FALSE, FALSE, TRUE, FALSE, FALSE),
  year5 = c(TRUE, FALSE, TRUE, FALSE, FALSE, FALSE)
)

# Now bind the dataframe to the max length of the TRUEs.

gap <- cbind(
  gap,
  len_TRUE = max_rl_df(gap, TRUE),
  len_FALSE = max_rl_df(gap, FALSE)
)

gap
	# Have a look at rle in the console. This function is largely bastardised from
	# it. I have used a testing framework testthat to ensure that the outcome is
	# what we expect it to be. You can add more tests as required!

	# Define the max_rl function ----

	max_rl <- function(x, val) {

	# x is the target vector, val is the value for which we want to know the longest
	# run length.

	# Test that we are passing a vector a list to the function

	if (!is.vector(x) && !is.list(x)) stop("'x' must be a vector of an atomic type")

	# Caclulate length of vector

	n <- length(x)

	# Offset the vector with itself minus the last value

	y <- x[-1L] != x[-n]

	# Calculate index

	i <- c(which(y \| is.na(y)), n)

	# Calculate length and values as in rle

	lengths <- diff(c(0L, i))
	values <- x[i]

	# Determine the max which equals val. Need to first check that val is present
	# in values, if not return 0.

	if (val %in% values) {

	max_val <- max(lengths[values == val])

	} else {

	max_val <- 0

	}

	# This step appears to be necessary to avoid issues when calling map_int later

	max_val <- as.integer(max_val)

	return(max_val)

	}

	# Test max_rl function ----

	# Now lets run a suite of tests to check that this function works in the cases
	# that we expect it to.

	library(testthat)

	test_that(
	"max_rl behaves as expected",
	{

	# Case when we have a run of 5 1s, and we want to count the max run of 1s

	expect_equal(
	5,
	max_rl(c(1,1,1,1,1), 1)
	)

	# Case when we have fewer than the length of the vector

	expect_equal(
	2,
	max_rl(c(1,0,1,1,0), 1)
	)

	# Case when there are no values that we are interested in at all.

	expect_equal(
	0,
	max_rl(c(1,1,1,1,1), 0)
	)

	})

	# Define the max_rl_df function ----

	# This seems to work. Now lets wrap this up into a function that can handle dataframes.

	max_rl_df <- function(x, val) {

	# First check that x is a dataframe or a matrix (note this encompasses data_frames/
	# tibbles).

	if (!is.data.frame(x) && !is.matrix(x)) stop("'x' must be a data.frame or matrix")

	# Takes a df an val as an argument to pass to max_rl

	x <- apply(x, MARGIN = 1, FUN = function(x) max_rl(x, val))

	return(x)

	}

	# Test the max_rl function ----

	# Now run obligatory tests on max_rl_df()

	test_that(
	'Test that max_rl_df works as expected',
	{

	# Create the dataframe

	gap <- data.frame(
	year1 = c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE),
	year2 = c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE),
	year3 = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE),
	year4 = c(TRUE, FALSE, FALSE, TRUE, FALSE, FALSE),
	year5 = c(TRUE, FALSE, TRUE, FALSE, FALSE, FALSE)
	)

	# Do a manual count and check that function returns expected.

	expect_equal(
	c(5,1,1,4,1,0),
	max_rl_df(gap, TRUE)
	)

	# Do the same for FALSE cases

	expect_equal(
	c(0,3,1,1,2,5),
	max_rl_df(gap, FALSE)
	)

	# Test that it also works on matrices

	expect_equal(
	c(0,3,1,1,2,5),
	max_rl_df(as.matrix(gap), FALSE)
	)

	}
	)

	# Apply the max_rl function ----

	# All good, so how might you use this in a pipeline?
	# Create the dataframe again

	gap <- data.frame(
	year1 = c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE),
	year2 = c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE),
	year3 = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE),
	year4 = c(TRUE, FALSE, FALSE, TRUE, FALSE, FALSE),
	year5 = c(TRUE, FALSE, TRUE, FALSE, FALSE, FALSE)
	)

	# Now bind the dataframe to the max length of the TRUEs.

	gap <- cbind(
	gap,
	len_TRUE = max_rl_df(gap, TRUE),
	len_FALSE = max_rl_df(gap, FALSE)
	)

	gap