PeteHaitch/faster_duplicated_method.R

## faster_duplicated_method.R
## Create some test data (a matrix)
# The matrix, x, has at least 3 columns, all of which contain integers.
# The first column is an integer-encoding of a chromosome and so there are approximately 20-30 unique values.
# The second column is an integer-encoding of a genomic strand and so there are at most 3 unique values (representing positive, negative or unknown/irrelevant).
# The remaining columns are genomic positions, which are integers in the range of approximately 1-250,000,000.
# sim_data adds 'd' duplicates as the last 'd' rows
# n is the number of rows
# m + 2 is the number of columns. m = 1 is the minimum.
# d is the number of duplicates added to the end of the matrix
# sim_strand is whether the strand is simulated (column 2 of the matrix)
# NOTE: There might be additional duplicates in rows 1 to (n - d)
sim_data <- function(n, m, d, sim_strand = FALSE){
  if (d >= n){
    stop("Require d < n")
  }
  i <- sample(n - d, d)
  chromosome <- sample(x = 24L, size = n - d, replace = TRUE)
  chromosome <- c(chromosome, chromosome[i])
  if (sim_strand){
    strand <- sample(x = 3L, size = n - d, replace = TRUE)
  } else{
    strand <- rep(x = 3L, times = n - d)
  }
  strand <- c(strand, strand[i])
  pos <- matrix(sort(sample(x = 250000000L, size = (n - d) * m, replace = TRUE)), ncol = m)
  pos <- rbind(pos, pos[i, ], deparse.level = 0)

  cbind(chromosome, strand, pos, deparse.level = 0)
}

n <- 2000000
x_1 <- sim_data(n = n, m = 1, d = 100)
x_2 <- sim_data(n = n, m = 2, d = 100)
x_3 <- sim_data(n = n, m = 3, d = 100)
x_8 <- sim_data(n = n, m = 8, d = 100)

library(GenomicRanges) # Available from BioConductor. source("http://bioconductor.org/biocLite.R"); biocLite("GenomicRanges")
## A function to convert the matrix to a GRanges object (when m < 3)
## Can then use the kludgey fast duplicated method
matrix2GR <- function(x){
  if (ncol(x) == 3L){
    GRanges(seqnames = x[, 1], ranges = IRanges(start = x[, 3], width = 1), strand = ifelse(x[, 2] == 1L, '+', ifelse(x[, 2] == 2L, '-', '*')))
  } else if (ncol(x) == 4L){
    GRanges(seqnames = x[, 1], ranges = IRanges(start = x[, 3], width = x[, 4]), strand = ifelse(x[, 2] == 1L, '+', ifelse(x[, 2] == 2L, '-', '*')))
  } else{
    stop("Can't convert when m > 2")
  }
}

y_1 <- matrix2GR(x_1)
y_2 <- matrix2GR(x_2)

# Confirm these methods give identical results when m = 1 and m = 2
identical(duplicated(x_1, MARGIN = 1), duplicated(y_1))
identical(duplicated(x_2, MARGIN = 1), duplicated(y_2))

# Benchmark
system.time(duplicated(x_1, MARGIN = 1)) # 40 seconds
system.time(duplicated(y_1)) # 0.22 seconds
system.time(duplicated(x_2, MARGIN = 1)) # 41 seconds
system.time(duplicated(y_2)) # 0.23 seconds
system.time(duplicated(x_3, MARGIN = 1)) # 51 seconds
system.time(duplicated(x_8, MARGIN = 1)) # 69 seconds
	## Create some test data (a matrix)
	# The matrix, x, has at least 3 columns, all of which contain integers.
	# The first column is an integer-encoding of a chromosome and so there are approximately 20-30 unique values.
	# The second column is an integer-encoding of a genomic strand and so there are at most 3 unique values (representing positive, negative or unknown/irrelevant).
	# The remaining columns are genomic positions, which are integers in the range of approximately 1-250,000,000.
	# sim_data adds 'd' duplicates as the last 'd' rows
	# n is the number of rows
	# m + 2 is the number of columns. m = 1 is the minimum.
	# d is the number of duplicates added to the end of the matrix
	# sim_strand is whether the strand is simulated (column 2 of the matrix)
	# NOTE: There might be additional duplicates in rows 1 to (n - d)
	sim_data <- function(n, m, d, sim_strand = FALSE){
	if (d >= n){
	stop("Require d < n")
	}
	i <- sample(n - d, d)
	chromosome <- sample(x = 24L, size = n - d, replace = TRUE)
	chromosome <- c(chromosome, chromosome[i])
	if (sim_strand){
	strand <- sample(x = 3L, size = n - d, replace = TRUE)
	} else{
	strand <- rep(x = 3L, times = n - d)
	}
	strand <- c(strand, strand[i])
	pos <- matrix(sort(sample(x = 250000000L, size = (n - d) * m, replace = TRUE)), ncol = m)
	pos <- rbind(pos, pos[i, ], deparse.level = 0)

	cbind(chromosome, strand, pos, deparse.level = 0)
	}

	n <- 2000000
	x_1 <- sim_data(n = n, m = 1, d = 100)
	x_2 <- sim_data(n = n, m = 2, d = 100)
	x_3 <- sim_data(n = n, m = 3, d = 100)
	x_8 <- sim_data(n = n, m = 8, d = 100)

	library(GenomicRanges) # Available from BioConductor. source("http://bioconductor.org/biocLite.R"); biocLite("GenomicRanges")
	## A function to convert the matrix to a GRanges object (when m < 3)
	## Can then use the kludgey fast duplicated method
	matrix2GR <- function(x){
	if (ncol(x) == 3L){
	GRanges(seqnames = x[, 1], ranges = IRanges(start = x[, 3], width = 1), strand = ifelse(x[, 2] == 1L, '+', ifelse(x[, 2] == 2L, '-', '*')))
	} else if (ncol(x) == 4L){
	GRanges(seqnames = x[, 1], ranges = IRanges(start = x[, 3], width = x[, 4]), strand = ifelse(x[, 2] == 1L, '+', ifelse(x[, 2] == 2L, '-', '*')))
	} else{
	stop("Can't convert when m > 2")
	}
	}

	y_1 <- matrix2GR(x_1)
	y_2 <- matrix2GR(x_2)

	# Confirm these methods give identical results when m = 1 and m = 2
	identical(duplicated(x_1, MARGIN = 1), duplicated(y_1))
	identical(duplicated(x_2, MARGIN = 1), duplicated(y_2))

	# Benchmark
	system.time(duplicated(x_1, MARGIN = 1)) # 40 seconds
	system.time(duplicated(y_1)) # 0.22 seconds
	system.time(duplicated(x_2, MARGIN = 1)) # 41 seconds
	system.time(duplicated(y_2)) # 0.23 seconds
	system.time(duplicated(x_3, MARGIN = 1)) # 51 seconds
	system.time(duplicated(x_8, MARGIN = 1)) # 69 seconds