Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@arunsrinivasan
Last active August 29, 2015 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arunsrinivasan/9f8aebd1bb3ddce0c7fe to your computer and use it in GitHub Desktop.
Save arunsrinivasan/9f8aebd1bb3ddce0c7fe to your computer and use it in GitHub Desktop.
Benchmarking `duplicated.data.table`

Benchmarking for this gist: https://gist.github.com/PeteHaitch/75d6f7fd0566767e1e80

Function to simulate data as a matrix:

sim_data <- function(n, m, d, sim_strand = FALSE){
  if (d >= n){
    stop("Require d < n")
  }
  i <- sample(n - d, d)
  chromosome <- sample(x = 24L, size = n - d, replace = TRUE)
  chromosome <- c(chromosome, chromosome[i])
  if (sim_strand){
    strand <- sample(x = 3L, size = n - d, replace = TRUE)
  } else{
    strand <- rep(x = 3L, times = n - d)
  }
  strand <- c(strand, strand[i])
  pos <- matrix(sort(sample(x = 250000000L, size = (n - d) * m, replace = TRUE)), ncol = m)
  pos <- rbind(pos, pos[i, ], deparse.level = 0)
  
  cbind(chromosome, strand, pos, deparse.level = 0)
}

Function to simulate data as a data.table:

require(data.table)
sim_data_dt <- function(x) {
  as.data.table(x)
}

Generate Data:

n <- 2000000

## as a matrix
m1 <- sim_data(n = n, m = 1, d = 100)
m2 <- sim_data(n = n, m = 2, d = 100)
m3 <- sim_data(n = n, m = 3, d = 100)
m8 <- sim_data(n = n, m = 8, d = 100)

## as a data.table
system.time(dt1 <- sim_data_dt(m1)) ## 0.185 seconds
system.time(dt2 <- sim_data_dt(m2)) ## 0.160 seconds
system.time(dt3 <- sim_data_dt(m3)) ## 0.213 seconds
system.time(dt8 <- sim_data_dt(m8)) ## 0.480 seconds

Measure time and if the results are identical:

system.time(m.ans1 <- duplicated(m1)) ## 73 seconds
system.time(m.ans2 <- duplicated(m2)) ## 59 seconds
system.time(m.ans3 <- duplicated(m3)) ## 76 seconds
system.time(m.ans4 <- duplicated(m8)) ## 109 seconds

system.time(dt.ans1 <- duplicated(dt1)) ## 0.217 seconds
system.time(dt.ans2 <- duplicated(dt2)) ## 0.305 seconds
system.time(dt.ans3 <- duplicated(dt3)) ## 0.300 seconds
system.time(dt.ans4 <- duplicated(dt8)) ## 0.404 seconds

identical(m.ans1, dt.ans1) # [1] TRUE
identical(m.ans2, dt.ans2) # [1] TRUE
identical(m.ans3, dt.ans3) # [1] TRUE
identical(m.ans4, dt.ans4) # [1] TRUE

GRanges function:

library(GenomicRanges)
matrix2GR <- function(x){
  if (ncol(x) == 3L){
    GRanges(seqnames = x[, 1], ranges = IRanges(start = x[, 3], width = 1), strand = ifelse(x[, 2] == 1L, '+', ifelse(x[, 2] == 2L, '-', '*')))
  } else if (ncol(x) == 4L){
    GRanges(seqnames = x[, 1], ranges = IRanges(start = x[, 3], width = x[, 4]), strand = ifelse(x[, 2] == 1L, '+', ifelse(x[, 2] == 2L, '-', '*')))
  } else{
    stop("Can't convert when m > 2")
  }
}

## Creating the object takes quite some time (compared to getting it as a data.table)
system.time(y1 <- matrix2GR(m1)) ## 5.7 seconds
system.time(y2 <- matrix2GR(m2)) ## 5.6 seconds

system.time(gr.ans1 <- duplicated(y1)) ## 0.29 seconds
system.time(gr.ans2 <- duplicated(y2)) ## 0.31 seconds

identical(gr.ans1, m.ans1) ## [1] TRUE
identical(gr.ans2, m.ans2) ## [1] TRUE
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment