Benchmarking for this gist: https://gist.github.com/PeteHaitch/75d6f7fd0566767e1e80
sim_data <- function(n, m, d, sim_strand = FALSE){
if (d >= n){
stop("Require d < n")
}
i <- sample(n - d, d)
chromosome <- sample(x = 24L, size = n - d, replace = TRUE)
chromosome <- c(chromosome, chromosome[i])
if (sim_strand){
strand <- sample(x = 3L, size = n - d, replace = TRUE)
} else{
strand <- rep(x = 3L, times = n - d)
}
strand <- c(strand, strand[i])
pos <- matrix(sort(sample(x = 250000000L, size = (n - d) * m, replace = TRUE)), ncol = m)
pos <- rbind(pos, pos[i, ], deparse.level = 0)
cbind(chromosome, strand, pos, deparse.level = 0)
}
require(data.table)
sim_data_dt <- function(x) {
as.data.table(x)
}
n <- 2000000
## as a matrix
m1 <- sim_data(n = n, m = 1, d = 100)
m2 <- sim_data(n = n, m = 2, d = 100)
m3 <- sim_data(n = n, m = 3, d = 100)
m8 <- sim_data(n = n, m = 8, d = 100)
## as a data.table
system.time(dt1 <- sim_data_dt(m1)) ## 0.185 seconds
system.time(dt2 <- sim_data_dt(m2)) ## 0.160 seconds
system.time(dt3 <- sim_data_dt(m3)) ## 0.213 seconds
system.time(dt8 <- sim_data_dt(m8)) ## 0.480 seconds
system.time(m.ans1 <- duplicated(m1)) ## 73 seconds
system.time(m.ans2 <- duplicated(m2)) ## 59 seconds
system.time(m.ans3 <- duplicated(m3)) ## 76 seconds
system.time(m.ans4 <- duplicated(m8)) ## 109 seconds
system.time(dt.ans1 <- duplicated(dt1)) ## 0.217 seconds
system.time(dt.ans2 <- duplicated(dt2)) ## 0.305 seconds
system.time(dt.ans3 <- duplicated(dt3)) ## 0.300 seconds
system.time(dt.ans4 <- duplicated(dt8)) ## 0.404 seconds
identical(m.ans1, dt.ans1) # [1] TRUE
identical(m.ans2, dt.ans2) # [1] TRUE
identical(m.ans3, dt.ans3) # [1] TRUE
identical(m.ans4, dt.ans4) # [1] TRUE
library(GenomicRanges)
matrix2GR <- function(x){
if (ncol(x) == 3L){
GRanges(seqnames = x[, 1], ranges = IRanges(start = x[, 3], width = 1), strand = ifelse(x[, 2] == 1L, '+', ifelse(x[, 2] == 2L, '-', '*')))
} else if (ncol(x) == 4L){
GRanges(seqnames = x[, 1], ranges = IRanges(start = x[, 3], width = x[, 4]), strand = ifelse(x[, 2] == 1L, '+', ifelse(x[, 2] == 2L, '-', '*')))
} else{
stop("Can't convert when m > 2")
}
}
## Creating the object takes quite some time (compared to getting it as a data.table)
system.time(y1 <- matrix2GR(m1)) ## 5.7 seconds
system.time(y2 <- matrix2GR(m2)) ## 5.6 seconds
system.time(gr.ans1 <- duplicated(y1)) ## 0.29 seconds
system.time(gr.ans2 <- duplicated(y2)) ## 0.31 seconds
identical(gr.ans1, m.ans1) ## [1] TRUE
identical(gr.ans2, m.ans2) ## [1] TRUE