Skip to content

Instantly share code, notes, and snippets.

@ATpoint
Created February 25, 2020 08:36
Show Gist options
  • Save ATpoint/ceb0a13d3bcd1821c98c4354e94fd519 to your computer and use it in GitHub Desktop.
Save ATpoint/ceb0a13d3bcd1821c98c4354e94fd519 to your computer and use it in GitHub Desktop.
require(data.table)
require(microbenchmark)
require(GenomicRanges)
x <- fread("
chrom start end hgnc
1 100 200 MYC
1 150 300 MYC
1 400 500 MYC
1 150 230 TP53
1 200 350 TP53
1 420 550 TP53")
fromZx <- function(Input){
return(Input[, as.data.table(reduce(IRanges(start, end))), by = hgnc])
}
fromZx(x)
fromMike <- function(Input){
gr <- GRanges(Input)
# in short:
final <- unlist(reduce(split(gr, gr$hgnc)))
# split into GRangesList
grl <- split(gr, gr$hgnc)
# all(names(grl) == sort(unique(gr$hgnc))
# reduce each element (independently reduce ranges for each gene)
grl_redux <- reduce(grl) # element-wise, like lapply(grl, reduce)
# all(names(grl_redux) == names(grl)) & all(lengths(grl_redux) <= lengths(grl))
# return single GRanges, with rownames derived from hgnc
final <- unlist(grl_redux)
# all(names(final) == names(grl_redux))
return(final)
}
fromMike(x)
fromMe <- function(Input){
x.gr <- GRanges(seqnames = paste(Input$chrom, Input$hgnc, sep="_"),
ranges = IRanges(start = Input$start,
end = Input$end))
reduced.gr <- GenomicRanges::reduce(x.gr)
splitted <- strsplit(as.character(seqnames(reduced.gr)), split="_")
new.seqnames <- unlist(lapply(splitted, function(x)x[1]))
final <- GRanges(seqnames = new.seqnames,
ranges = ranges(reduced.gr))
elementMetadata(final) <- unlist(lapply(splitted, function(x)x[2]))
return(final)
}
fromMe(x)
Speed benchmarks:
> microbenchmark(fromZx(x))
Unit: milliseconds
expr min lq mean median uq max neval
fromZx(x) 5.662879 6.76689 9.452377 7.903199 11.01084 22.11675 100
> microbenchmark(fromMike(x))
Unit: milliseconds
expr min lq mean median uq max neval
fromMike(x) 97.53267 103.5992 126.5775 113.4482 131.6864 725.7022 100
> microbenchmark(fromMe(x))
Unit: milliseconds
expr min lq mean median uq max neval
fromMe(x) 29.36052 32.10575 40.09418 35.39014 42.27503 163.3626 100
@zx8754
Copy link

zx8754 commented Feb 26, 2020

This is what I have, yes on bigger data yours is most efficient, mike's is 1.8x and data.table is 22x times slower.

require(data.table)
require(microbenchmark)
require(GenomicRanges)

# SMALL data ------------------------------------------------------------------

x <- fread("
    chrom   start   end hgnc
    1   100 200 MYC
    1   150 300 MYC
    1   400 500 MYC
    1   150 230 TP53
    1   200 350 TP53
    1   420 550 TP53")


microbenchmark::microbenchmark(
  zx8754 = {
    x[, as.data.table(reduce(IRanges(start, end))), by = .(chrom, hgnc)]
    },
  mike = {
    gr <- GRanges(x)
    # in short:
    unlist(reduce(split(gr, gr$hgnc)))
  },
  atpoint = {
    x.gr <- GRanges(seqnames = paste(x$chrom, x$hgnc, sep="_"),
                    ranges = IRanges(start = x$start,
                                     end = x$end))
    
    reduced.gr <- GenomicRanges::reduce(x.gr)
    
    splitted <- strsplit(as.character(seqnames(reduced.gr)), split="_")
    
    new.seqnames <- unlist(lapply(splitted, function(x)x[1]))
    
    final <- GRanges(seqnames = new.seqnames,
                     ranges = ranges(reduced.gr))
    elementMetadata(final) <- unlist(lapply(splitted, function(x)x[2]))
    final
  }, unit = "relative"
)
# Unit: relative
#    expr      min       lq     mean   median       uq      max neval
#  zx8754 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
#    mike 6.809585 4.689092 4.897329 4.451481 4.859497 2.342624   100
# atpoint 3.901485 2.746156 2.884291 2.642569 2.971621 1.649575   100


# BIG data --------------------------------------------------------------------
library(biomaRt)

## get mouse transcript coordinates
mmusculus_genes <- getBM(attributes = c("ensembl_gene_id", "chromosome_name",
                                        "transcript_start", "transcript_end"),  
                         mart = useMart("ensembl", dataset = "mmusculus_gene_ensembl"))

## put in the exact same format as the example of the toplevel question,
## not caring about the fact that hgnc is not the correct name for mouse genes:
x_big <- data.table(chrom = mmusculus_genes$chromosome_name,
                    start = mmusculus_genes$transcript_start,
                    end   = mmusculus_genes$transcript_end, 
                    hgnc  = mmusculus_genes$ensembl_gene_id)

# subsetting onle 1st 1000 rows, should be enough to see the difference
x_big <- x_big[1:1000, ]

microbenchmark::microbenchmark(
  zx8754 = {
    x_big[, as.data.table(reduce(IRanges(start, end))), by = .(chrom, hgnc)]
  },
  mike = {
    gr <- GRanges(x_big)
    # in short:
    unlist(reduce(split(gr, gr$hgnc)))
  },
  atpoint = {
    x.gr <- GRanges(seqnames = paste(x_big$chrom, x_big$hgnc, sep="_"),
                    ranges = IRanges(start = x_big$start,
                                     end = x_big$end))
    
    reduced.gr <- GenomicRanges::reduce(x.gr)
    
    splitted <- strsplit(as.character(seqnames(reduced.gr)), split="_")
    
    new.seqnames <- unlist(lapply(splitted, function(x)x[1]))
    
    final <- GRanges(seqnames = new.seqnames,
                     ranges = ranges(reduced.gr))
    elementMetadata(final) <- unlist(lapply(splitted, function(x)x[2]))
    final
  }, unit = "relative"
)

# Unit: relative
#    expr       min        lq      mean    median        uq       max neval
#  zx8754 22.041729 21.922969 20.468926 21.814661 20.985208 13.019504   100
#    mike  1.874596  1.848605  1.823507  1.836776  1.870061  1.587847   100
# atpoint  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   100

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment