Skip to content

Instantly share code, notes, and snippets.

@stufield
Last active August 27, 2019 21:03
Show Gist options
  • Save stufield/b6db4f3588762cc8c98e345f2b13e29f to your computer and use it in GitHub Desktop.
Save stufield/b6db4f3588762cc8c98e345f2b13e29f to your computer and use it in GitHub Desktop.
library(Rcpp)
library(purrr)
library(stringr)
library(bench)

# Helper function to both functions
# Extracts the SeqId portion of a string & replaces the "." -> "-"
getSeqId <- function(x) {
  x <- stringr::str_trim(x) # zap whitespace
  match_mat <- stringr::str_locate(x, "[0-9]{4,5}[-.][0-9]{1,3}([._][0-9]{1,3})?$")
  args <- list(
    string = x,
    start = match_mat[, "start"],
    end = match_mat[, "end"]
  )
  purrr::pmap_chr(args, stringr::str_sub) %>%
    stringr::str_replace("\\.", "-")
}

# The C++ version of matchSeqIds
Rcpp::cppFunction('
  #include <Rcpp.h>
  #include <unordered_map>
  Rcpp::StringVector
    matchseq_cpp(CharacterVector x,          // string containing SeqId entries //
                 CharacterVector y,          // string containing SeqIds to match x //
                 bool order_by_x = true) {   // logical on ordering //
    Function base_intersect("intersect");    // cannot use sugar; order differs; small speed cost tho //
    Function getSeqId("getSeqId");           // get function from outside //
    CharacterVector x_seqs = getSeqId(x);    // get SeqIds; non-seqs are NA //
    x_seqs = Rcpp::na_omit(x_seqs);          // rm NAs //
    CharacterVector y_seqs = getSeqId(y);    // get SeqIds from y //
    int L = y.size();
    std::unordered_map< std::string, std::string > hashmap; // initialize hashmap //
    for(int i = 0; i < L; i++) {
      hashmap.insert(std::make_pair(y_seqs[i], y[i]));  // populate hashmap //
    }
    CharacterVector ord_seqs(L);                  // initiate vector for intersect //
    if (order_by_x) {
      ord_seqs = base_intersect(x_seqs, y_seqs);  // get SeqId intersection of x & y //
    } else {
      ord_seqs = base_intersect(y_seqs, x_seqs);  // get SeqId intersection of y & x //
    }
    int n = ord_seqs.size();
    Rcpp::StringVector res(n);
    for(int j = 0; j < n; j++) {
      // get the corresponding y values   //
      res[j] = hashmap[ Rcpp::as< std::string >(ord_seqs[j]) ];
    }
    return res;
  }')

# The original matchSeqIds
# A Seqid is of this form: XYZ.1234.56 or XYZ-1234_45
# Match on the SeqId portion; NOT the preceeding ":alpha:" string
matchSeqIds <- function(x, y, order.by.x = TRUE) {
  x_seqIds <- getSeqId(x) %>% purrr::discard(is.na)
  # create lookup table to index SeqIds to their values in 'y'
  y_lookup <- as.list(y) %>% purrr::set_names(getSeqId(y))
  y_seqIds <- names(y_lookup) %>% purrr::discard(is.na) # rm NAs in 'y'
  if (order.by.x) {
    order_seqs <- intersect(x_seqIds, y_seqIds)
  } else {
    order_seqs <- intersect(y_seqIds, x_seqIds)
  }
  if (length(order_seqs) == 0) {
    return(character(0))
  }
  purrr::map_chr(order_seqs, ~ y_lookup[[.x]])
}


# Generate a dummy long string of combined SeqIds
set.seed(101)
n <- 5000
gene <- replicate(n, paste0(sample(LETTERS, 4, replace = TRUE), collapse = ""))
x <- paste0(gene, ".", sample(1:15000, n), ".", sample(1:999, n, replace = TRUE))

# sample 10 from x and modify the GeneID
# so that full matches can't be done
# must match based on SeqId
y <- sample(x, 2500) # almost all of 'x' but still a subset
y <- paste0(sample(LETTERS, n, replace = TRUE), y)

# Benchmark
bnch <- bench::mark(
  matchseq_cpp = matchseq_cpp(x, y),
  matchSeqIds = matchSeqIds(x, y),
  iterations = 100
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.

# Absolute
# Only ~ 2x improvement
# Longer 'y' results in > improvement (more matching to do)
bnch
#> # A tibble: 2 x 10
#>   expression      min   mean median     max `itr/sec` mem_alloc  n_gc n_itr
#>   <bch:expr>   <bch:> <bch:> <bch:> <bch:t>     <dbl> <bch:byt> <dbl> <int>
#> 1 matchseq_cpp 31.5ms   38ms   38ms  46.3ms      26.3    1.23MB    67   100
#> 2 matchSeqIds  66.1ms 75.9ms 75.2ms 116.4ms      13.2    1.47MB   129   100
#> # … with 1 more variable: total_time <bch:tm>

# Relative
summary(bnch, relative = TRUE)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 x 10
#>   expression     min  mean median   max `itr/sec` mem_alloc  n_gc n_itr
#>   <bch:expr>   <dbl> <dbl>  <dbl> <dbl>     <dbl>     <dbl> <dbl> <dbl>
#> 1 matchseq_cpp  1     1      1     1         2.00      1     1        1
#> 2 matchSeqIds   2.10  2.00   1.98  2.52      1         1.20  1.93     1
#> # … with 1 more variable: total_time <dbl>

Created on 2019-08-27 by the reprex package (v0.2.1)

Session info
devtools::session_info()
#> ─ Session info ──────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 3.5.2 (2018-12-20)
#>  os       macOS Mojave 10.14.6        
#>  system   x86_64, darwin15.6.0        
#>  ui       X11                         
#>  language (EN)                        
#>  collate  en_US.UTF-8                 
#>  ctype    en_US.UTF-8                 
#>  tz       America/Denver              
#>  date     2019-08-27                  
#> 
#> ─ Packages ──────────────────────────────────────────────────────────────
#>  package     * version    date       lib source                      
#>  assertthat    0.2.0      2017-04-11 [1] CRAN (R 3.5.0)              
#>  backports     1.1.3      2018-12-14 [1] CRAN (R 3.5.0)              
#>  bench       * 1.0.1.9000 2019-02-01 [1] Github (r-lib/bench@3e5d63f)
#>  callr         3.1.1      2018-12-21 [1] CRAN (R 3.5.0)              
#>  cli           1.1.0      2019-03-19 [1] RSPM (R 3.5.2)              
#>  crayon        1.3.4      2017-09-16 [1] RSPM (R 3.5.2)              
#>  desc          1.2.0      2018-05-01 [1] CRAN (R 3.5.0)              
#>  devtools      2.0.2      2019-04-08 [1] RSPM (R 3.5.2)              
#>  digest        0.6.18     2018-10-10 [1] CRAN (R 3.5.0)              
#>  evaluate      0.13       2019-02-12 [1] CRAN (R 3.5.2)              
#>  fansi         0.4.0      2018-10-05 [1] CRAN (R 3.5.0)              
#>  fs            1.2.6      2018-08-23 [1] CRAN (R 3.5.0)              
#>  glue          1.3.1      2019-03-12 [1] CRAN (R 3.5.2)              
#>  highr         0.7        2018-06-09 [1] CRAN (R 3.5.0)              
#>  htmltools     0.3.6      2017-04-28 [1] CRAN (R 3.5.0)              
#>  knitr         1.22       2019-03-08 [1] CRAN (R 3.5.2)              
#>  magrittr      1.5        2014-11-22 [1] RSPM (R 3.5.2)              
#>  memoise       1.1.0      2017-04-21 [1] CRAN (R 3.5.0)              
#>  pillar        1.3.1      2018-12-15 [1] CRAN (R 3.5.0)              
#>  pkgbuild      1.0.2      2018-10-16 [1] CRAN (R 3.5.0)              
#>  pkgconfig     2.0.2      2018-08-16 [1] CRAN (R 3.5.0)              
#>  pkgload       1.0.2      2018-10-29 [1] CRAN (R 3.5.0)              
#>  prettyunits   1.0.2      2015-07-13 [1] CRAN (R 3.5.0)              
#>  processx      3.2.1      2018-12-05 [1] CRAN (R 3.5.0)              
#>  profmem       0.5.0      2018-01-30 [1] CRAN (R 3.5.0)              
#>  ps            1.3.0      2018-12-21 [1] CRAN (R 3.5.0)              
#>  purrr       * 0.3.2      2019-03-15 [1] RSPM (R 3.5.2)              
#>  R6            2.4.0      2019-02-14 [1] CRAN (R 3.5.2)              
#>  Rcpp        * 1.0.1      2019-03-17 [1] RSPM (R 3.5.2)              
#>  remotes       2.0.2      2018-10-30 [1] CRAN (R 3.5.0)              
#>  rlang         0.3.4      2019-04-07 [1] RSPM (R 3.5.2)              
#>  rmarkdown     1.11       2018-12-08 [1] CRAN (R 3.5.0)              
#>  rprojroot     1.3-2      2018-01-03 [1] CRAN (R 3.5.0)              
#>  sessioninfo   1.1.1      2018-11-05 [1] CRAN (R 3.5.0)              
#>  stringi       1.4.3      2019-03-12 [1] CRAN (R 3.5.2)              
#>  stringr     * 1.4.0      2019-02-10 [1] RSPM (R 3.5.2)              
#>  testthat      2.2.0      2019-07-22 [1] CRAN (R 3.5.2)              
#>  tibble        2.1.1      2019-03-16 [1] RSPM (R 3.5.2)              
#>  usethis       1.5.0      2019-04-07 [1] CRAN (R 3.5.2)              
#>  utf8          1.1.4      2018-05-24 [1] CRAN (R 3.5.0)              
#>  withr         2.1.2      2018-03-15 [1] CRAN (R 3.5.0)              
#>  xfun          0.5        2019-02-20 [1] CRAN (R 3.5.2)              
#>  yaml          2.2.0      2018-07-25 [1] CRAN (R 3.5.0)              
#> 
#> [1] /Users/sfield/r_libs
#> [2] /Library/Frameworks/R.framework/Versions/3.5/Resources/library
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment