Skip to content

Instantly share code, notes, and snippets.

@azza-bazoo
Created April 25, 2014 04:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save azza-bazoo/11277870 to your computer and use it in GitHub Desktop.
Save azza-bazoo/11277870 to your computer and use it in GitHub Desktop.
Faster string splitting in R - based on https://gist.github.com/jeroenjanssens/11275916

> summaryRprof("Rprof.out")

$by.self
            self.time self.pct total.time total.pct
"substring"    232.76    99.87     232.76     99.87
"<GC>"           0.08     0.03       0.08      0.03
"gregexpr"       0.06     0.03       0.06      0.03
"sort.list"      0.06     0.03       0.06      0.03
"readLines"      0.04     0.02       0.04      0.02
"table"          0.02     0.01       0.08      0.03
"cat"            0.02     0.01       0.04      0.02
"paste"          0.02     0.01       0.02      0.01

$by.total
                      total.time total.pct self.time self.pct
"<Anonymous>"             232.80     99.89      0.00     0.00
"mapply"                  232.80     99.89      0.00     0.00
"regmatches"              232.78     99.88      0.00     0.00
"substring"               232.76     99.87    232.76    99.87
"Map"                     232.76     99.87      0.00     0.00
"as.data.frame"             0.10      0.04      0.00     0.00
"<GC>"                      0.08      0.03      0.08     0.03
"table"                     0.08      0.03      0.02     0.01
"factor"                    0.08      0.03      0.00     0.00
"gregexpr"                  0.06      0.03      0.06     0.03
"sort.list"                 0.06      0.03      0.06     0.03
"readLines"                 0.04      0.02      0.04     0.02
"cat"                       0.04      0.02      0.02     0.01
"paste"                     0.02      0.01      0.02     0.01
"[[.factor"                 0.02      0.01      0.00     0.00
"[["                        0.02      0.01      0.00     0.00
"as.data.frame.table"       0.02      0.01      0.00     0.00
"data.frame"                0.02      0.01      0.00     0.00
"do.call"                   0.02      0.01      0.00     0.00
"eval"                      0.02      0.01      0.00     0.00
"expand.grid"               0.02      0.01      0.00     0.00
"sprintf"                   0.02      0.01      0.00     0.00
"tolower"                   0.02      0.01      0.00     0.00

$sample.interval
[1] 0.02

$sampling.time
[1] 233.06

> summaryRprof("Rprof.out")

$by.self
                      self.time self.pct total.time total.pct
"strsplit"                 0.12    40.00       0.12     40.00
"readLines"                0.04    13.33       0.04     13.33
"sort.list"                0.04    13.33       0.04     13.33
"as.character"             0.02     6.67       0.04     13.33
"as.character.factor"      0.02     6.67       0.02      6.67
"expand.grid"              0.02     6.67       0.02      6.67
"tolower"                  0.02     6.67       0.02      6.67
"unique.default"           0.02     6.67       0.02      6.67

$by.total
                      total.time total.pct self.time self.pct
"strsplit"                  0.12     40.00      0.12    40.00
"as.data.frame"             0.08     26.67      0.00     0.00
"factor"                    0.06     20.00      0.00     0.00
"table"                     0.06     20.00      0.00     0.00
"readLines"                 0.04     13.33      0.04    13.33
"sort.list"                 0.04     13.33      0.04    13.33
"as.character"              0.04     13.33      0.02     6.67
"<Anonymous>"               0.04     13.33      0.00     0.00
"cat"                       0.04     13.33      0.00     0.00
"mapply"                    0.04     13.33      0.00     0.00
"sprintf"                   0.04     13.33      0.00     0.00
"as.character.factor"       0.02      6.67      0.02     6.67
"expand.grid"               0.02      6.67      0.02     6.67
"tolower"                   0.02      6.67      0.02     6.67
"unique.default"            0.02      6.67      0.02     6.67
"as.data.frame.table"       0.02      6.67      0.00     0.00
"data.frame"                0.02      6.67      0.00     0.00
"do.call"                   0.02      6.67      0.00     0.00
"eval"                      0.02      6.67      0.00     0.00
"unique"                    0.02      6.67      0.00     0.00

$sample.interval
[1] 0.02

$sampling.time
[1] 0.3
#!/usr/bin/env Rscript
# Print the most frequent words read from standard input.
#
# Usage: <script> [num.words] < input.txt
#   num.words  optional; how many of the most frequent words to print.
#              When omitted, every distinct word is printed.
#
# Each output line is the count (right-aligned in 8 columns), a space, and
# the lower-cased word. Execution is profiled into "Rprof.out".

# Count word frequencies in a character vector of text lines.
#
# Lines are joined with spaces and lower-cased, then split on runs of
# non-word characters.
#
# Args:
#   input.lines: character vector, one element per input line (may be empty).
#
# Returns:
#   A data frame with columns `word` (character) and `freq` (integer),
#   ordered by decreasing frequency; ties keep the alphabetical order
#   produced by table().
count.words <- function(input.lines) {
  full.text <- tolower(paste(input.lines, collapse = " "))
  # strsplit() is used instead of gregexpr() + regmatches(): profiling showed
  # regmatches() spending almost all of its time in substring().
  # http://www.dummies.com/how-to/content/how-to-split-strings-in-r.html
  words.all <- strsplit(full.text, "\\W+")[[1L]]
  # A separator at the very start of the text makes strsplit() emit a leading
  # "" token; drop it so the empty string is not counted as a word.
  words.all <- words.all[nzchar(words.all)]
  counts <- table(words.all)
  # Columns are named explicitly so no later lookup relies on `$` partial
  # matching. as.character() covers the empty table, whose names are NULL.
  words.unique <- data.frame(
    word = as.character(names(counts)),
    freq = as.vector(counts),
    stringsAsFactors = FALSE
  )
  words.unique[order(-words.unique$freq), ]
}

Rprof(filename = "Rprof.out", append = FALSE, interval = 0.02)

# Validate the argument before reading any input so mistakes fail fast.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) > 1L) {
  stop("expected at most one argument: the number of words to print",
       call. = FALSE)
}
# NA means "no limit requested".
num.words <- NA_integer_
if (length(args) == 1L) {
  # Coercion failure is reported below with a clearer message than the
  # default "NAs introduced by coercion" warning.
  num.words <- suppressWarnings(as.integer(args))
  if (is.na(num.words)) {
    stop(sprintf("invalid number of words: '%s'", args), call. = FALSE)
  }
}

f <- file("stdin")
input.lines <- readLines(f)
close(f)

words.sorted <- count.words(input.lines)
words.top <- if (is.na(num.words)) {
  words.sorted
} else {
  head(words.sorted, num.words)
}
# sprintf() is vectorised over rows; sep = "" keeps cat() from inserting a
# space between the already newline-terminated lines.
cat(sprintf("%8d %s\n", words.top$freq, words.top$word), sep = "")

# Stop the profiler explicitly so "Rprof.out" is complete.
Rprof(NULL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment