public
Last active

  • Download Gist
bench.r
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
library(microbenchmark)
 
# Build an indicator matrix from a list of value vectors.
#
# Each element of `listOfValues` becomes one row and each distinct value found
# anywhere in the list becomes one named column (columns are in sorted order).
# A cell is set to 1 when the row's vector contains the column's value and
# keeps `fill` otherwise.
#
# Args:
#   listOfValues: list of vectors. Columns are addressed by name, so the
#                 values are presumably character -- confirm against callers.
#   fill:         value stored in cells without a match (default NA).
#
# Returns:
#   A matrix with length(listOfValues) rows and one column per distinct value.
#   An empty list yields a 0 x 0 matrix.
charBinaryMat <- function(listOfValues, fill = NA) {
  values <- unlist(listOfValues, use.names = FALSE)
  # Only sort when there is something to sort, so empty input never depends on
  # how sort() treats NULL.
  lev <- if (length(values) > 0L) sort(unique(values)) else character(0)
  m <- matrix(fill, nrow = length(listOfValues), ncol = length(lev))
  if (length(lev) > 0L) {
    colnames(m) <- lev
  }
  # seq_len() rather than 1:nrow(m): with zero rows, 1:0 would iterate over
  # c(1, 0) and index past the end of both the list and the matrix.
  for (i in seq_len(nrow(m))) {
    m[i, listOfValues[[i]]] <- 1
  }
  m
}
 
# Membership test with the arguments swapped relative to `%in%`: the set to
# search comes first and the value(s) to look up second, which makes the
# function convenient to use as the FUN of an apply-style call over a list of
# sets.
#
# Args:
#   vector: the values to search in.
#   value:  the value(s) to look for.
#
# Returns:
#   A logical vector, one element per element of `value`.
reverseIn <- function(vector, value) {
  match(value, vector, nomatch = 0L) > 0L
}
 
# Build a logical membership matrix from space-separated category strings.
#
# Every element of `valueVector` is split on single spaces. Each distinct
# token becomes one column, in order of first appearance; each input string
# becomes one row. A cell is TRUE when the row's string contains the column's
# token. A column for the empty token "" (which arises from leading or doubled
# spaces) is TRUE only for rows whose whole string is "".
#
# Args:
#   valueVector: character vector of space-separated tokens.
#
# Returns:
#   A logical matrix with length(valueVector) rows and one column per distinct
#   token. Column names are the tokens; row names are names(valueVector) when
#   present.
buildCategoryMatrix <- function(valueVector) {
  splitValues <- strsplit(valueVector, " ", fixed = TRUE)
  # First-appearance order over all rows; duplicates of earlier rows cannot
  # introduce new tokens, so this matches scanning unique(valueVector).
  allClasses <- unique(unlist(splitValues, use.names = FALSE))

  # Preallocate the full result instead of growing it with cbind() per class.
  resMatrix <- matrix(FALSE,
                      nrow = length(valueVector),
                      ncol = length(allClasses))
  for (j in seq_along(allClasses)) {
    cls <- allClasses[[j]]
    resMatrix[, j] <- if (cls == "") {
      valueVector == ""
    } else {
      vapply(splitValues, function(tokens) cls %in% tokens, logical(1),
             USE.NAMES = FALSE)
    }
  }
  if (length(allClasses) > 0L) {
    dimnames(resMatrix) <- list(names(valueVector), allClasses)
  }
  resMatrix
}
 
# Benchmark candidate: split each string on single spaces and hand the token
# lists to charBinaryMat(), using 0 for cells without a match.
#
# Args:
#   str: character vector of space-separated tokens.
#
# Returns:
#   Whatever charBinaryMat() returns for the token lists with fill = 0.
CBM <- function(str) {
  tokens <- strsplit(str, " ", fixed = TRUE)
  charBinaryMat(tokens, fill = 0)
}
# Benchmark candidate: build the membership matrix with buildCategoryMatrix()
# and multiply by 1L so the result is numeric 0/1 rather than logical.
#
# Args:
#   str: character vector of space-separated tokens.
#
# Returns:
#   The result of buildCategoryMatrix(str) multiplied by 1L.
BCM <- function(str) {
  membership <- buildCategoryMatrix(str)
  membership * 1L
}
# Benchmark candidate: integer 0/1 membership matrix built with an apply-style
# pass over the split strings.
#
# Each string is split on " "; every distinct token becomes one column, in
# order of first appearance, and every input string becomes one row.
#
# Args:
#   str: character vector of space-separated tokens.
#
# Returns:
#   An integer matrix with length(str) rows and one column per distinct token,
#   with the tokens as column names.
Sapply <- function(str) {
  tokens <- strsplit(str, " ")
  y <- unique(unlist(tokens, use.names = FALSE))
  hits <- vapply(tokens, function(x) y %in% x, logical(length(y)),
                 USE.NAMES = FALSE)
  # vapply() returns a plain vector when there is exactly one distinct token,
  # so the shape is stated explicitly instead of relying on t(). The flags are
  # laid out one input string after another, hence byrow = TRUE.
  out <- matrix(as.integer(hits),
                nrow = length(str),
                ncol = length(y),
                byrow = TRUE)
  if (length(y) > 0L) {
    colnames(out) <- y
  }
  out
}
 
# Benchmark candidate: numeric 0/1 membership matrix built with vapply().
#
# Each string is split on single spaces; the sorted distinct tokens define the
# columns and every input string becomes one row. The result carries no column
# names.
#
# Args:
#   x: character vector of space-separated tokens.
#
# Returns:
#   A numeric matrix with length(x) rows and one column per distinct token.
had <- function(x) {
  lines <- strsplit(x, " ", fixed = TRUE)
  lev <- sort(unique(unlist(lines)))
  # The logical result of %in% is promoted to double by the numeric FUN.VALUE.
  hits <- vapply(lines, function(tokens) lev %in% tokens, numeric(length(lev)))
  # With exactly one distinct token vapply() returns a plain vector; restore
  # the tokens-by-strings shape so t() yields one row per input string.
  if (is.null(dim(hits))) {
    hits <- matrix(hits, nrow = length(lev),
                   dimnames = list(NULL, names(hits)))
  }
  t(hits)
}
 
 
# Reproducible benchmark input: 1000 strings, each holding between 1 and 10
# distinct letters from A-J in random order, separated by single spaces.
set.seed(1)
A <- sample(10, 1000, replace = TRUE)
str <- vapply(
  A,
  function(k) paste(sample(LETTERS[1:10], k), collapse = " "),
  character(1)
)
head(str)

# Time the four implementations on the same input, 20 runs each.
microbenchmark(CBM(str), BCM(str), Sapply(str), had(str), times = 20)
# Unit: milliseconds
#         expr       min        lq    median        uq       max neval
#     CBM(str)  2.780421  2.901480  2.957426  3.091195  9.443014    20
#     BCM(str) 42.221266 44.834562 45.091333 46.691666 49.844484    20
#  Sapply(str) 18.727119 19.106317 19.271673 20.679960 31.284702    20
#     had(str)  3.139086  3.300337  3.404033  3.452204  5.381122    20

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.