Skip to content

Instantly share code, notes, and snippets.

@hadley
Last active December 26, 2015 14:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hadley/7169138 to your computer and use it in GitHub Desktop.
Save hadley/7169138 to your computer and use it in GitHub Desktop.
library(microbenchmark)
charBinaryMat <- function(listOfValues, fill = NA) {
lev <- sort(unique(unlist(listOfValues, use.names = FALSE)))
m <- matrix(fill, nrow = length(listOfValues), ncol = length(lev))
colnames(m) <- lev
for (i in 1:nrow(m)) {
m[i, listOfValues[[i]]] <- 1
}
m
}
reverseIn <- function(vector, value) {
return(value %in% vector)
}
buildCategoryMatrix <- function(valueVector) {
allClasses <- c()
for(classVec in unique(valueVector)) {
allClasses <- unique(c(allClasses,
strsplit(classVec, " ", fixed=TRUE)[[1]]))
}
resMatrix <- matrix(ncol=0, nrow=length(valueVector))
splitValues <- strsplit(valueVector, " ", fixed=TRUE)
for(cat in allClasses) {
if(cat=="") {
catIsPart <- (valueVector == "")
} else {
catIsPart <- sapply(splitValues, reverseIn, cat)
}
resMatrix <- cbind(resMatrix, catIsPart)
}
colnames(resMatrix) <- allClasses
return(resMatrix)
}
CBM <- function(str) {
charBinaryMat(strsplit(str, " ", fixed=TRUE), fill = 0)
}
BCM <- function(str) {
buildCategoryMatrix(str)*1L
}
Sapply <- function(str) {
y <- unique( unlist( strsplit( str , " " ) ) )
out <- t(sapply(str, function(x) y %in% unlist(strsplit(x , " " )),
USE.NAMES = FALSE )) * 1L
colnames(out) <- y
out
}
had <- function(x) {
lines <- strsplit(x, " ", fixed = TRUE)
all <- sort(unique(unlist(lines)))
t(vapply(lines, "%in%", x = all, numeric(length(all))))
}
set.seed(1)
A = sample(10, 1000, replace = TRUE)
str <- sapply(seq_along(A), function(x)
paste(sample(LETTERS[1:10], A[x]), collapse = " "))
head(str)
microbenchmark(CBM(str), BCM(str), Sapply(str), had(str), times=20)
# Unit: milliseconds
# expr min lq median uq max neval
# CBM(str) 2.780421 2.901480 2.957426 3.091195 9.443014 20
# BCM(str) 42.221266 44.834562 45.091333 46.691666 49.844484 20
# Sapply(str) 18.727119 19.106317 19.271673 20.679960 31.284702 20
# had(str) 3.139086 3.300337 3.404033 3.452204 5.381122 20
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment