Skip to content

Instantly share code, notes, and snippets.

@markvanderloo
Last active August 29, 2015 14:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markvanderloo/9ae6a15f7d74a0159aec to your computer and use it in GitHub Desktop.
Save markvanderloo/9ae6a15f7d74a0159aec to your computer and use it in GitHub Desktop.
install.packages("hashr")
drat::addRepo("markvanderloo")
install.packages("stringdist")
library(stringdist)
library(quanteda)
library(stringr)
#' Tokenize texts into word-level q-grams
#'
#' Splits each text into words, hashes the words to integers with
#' `hashr::hash()`, tabulates q-grams over those integer sequences with
#' `stringdist::seq_qgrams()`, and finally maps the hashed q-grams back to
#' the words they were built from.
#'
#' @param ... Texts to tokenize. Only the first element of each argument is
#'   split into words.
#' @param .list Optional list of further texts, appended after `...`.
#' @param q Number of consecutive words per gram; passed on unchanged to
#'   `stringdist::seq_qgrams()`.
#' @param split Regular expression separating words; passed as `pattern` to
#'   `stringr::str_split()`. The default splits on runs of blanks.
#' @return An unnamed list of length two: (1) a character matrix with columns
#'   `g1`, ..., `g<q>` holding the words of each q-gram, one q-gram per row;
#'   (2) the remaining columns of the `seq_qgrams()` result (presumably the
#'   per-text q-gram counts -- confirm against the stringdist documentation).
wordgrams <- function(..., .list = NULL, q = 1L, split = "[[:blank:]]+") {
  texts <- c(list(...), .list)
  words <- lapply(
    texts,
    function(x) stringr::str_split(x, pattern = split)[[1]]
  )

  # Hash all words to integers so that q-grams can be computed on integer
  # sequences rather than on strings.
  hashes <- hashr::hash(words, what = "pointer")

  # Tokenize with seq_qgrams. Calling it through its namespace means the
  # function also works when stringdist is installed but not attached.
  qtab <- stringdist::seq_qgrams(.list = hashes, q = q)

  # Lookup table: hash values, named by the word each one was computed from.
  lookup <- unlist(
    lapply(seq_along(words), function(i) setNames(hashes[[i]], words[[i]]))
  )

  # The first q columns of qtab hold the hashed q-grams; translate every hash
  # back to its word and store the result as a character matrix.
  gram_cols <- seq_len(q)
  word_idx <- apply(qtab[, gram_cols, drop = FALSE], 2, match, lookup)
  gram_words <- matrix(
    names(lookup)[word_idx],
    ncol = q,
    dimnames = list(NULL, paste0("g", gram_cols))
  )

  list(gram_words, qtab[, -gram_cols, drop = FALSE])
}
# Smoke test: run the tokenizer on two sentences that contain the same words
# in a different order (one named argument, one unnamed).
s1 <- "mary had a little lamb"
s2 <- "a little lamb had mary"
wordgrams(foo = s1, s2, q = 2)

# Benchmark on the test set from quanteda: word bigrams via wordgrams()
# versus quanteda's tokenize(), 25 timed runs each.
inaug <- as.list(inaugTexts)
microbenchmark::microbenchmark(
  stringdist = wordgrams(.list = inaug, q = 2),
  quanteda = tokenize(inaugTexts, ngrams = 2),
  times = 25
)

# Profile one wordgrams() run on the same data and print the summary.
Rprof()
stringdist <- wordgrams(.list = inaug, q = 2)
Rprof(NULL)
summaryRprof()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment