Last active
August 29, 2015 14:26
-
-
Save markvanderloo/9ae6a15f7d74a0159aec to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
install.packages("hashr") | |
drat::addRepo("markvanderloo") | |
install.packages("stringdist") | |
library(stringdist) | |
library(quanteda) | |
library(stringr) | |
#' Tabulate word-level q-grams for one or more texts
#'
#' Every text is split into words, the words are hashed to integers with
#' `hashr::hash()`, and the hashed sequences are handed to
#' `stringdist::seq_qgrams()`. The hashes in the resulting table are then
#' translated back into the words they came from.
#'
#' @param ... Texts to tokenize, optionally named. Only the first string of
#'   each argument is split (see the `[[1]]` below).
#' @param .list Optional list of further texts, appended after `...`.
#' @param q Number of consecutive words that make up one q-gram.
#' @param split Regular expression passed to `stringr::str_split()` as the
#'   word separator.
#' @return An unnamed list of length two:
#'   1. a character matrix with columns `g1`, ..., `g<q>` holding the words of
#'      each q-gram, one q-gram per row;
#'   2. the remaining columns of the `seq_qgrams()` result (per its
#'      documentation, the q-gram counts per input), with columns labelled by
#'      argument name, or `X<i>` for the i-th argument when it is unnamed.
wordgrams <- function(..., .list = NULL, q = 1L, split = "[[:blank:]]+") {
  texts <- c(list(...), .list)
  if (length(texts) == 0L) {
    stop("wordgrams() needs at least one text", call. = FALSE)
  }
  words <- lapply(
    texts,
    function(x) stringr::str_split(x, pattern = split)[[1]]
  )

  # The inevitable metadata: one label per text, falling back to X<i> for
  # arguments that were passed without a name.
  labels <- names(texts)
  generic <- paste0("X", seq_along(texts))
  if (is.null(labels)) {
    labels <- generic
  } else {
    unnamed <- labels == ""
    labels[unnamed] <- generic[unnamed]
  }

  # Back to the real work: hash all words to integers.
  hashed <- hashr::hash(words, what = "pointer")

  # Tokenize the integer sequences with stringdist::seq_qgrams.
  profile <- stringdist::seq_qgrams(.list = hashed, q = q)
  gram_cols <- seq_len(q)

  # Lookup table: hash values, named by the word that produced them.
  lookup <- unlist(lapply(
    seq_along(words),
    function(i) setNames(hashed[[i]], words[[i]])
  ))

  # Translate the first q columns (hashes) back to words. match() yields
  # column-major indices, which matrix() refills in the same order.
  idx <- apply(profile[, gram_cols, drop = FALSE], 2, match, lookup)
  grams <- matrix(
    names(lookup)[idx],
    ncol = q,
    dimnames = list(NULL, paste0("g", gram_cols))
  )

  counts <- profile[, -gram_cols, drop = FALSE]
  # Attach the labels computed above (the original computed them but never
  # used them). Guarded so an unexpected column count is left as returned.
  if (ncol(counts) == length(labels)) {
    colnames(counts) <- labels
  }

  list(grams, counts)
}
# Smoke test: two sentences built from the same words in a different order.
s1 <- "mary had a little lamb"
s2 <- "a little lamb had mary"
wordgrams(foo = s1, s2, q = 2)

# Benchmark corpus: the test set shipped with quanteda (`inaugTexts`).
inaug <- as.list(inaugTexts)
microbenchmark::microbenchmark(
  stringdist = wordgrams(.list = inaug, q = 2),
  quanteda = tokenize(inaugTexts, ngrams = 2),
  times = 25
)

# Profile a single bigram run over the corpus.
Rprof()
stringdist <- wordgrams(.list = inaug, q = 2)
Rprof(NULL)
summaryRprof()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment