Skip to content

Instantly share code, notes, and snippets.

@mwufi
Created November 12, 2017 20:42
Show Gist options
  • Save mwufi/79b04582629508fd826688ec4822daf7 to your computer and use it in GitHub Desktop.
Save mwufi/79b04582629508fd826688ec4822daf7 to your computer and use it in GitHub Desktop.
Create vocabulary and document-term matrix using text2vec
#' Build a pruned vocabulary and a document-term matrix with text2vec
#'
#' Relies on data.table (`setDT`, `setkey`), text2vec, stringr and the
#' magrittr pipe being attached by the calling script.
#'
#' @param movie_review A data.frame, data.table or named list holding at
#'   least the columns `id` (document identifiers) and `body` (document
#'   text). NOTE: `setDT()` and `setkey()` operate by reference, so the
#'   object handed in may itself be converted to a data.table and re-sorted
#'   by `id`.
#' @return A named list with three elements: `dtm` (the document-term
#'   matrix), `vocab` (the pruned vocabulary) and `iterator` (the token
#'   iterator the other two were built from).
get_dtm <- function(movie_review) {
  # Fail fast with a readable message instead of an opaque error from deep
  # inside data.table / text2vec when the input has the wrong shape.
  if (!is.list(movie_review)) {
    stop(
      "`movie_review` must be a data.frame, data.table or named list.",
      call. = FALSE
    )
  }
  missing_cols <- setdiff(c("id", "body"), names(movie_review))
  if (length(missing_cols) > 0) {
    stop(
      "`movie_review` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  setDT(movie_review)
  setkey(movie_review, id)

  # Text normalisation applied to every document before tokenisation.
  prep_fun <- function(x) {
    x %>%
      # make text lower case
      str_to_lower %>%
      # replace every non-letter character (digits included) with a space
      str_replace_all("[^[:alpha:]]", " ") %>%
      # collapse runs of whitespace into a single space
      str_replace_all("\\s+", " ")
  }
  tok_fun <- word_tokenizer

  it_train <- itoken(
    movie_review$body,
    preprocessor = prep_fun,
    tokenizer = tok_fun,
    ids = movie_review$id,
    progressbar = FALSE
  )

  # Time the vocabulary construction (build + pruning).
  ptm <- proc.time()
  vocab <- create_vocabulary(it_train)
  # Keep terms seen at least 10 times and in at most half of the documents.
  vocab <- vocab %>%
    prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.5)
  time <- proc.time() - ptm
  # The timing is printed first and its label second.
  print(time)
  print("Time to create vocabulary")

  # Vectorizer that maps tokens onto the pruned vocabulary.
  vectorizer <- vocab_vectorizer(vocab)

  # Time the document-term matrix construction.
  ptm <- proc.time()
  dtm_train <- create_dtm(it_train, vectorizer)
  time <- proc.time() - ptm
  print(time)
  print("Time to create document-term matrix")

  list(dtm = dtm_train, vocab = vocab, iterator = it_train)
}
# user system elapsed
# 50.480 0.248 50.745
# [1] "Time to create vocabulary"
# user system elapsed
# 53.244 0.340 53.589
# [1] "Time to create document-term matrix"
# Run the pipeline on `t` (defined outside this snippet) and unpack the
# returned list into separate objects by element name.
stuff <- get_dtm(t)
dtm <- stuff[["dtm"]]
vocab <- stuff[["vocab"]]
it <- stuff[["iterator"]]
# Size of the resulting document-term matrix.
dim(dtm)
# [1] 92335 21361
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment