Skip to content

Instantly share code, notes, and snippets.

@amir-rahnama
Last active July 11, 2016 09:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amir-rahnama/d8fc4fbd071c7235fa858d4146ec96c9 to your computer and use it in GitHub Desktop.
Save amir-rahnama/d8fc4fbd071c7235fa858d4146ec96c9 to your computer and use it in GitHub Desktop.
library(text2vec)
library(SnowballC)
library(doParallel)
library(microbenchmark)
library(tm)
# Directory that holds the three en_US corpus files.
corpus_dir <- "/Users/ara/dev/personal/r/final/en_US"

# Read every line of one corpus file.
#
# Args:
#   path: path to a text file.
# Returns:
#   A character vector with one element per line, marked as UTF-8.
read_corpus <- function(path) {
  con <- file(path, "r")
  # Close the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  # All three files are read with the same encoding mark (previously only the
  # blogs file was), because they are all decoded as UTF-8 afterwards.
  # skipNul = TRUE keeps a stray embedded NUL from truncating a line.
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus(file.path(corpus_dir, "en_US.blogs.txt"))
news <- read_corpus(file.path(corpus_dir, "en_US.news.txt"))
twitter <- read_corpus(file.path(corpus_dir, "en_US.twitter.txt"))
# Convert UTF-8 text to plain ASCII.
#
# Args:
#   text: character vector, treated as UTF-8 encoded.
# Returns:
#   A character vector of the same length as `text` in which every byte that
#   has no ASCII representation has been removed.
decode <- function(text) {
  # sub = "" drops only the non-convertible bytes. Without it iconv() returns
  # NA for any element that contains a non-ASCII character, which silently
  # discards the whole line instead of just the offending characters.
  iconv(text, from = "UTF-8", to = "ASCII", sub = "")
}
# Pool the three sources into one corpus with one document per input line.
# c() is used rather than paste(): paste() glues the i-th blog, news and tweet
# lines into a single string and recycles the shorter vectors, so unrelated
# documents get merged and lines from the shorter sources get duplicated.
data <- decode(c(blogs, news, twitter))
# The raw per-source vectors are no longer needed; drop them to free memory.
rm(blogs, news, twitter)
# Timestamp taken before the tokenization / vectorization steps that follow.
start <- Sys.time()
# Normalize documents for text mining.
#
# Applies, in this order: removeNumbers, removePunctuation, stripWhitespace
# and stemDocument.
#
# Args:
#   docs: the documents to clean; handed unchanged to removeNumbers().
# Returns:
#   The result of stemDocument() applied to the cleaned documents.
clean <- function(docs) {
  without_digits <- removeNumbers(docs)
  without_punct <- removePunctuation(without_digits)
  # Whitespace is normalized after the removals and before stemming.
  normalized <- stripWhitespace(without_punct)
  stemDocument(normalized)
}
# Tokenize text and reduce every token to its stem.
#
# Args:
#   x: input handed unchanged to `tokenizer`.
#   tokenizer: function that splits `x` into tokens; expected to return a list
#     of token vectors. Defaults to word_tokenizer.
# Returns:
#   A list with one element per element returned by `tokenizer`, each being
#   the result of wordStem() on that element with language "en".
stem_tokenizer <- function(x, tokenizer = word_tokenizer) {
  token_sets <- tokenizer(x)
  # Stem each set of tokens with the English stemmer.
  lapply(token_sets, wordStem, "en")
}
# Number of parallel workers; also the number of chunks the corpus is split
# into below.
N_WORKERS <- 3
# Register the parallel backend. The core count is taken from N_WORKERS so the
# two values cannot drift apart (it was previously repeated as a literal 3).
registerDoParallel(cores = N_WORKERS)
# Serial equivalent of the tokenization step, kept for reference:
# tokens <- data %>%
#   tolower %>%
#   stem_tokenizer
splits <- split_into(data, N_WORKERS)
# One token iterator per chunk: lower-case the text, then tokenize and stem.
# stem_tokenizer is used instead of plain word_tokenizer because the stop-word
# list below is stemmed; a raw token such as "ourselves" would otherwise never
# equal its stemmed stop-word form and would slip through the filter.
jobs <- lapply(splits, itoken, tolower, stem_tokenizer)
stopwords <- c(
  "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
  "you", "your", "yours"
) %>%
  # Stop words are stemmed because stop-word filtering is performed after
  # tokenization, i.e. on tokens that have already been stemmed.
  wordStem("en")
# NOTE(review): this step was originally labelled "one gram", yet the range
# below requests every n-gram from 1 up to 20 words. Confirm which is
# intended; if only unigrams are wanted, ngram_max should be 1L.
vocab_parallel <- create_vocabulary(
  jobs,
  ngram = c(ngram_min = 1L, ngram_max = 20L),
  stopwords = stopwords
)
# Vectorizer backed by the vocabulary built above.
v_vectorizer <- vocab_vectorizer(vocab_parallel, grow_dtm = TRUE)
# Document-term matrix built from the same per-chunk token iterators.
vocab_dtm_parallel <- create_dtm(jobs, vectorizer = v_vectorizer)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment