# mwufi/test.r

## test.r

# Build a document-term matrix from a table of documents.
#
# `movie_review` is passed through setDT() and setkey(., id), then its `body`
# column is tokenized (with `id` supplying the document ids), a vocabulary is
# created and pruned, and the documents are vectorized against it. The time
# taken by the vocabulary step and by the matrix step is printed, each
# followed by a line naming the step.
#
# Args:
#   movie_review: table-like object with `id` and `body` columns.
#
# Returns:
#   A list with elements `dtm` (result of create_dtm()), `vocab` (the pruned
#   vocabulary) and `iterator` (the itoken() iterator used for both steps).
get_dtm <- function(movie_review) {
  setDT(movie_review)
  setkey(movie_review, id)

  # Print the time elapsed since `started`, then the label describing it.
  report_elapsed <- function(started, label) {
    print(proc.time() - started)
    print(label)
  }

  # Clean-up applied to the text before tokenization.
  clean_text <- function(x) {
    lowered <- str_to_lower(x)
    # every character that is not a letter (digits included) becomes a space
    letters_only <- str_replace_all(lowered, "[^[:alpha:]]", " ")
    # squeeze each run of whitespace down to a single space
    str_replace_all(letters_only, "\\s+", " ")
  }

  token_iter <- itoken(
    movie_review$body,
    preprocessor = clean_text,
    tokenizer = word_tokenizer,
    ids = movie_review$id,
    progressbar = FALSE
  )

  # Vocabulary step (timed).
  vocab_started <- proc.time()
  pruned_vocab <- prune_vocabulary(
    create_vocabulary(token_iter),
    term_count_min = 10,
    doc_proportion_max = 0.5
  )
  report_elapsed(vocab_started, "Time to create vocabulary")

  # Vectorizer built from the pruned vocabulary.
  term_vectorizer <- vocab_vectorizer(pruned_vocab)

  # Document-term matrix step (timed); reuses the same token iterator.
  dtm_started <- proc.time()
  doc_term_matrix <- create_dtm(token_iter, term_vectorizer)
  report_elapsed(dtm_started, "Time to create document-term matrix")

  list(dtm = doc_term_matrix, vocab = pruned_vocab, iterator = token_iter)
}

#   user  system elapsed
#  50.480   0.248  50.745
#  [1] "Time to create vocabulary"
#   user  system elapsed
#  53.244   0.340  53.589
# [1] "Time to create document-term matrix"

# Run the pipeline on `t` and unpack the returned list into top-level
# variables.
# NOTE(review): `t` is not defined in the visible part of this file; confirm
# it is the review table and not base::t().
stuff <- get_dtm(t)
dtm <- stuff[["dtm"]]
vocab <- stuff[["vocab"]]
it <- stuff[["iterator"]]

# Dimensions of the resulting document-term matrix.
dim(dtm)
# [1] 92335 21361

	# Build a document-term matrix from a table of documents.
	#
	# `movie_review` is passed through setDT() and setkey(., id), then its `body`
	# column is tokenized (with `id` supplying the document ids), a vocabulary is
	# created and pruned, and the documents are vectorized against it. The time
	# taken by the vocabulary step and by the matrix step is printed, each
	# followed by a line naming the step.
	#
	# Args:
	#   movie_review: table-like object with `id` and `body` columns.
	#
	# Returns:
	#   A list with elements `dtm` (result of create_dtm()), `vocab` (the pruned
	#   vocabulary) and `iterator` (the itoken() iterator used for both steps).
	get_dtm <- function(movie_review) {
	  setDT(movie_review)
	  setkey(movie_review, id)

	  # Print the time elapsed since `started`, then the label describing it.
	  report_elapsed <- function(started, label) {
	    print(proc.time() - started)
	    print(label)
	  }

	  # Clean-up applied to the text before tokenization.
	  clean_text <- function(x) {
	    lowered <- str_to_lower(x)
	    # every character that is not a letter (digits included) becomes a space
	    letters_only <- str_replace_all(lowered, "[^[:alpha:]]", " ")
	    # squeeze each run of whitespace down to a single space
	    str_replace_all(letters_only, "\\s+", " ")
	  }

	  token_iter <- itoken(
	    movie_review$body,
	    preprocessor = clean_text,
	    tokenizer = word_tokenizer,
	    ids = movie_review$id,
	    progressbar = FALSE
	  )

	  # Vocabulary step (timed).
	  vocab_started <- proc.time()
	  pruned_vocab <- prune_vocabulary(
	    create_vocabulary(token_iter),
	    term_count_min = 10,
	    doc_proportion_max = 0.5
	  )
	  report_elapsed(vocab_started, "Time to create vocabulary")

	  # Vectorizer built from the pruned vocabulary.
	  term_vectorizer <- vocab_vectorizer(pruned_vocab)

	  # Document-term matrix step (timed); reuses the same token iterator.
	  dtm_started <- proc.time()
	  doc_term_matrix <- create_dtm(token_iter, term_vectorizer)
	  report_elapsed(dtm_started, "Time to create document-term matrix")

	  list(dtm = doc_term_matrix, vocab = pruned_vocab, iterator = token_iter)
	}

	# user system elapsed
	# 50.480 0.248 50.745
	# [1] "Time to create vocabulary"
	# user system elapsed
	# 53.244 0.340 53.589
	# [1] "Time to create document-term matrix"

	# Run the pipeline on `t` and unpack the returned list into top-level
	# variables.
	# NOTE(review): `t` is not defined in the visible part of this file; confirm
	# it is the review table and not base::t().
	stuff <- get_dtm(t)
	dtm <- stuff[["dtm"]]
	vocab <- stuff[["vocab"]]
	it <- stuff[["iterator"]]

	# Dimensions of the resulting document-term matrix.
	dim(dtm)
	# [1] 92335 21361