primaryobjects/classifytext.R

## classifytext.R
library(caret)
library(tm)

# Training data: one short sentence per document.
data <- c("Cats like to chase mice.", "Dogs like to eat big bones.")
corpus <- VCorpus(VectorSource(data))

# Preprocessing shared by the training and the test matrix, so both are
# built with exactly the same options. Keep the entries in this order: tm
# applies these options in the order they are listed (see ?termFreq).
dtm_control <- list(
  removePunctuation = TRUE,
  stopwords = TRUE,
  stemming = TRUE,
  removeNumbers = TRUE
)

# Create a document term matrix (one row per document, one column per term).
tdm <- DocumentTermMatrix(corpus, control = dtm_control)

# Convert to a data.frame for training and assign a classification (factor)
# to each document: 0 for the first sentence, 1 for the second.
train <- as.data.frame(as.matrix(tdm))
train$y <- factor(c(0, 1))

# Train. The data frame named `train` does not hide caret's train(): R looks
# for a function when a name is used in call position.
# Resampling is switched off because the training set has only two rows, so
# there is nothing meaningful to hold out; the model is fit once on all rows.
fit <- train(
  y ~ .,
  data = train,
  method = "bayesglm",
  trControl = trainControl(method = "none")
)

# Predict the training documents back. This prints the predicted labels only;
# compare them with train$y to judge the fit.
predict(fit, newdata = train)

# Test data.
data2 <- c("Bats eat bugs.")
corpus <- VCorpus(VectorSource(data2))

# Restrict the test matrix to the training vocabulary so its columns match
# the predictors the model was fit on. Terms(tdm) is evaluated on the
# training matrix before `tdm` is overwritten by this assignment.
tdm <- DocumentTermMatrix(
  corpus,
  control = c(dtm_control, list(dictionary = Terms(tdm)))
)
test <- as.matrix(tdm)

# Predict the class of the unseen document.
predict(fit, newdata = test)

## results.txt
> data
[1] "Cats like to chase mice."    "Dogs like to eat big bones."
> train
  big bone cat chase dog eat like mice y
1   0    0   1     1   0   0    1    1 0
2   1    1   0     0   1   1    1    0 1
> predict(fit, newdata = train)
[1] 0 1
> data2
[1] "Bats eat bugs."
> test
  big bone cat chase dog eat like mice
1   0    0   0     0   0   1    0    0
> predict(fit, newdata = test)
[1] 1
>
	library(caret)
	library(tm)

	# Training data: one short sentence per document.
	data <- c("Cats like to chase mice.", "Dogs like to eat big bones.")
	corpus <- VCorpus(VectorSource(data))

	# Preprocessing shared by the training and the test matrix, so both are
	# built with exactly the same options. Keep the entries in this order: tm
	# applies these options in the order they are listed (see ?termFreq).
	dtm_control <- list(
	  removePunctuation = TRUE,
	  stopwords = TRUE,
	  stemming = TRUE,
	  removeNumbers = TRUE
	)

	# Create a document term matrix (one row per document, one column per term).
	tdm <- DocumentTermMatrix(corpus, control = dtm_control)

	# Convert to a data.frame for training and assign a classification (factor)
	# to each document: 0 for the first sentence, 1 for the second.
	train <- as.data.frame(as.matrix(tdm))
	train$y <- factor(c(0, 1))

	# Train. The data frame named `train` does not hide caret's train(): R looks
	# for a function when a name is used in call position.
	# Resampling is switched off because the training set has only two rows, so
	# there is nothing meaningful to hold out; the model is fit once on all rows.
	fit <- train(
	  y ~ .,
	  data = train,
	  method = "bayesglm",
	  trControl = trainControl(method = "none")
	)

	# Predict the training documents back. This prints the predicted labels only;
	# compare them with train$y to judge the fit.
	predict(fit, newdata = train)

	# Test data.
	data2 <- c("Bats eat bugs.")
	corpus <- VCorpus(VectorSource(data2))

	# Restrict the test matrix to the training vocabulary so its columns match
	# the predictors the model was fit on. Terms(tdm) is evaluated on the
	# training matrix before `tdm` is overwritten by this assignment.
	tdm <- DocumentTermMatrix(
	  corpus,
	  control = c(dtm_control, list(dictionary = Terms(tdm)))
	)
	test <- as.matrix(tdm)

	# Predict the class of the unseen document.
	predict(fit, newdata = test)
	> data
	[1] "Cats like to chase mice." "Dogs like to eat big bones."
	> train
	big bone cat chase dog eat like mice y
	1 0 0 1 1 0 0 1 1 0
	2 1 1 0 0 1 1 1 0 1
	> predict(fit, newdata = train)
	[1] 0 1
	> data2
	[1] "Bats eat bugs."
	> test
	big bone cat chase dog eat like mice
	1 0 0 0 0 0 1 0 0
	> predict(fit, newdata = test)
	[1] 1
	>