"
Basic Text Classifier
- Takes a csv with a text column, and column of labels
- Splits into train and test
- Preprocesses text using tm/bag-of-words, 1/2-order Markov
- Uses SVM and Lasso
@author: Gaurav Sood
"
# Load libs
library(readr)     # read in big data a bit quicker
library(tm)        # preprocess text
library(glmnet)    # lasso
library(SnowballC) # stemming
library(kernlab)   # svm
library(doMC)      # parallel computing
"
Load data
Expected Data Format:
Let label column = labels
Say labels are of two classes: 1, 0
Let text column = text
"
data <- read_csv("data.csv")
data <- as.data.frame(data)
colnames(data) <- make.names(colnames(data))
# Drop rows with no category label
data <- subset(data, !is.na(labels))
"
Depending on the classification task, you may not need the entire training dataset.
Note: You can always plot how much accuracy improves as n increases)
Subsetting may also prove useful for computational reasons for some
If subsetting, stratified sampling not a bad idea if labels v. imbalanced
"
"
Stratified function takes:
data (data),
name of the label column (labels),
either proportion of rows from each label (p) or n per label (n_per_label), and
random seed (seed) for reproducibility
"
stratified <- function(data, labels, n_per_label, p = NA, seed = 314159265){
  dsample <- NULL
  set.seed(seed)
  for(i in levels(as.factor(data[, labels]))){
    dsub <- data[data[, labels] == i, ]
    if(is.na(p)){
      dsub <- dsub[sample(1:nrow(dsub), n_per_label), ]
    } else {
      dsub <- dsub[sample(1:nrow(dsub), ceiling(nrow(dsub)*p)), ]
    }
    dsample <- c(dsample, row.names(dsub))
  }
  dsample
}
# Training data
train_rows <- stratified(data, "labels", n_per_label = 25000)
data_train <- data[train_rows, ]
# Test data (10k is more than enough)
# Match on row names rather than positions: after subsetting above, row names
# no longer correspond to row positions
data_test <- data[!(row.names(data) %in% train_rows), ]
data_test <- data_test[sample(1:nrow(data_test), 10000), ]
# An abstract function to preprocess a text column
preprocess <- function(text_column){
  # Use tm to build a corpus from the text column
  corpus <- Corpus(VectorSource(text_column))
  # all lower case
  corpus <- tm_map(corpus, content_transformer(tolower))
  # remove punctuation
  corpus <- tm_map(corpus, content_transformer(removePunctuation))
  # remove numbers
  corpus <- tm_map(corpus, content_transformer(removeNumbers))
  # remove stopwords
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  # stem the documents
  corpus <- tm_map(corpus, stemDocument)
  # strip whitespace (always at the end)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}
# Preprocess training and test data
train_corpus <- preprocess(data_train$text)
test_corpus <- preprocess(data_test$text)
# Create a document-term matrix for train and test,
# including only bigrams and trigrams
# Bi-/trigram tokenizer function (raise max for longer n-grams)
bitrigramtokeniser <- function(x){
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 3))
}
# Tokenizer dependencies
# On Windows, point rJava at the JRE first, e.g.:
# Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre7/')
library(rJava)
library(RWeka)
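# Quick sanity check of the tokenizer on a toy string (not in the original
# gist): it should return every bigram and trigram of the input,
# e.g. "quick brown" and "the quick brown"
bitrigramtokeniser("the quick brown fox")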
"
Remove remove words <=2
TdIdf weighting
Infrequent (< than 1% of documents) and very frequent (> 80% of documents) terms not included
"
train_dtm <- DocumentTermMatrix(train_corpus,
  control = list(wordLengths = c(2, Inf),
                 tokenize = bitrigramtokeniser,
                 weighting = function(x) weightTfIdf(x, normalize = FALSE),
                 bounds = list(global = c(floor(length(train_corpus)*0.01),
                                          floor(length(train_corpus)*0.8)))))
test_dtm <- DocumentTermMatrix(test_corpus,
  control = list(wordLengths = c(2, Inf),
                 tokenize = bitrigramtokeniser,
                 weighting = function(x) weightTfIdf(x, normalize = FALSE),
                 bounds = list(global = c(floor(length(test_corpus)*0.001),
                                          floor(length(test_corpus)*0.8)))))
# Variable selection
# ~~~~~~~~~~~~~~~~~~~~
"
For dimension reduction.
The function calculates chi-square value for each phrase and keeps phrases with highest chi_square values
Ideally you want to put variable selection as part of cross-validation.
chisqTwo function takes:
document term matrix (dtm),
vector of labels (labels), and
number of n-grams you want to keep (n_out)
"
chisqTwo <- function(dtm, labels, n_out = 2000){
  mat <- as.matrix(dtm)
  cat1 <- colSums(mat[labels == 1, ]) # total times each phrase is used in cat. 1
  cat2 <- colSums(mat[labels == 0, ]) # total times each phrase is used in cat. 2
  n_cat1 <- sum(mat[labels == 1, ]) - cat1 # all other phrase uses in cat. 1
  n_cat2 <- sum(mat[labels == 0, ]) - cat2 # all other phrase uses in cat. 2
  num <- (cat1*n_cat2 - cat2*n_cat1)^2
  den <- (cat1 + cat2)*(cat1 + n_cat1)*(cat2 + n_cat2)*(n_cat1 + n_cat2)
  chisq <- num/den
  # keep the n_out phrases with the highest chi-square values
  chi_order <- chisq[order(chisq, decreasing = TRUE)][1:n_out]
  mat[, colnames(mat) %in% names(chi_order)]
}
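# Toy check (not in the original gist): with four documents and three phrases,
# the two class-separating phrases ("a" and "c") should be kept and the
# uninformative one ("b") dropped
toy <- cbind(a = c(5, 4, 0, 0), b = c(2, 2, 2, 2), c = c(0, 0, 3, 4))
colnames(chisqTwo(toy, labels = c(1, 1, 0, 0), n_out = 2)) # "a" "c"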
"
With high dimensional data, test matrix may not have all the phrases training matrix has.
This function fixes that - so that test matrix has the same columns as training.
testmat takes column names of training matrix (train_mat_cols), and
test matrix (test_mat)
and outputs test_matrix with the same columns as training matrix
"
# Test matrix maker
testmat <- function(train_mat_cols, test_mat){
  # drop phrases that are not in the training matrix
  test_mat <- test_mat[, colnames(test_mat) %in% train_mat_cols]
  # add zero-filled columns for training phrases missing from the test matrix
  miss_names <- train_mat_cols[!(train_mat_cols %in% colnames(test_mat))]
  if(length(miss_names) != 0){
    colClasses <- rep("numeric", length(miss_names))
    df <- read.table(text = '', colClasses = colClasses, col.names = miss_names)
    df[1:nrow(test_mat), ] <- 0
    test_mat <- cbind(test_mat, df)
  }
  # return the columns in the same order as the training matrix;
  # glmnet's predict() matches columns by position
  as.matrix(test_mat)[, train_mat_cols]
}
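# Toy check (not in the original gist): a test matrix missing the training
# phrase "c" gets a zero-filled "c" column, and its extra phrase "d" is dropped
tr <- cbind(a = c(1, 0), b = c(0, 1), c = c(1, 1))
te <- cbind(a = c(2, 0), b = c(0, 2), d = c(5, 5))
testmat(colnames(tr), te) # columns a, b, c, with c all zeros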
# Train and test matrices
train_mat <- chisqTwo(train_dtm, data_train$labels)
test_mat <- testmat(colnames(train_mat), as.matrix(test_dtm))
# Take out the heavy dtms in the memory
rm(train_dtm)
rm(test_dtm)
# Run garbage collector to free up memory
gc()
# Fit the model
# ~~~~~~~~~~~~~~~~~~~~
# Lasso (cross-validated lambda)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
registerDoMC(cores = 2) # set cores
fit_lasso <- cv.glmnet(train_mat, data_train$labels, family = "binomial",
                       nfolds = 5, type.measure = "class", parallel = TRUE)
pred <- predict(fit_lasso, newx = test_mat, s = "lambda.min", type = "response")
# Prediction accuracy
conf_mat <- table(pred > .5, data_test$labels)
conf_mat
sum(diag(conf_mat))/sum(conf_mat)
# Plot Cross-Validation Curve
pdf("cvCurve.pdf")
plot(fit_lasso)
dev.off()
# SVM
# ~~~~~~~~~~~~~~
svp <- ksvm(as.matrix(train_mat), as.factor(data_train$labels), type = "C-svc",
            kernel = 'vanilladot', C = 100, scaled = c())
spred <- predict(svp, test_mat)
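# Prediction accuracy for the SVM, mirroring the lasso evaluation above
# (not in the original gist)
svm_conf <- table(spred, data_test$labels)
svm_conf
sum(diag(svm_conf))/sum(svm_conf)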
@krcarriere commented:

With this code, I'm hitting this error after running lines 122-126 or 127-130.

Error in rep(seq_along(x), sapply(tflist, length)) :
invalid 'times' argument

It seems to be coming from what I'm assuming is the function at lines 94-95, but I don't see why. Any suggestions? It's a relatively small dataset - only 1,000 observations - though the text for each is about a paragraph per observation.
