"
Basic Text Classifier
- Takes a csv with a text column, and column of labels
- Splits into train and test
- Preprocesses text using tm/bag-of-words, 1/2-order Markov
- Uses SVM and Lasso
@author: Gaurav Sood
"
# Load libs
library(readr)     # read in big data a bit quicker
library(tm)        # preprocess text
library(glmnet)    # lasso
library(SnowballC) # stemming
library(kernlab)   # svm
library(doMC)      # parallel computing
"
Load data
Expected Data Format:
Let label column = labels
Say labels are of two classes: 1, 0
Let text column = text
"
data <- read_csv("data.csv")
data <- as.data.frame(data)
colnames(data) <- make.names(colnames(data))
# Drop rows with no category label
data <- subset(data, !is.na(labels))
"
Depending on the classification task, you may not need the entire training dataset.
Note: You can always plot how much accuracy improves as n increases)
Subsetting may also prove useful for computational reasons for some
If subsetting, stratified sampling not a bad idea if labels v. imbalanced
"
"
Stratified function takes:
data (data),
name of the label column (labels),
either proportion of rows from each label (p) or n per label (n_per_label), and
random seed (seed) for reproducibility
"
stratified <- function(data, labels, n_per_label, p = NA, seed = 314159265){
  dsample <- NULL
  set.seed(seed)
  for(i in levels(as.factor(data[, labels]))){
    dsub <- data[data[, labels] == i, ]
    if(is.na(p)){
      dsub <- dsub[sample(1:nrow(dsub), n_per_label), ]
    } else {
      dsub <- dsub[sample(1:nrow(dsub), ceiling(nrow(dsub)*p)), ]
    }
    dsample <- c(dsample, row.names(dsub))
  }
  dsample
}
# Training data
train_rows <- stratified(data, "labels", n_per_label = 25000)
data_train <- data[train_rows, ]
# Test data (10k is more than enough)
# Match on row names rather than positions: after subsetting above, row names
# no longer correspond to row positions
data_test <- data[!(row.names(data) %in% train_rows), ]
data_test <- data_test[sample(1:nrow(data_test), 10000), ]
# An abstract function to preprocess a text column
preprocess <- function(text_column){
  # Use tm to build a corpus from the text column
  corpus <- Corpus(VectorSource(text_column))
  # all lower case
  corpus <- tm_map(corpus, content_transformer(tolower))
  # remove punctuation
  corpus <- tm_map(corpus, content_transformer(removePunctuation))
  # remove numbers
  corpus <- tm_map(corpus, content_transformer(removeNumbers))
  # remove stopwords
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  # stem the documents
  corpus <- tm_map(corpus, stemDocument)
  # strip whitespace (always at the end)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}
# Preprocess training and test data
train_corpus <- preprocess(data_train$text)
test_corpus <- preprocess(data_test$text)
# Create a document-term matrix for train and test,
# including only bigrams and trigrams
# Bi-/trigram tokenizer function (raise max for longer n-grams)
bitrigramtokeniser <- function(x){
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 3))
}
# Tokenizer dependencies
# On Windows, point rJava at the JRE first, e.g.:
# Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre7/')
library(rJava)
library(RWeka)
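# Quick sanity check of the tokenizer on a toy string (not in the original
# gist): it should return every bigram and trigram of the input,
# e.g. "quick brown" and "the quick brown"
bitrigramtokeniser("the quick brown fox")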
"
Remove remove words <=2
TdIdf weighting
Infrequent (< than 1% of documents) and very frequent (> 80% of documents) terms not included
"
train_dtm <- DocumentTermMatrix(train_corpus,
  control = list(wordLengths = c(2, Inf),
                 tokenize = bitrigramtokeniser,
                 weighting = function(x) weightTfIdf(x, normalize = FALSE),
                 bounds = list(global = c(floor(length(train_corpus)*0.01),
                                          floor(length(train_corpus)*0.8)))))
test_dtm <- DocumentTermMatrix(test_corpus,
  control = list(wordLengths = c(2, Inf),
                 tokenize = bitrigramtokeniser,
                 weighting = function(x) weightTfIdf(x, normalize = FALSE),
                 bounds = list(global = c(floor(length(test_corpus)*0.001),
                                          floor(length(test_corpus)*0.8)))))
# Variable selection
# ~~~~~~~~~~~~~~~~~~~~
"
For dimension reduction.
The function calculates chi-square value for each phrase and keeps phrases with highest chi_square values
Ideally you want to put variable selection as part of cross-validation.
chisqTwo function takes:
document term matrix (dtm),
vector of labels (labels), and
number of n-grams you want to keep (n_out)
"
chisqTwo <- function(dtm, labels, n_out = 2000){
  mat <- as.matrix(dtm)
  cat1 <- colSums(mat[labels == 1, ]) # total times each phrase is used in cat. 1
  cat2 <- colSums(mat[labels == 0, ]) # total times each phrase is used in cat. 2
  n_cat1 <- sum(mat[labels == 1, ]) - cat1 # all other phrase uses in cat. 1
  n_cat2 <- sum(mat[labels == 0, ]) - cat2 # all other phrase uses in cat. 2
  num <- (cat1*n_cat2 - cat2*n_cat1)^2
  den <- (cat1 + cat2)*(cat1 + n_cat1)*(cat2 + n_cat2)*(n_cat1 + n_cat2)
  chisq <- num/den
  # keep the n_out phrases with the highest chi-square values
  chi_order <- chisq[order(chisq, decreasing = TRUE)][1:n_out]
  mat[, colnames(mat) %in% names(chi_order)]
}
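# Toy check (not in the original gist): with four documents and three phrases,
# the two class-separating phrases ("a" and "c") should be kept and the
# uninformative one ("b") dropped
toy <- cbind(a = c(5, 4, 0, 0), b = c(2, 2, 2, 2), c = c(0, 0, 3, 4))
colnames(chisqTwo(toy, labels = c(1, 1, 0, 0), n_out = 2)) # "a" "c"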
"
With high dimensional data, test matrix may not have all the phrases training matrix has.
This function fixes that - so that test matrix has the same columns as training.
testmat takes column names of training matrix (train_mat_cols), and
test matrix (test_mat)
and outputs test_matrix with the same columns as training matrix
"
# Test matrix maker
testmat <- function(train_mat_cols, test_mat){
  # drop phrases that are not in the training matrix
  test_mat <- test_mat[, colnames(test_mat) %in% train_mat_cols]
  # add zero-filled columns for training phrases missing from the test matrix
  miss_names <- train_mat_cols[!(train_mat_cols %in% colnames(test_mat))]
  if(length(miss_names) != 0){
    colClasses <- rep("numeric", length(miss_names))
    df <- read.table(text = '', colClasses = colClasses, col.names = miss_names)
    df[1:nrow(test_mat), ] <- 0
    test_mat <- cbind(test_mat, df)
  }
  # return the columns in the same order as the training matrix;
  # glmnet's predict() matches columns by position
  as.matrix(test_mat)[, train_mat_cols]
}
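# Toy check (not in the original gist): a test matrix missing the training
# phrase "c" gets a zero-filled "c" column, and its extra phrase "d" is dropped
tr <- cbind(a = c(1, 0), b = c(0, 1), c = c(1, 1))
te <- cbind(a = c(2, 0), b = c(0, 2), d = c(5, 5))
testmat(colnames(tr), te) # columns a, b, c, with c all zeros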
# Train and test matrices
train_mat <- chisqTwo(train_dtm, data_train$labels)
test_mat <- testmat(colnames(train_mat), as.matrix(test_dtm))
# Take out the heavy dtms in the memory
rm(train_dtm)
rm(test_dtm)
# Run garbage collector to free up memory
gc()
# Fit the model
# ~~~~~~~~~~~~~~~~~~~~
# Lasso (cross-validated lambda)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
registerDoMC(cores = 2) # set cores
fit_lasso <- cv.glmnet(train_mat, data_train$labels, family = "binomial",
                       nfolds = 5, type.measure = "class", parallel = TRUE)
pred <- predict(fit_lasso, newx = test_mat, s = "lambda.min", type = "response")
# Prediction accuracy
conf_mat <- table(pred > .5, data_test$labels)
conf_mat
sum(diag(conf_mat))/sum(conf_mat)
# Plot Cross-Validation Curve
pdf("cvCurve.pdf")
plot(fit_lasso)
dev.off()
# SVM
# ~~~~~~~~~~~~~~
svp <- ksvm(as.matrix(train_mat), as.factor(data_train$labels), type = "C-svc",
            kernel = 'vanilladot', C = 100, scaled = c())
spred <- predict(svp, test_mat)
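# Prediction accuracy for the SVM, mirroring the lasso evaluation above
# (not in the original gist)
svm_conf <- table(spred, data_test$labels)
svm_conf
sum(diag(svm_conf))/sum(svm_conf)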
@krcarriere commented:

With this code, I'm hitting this error after running lines 122-126 or 127-130.

Error in rep(seq_along(x), sapply(tflist, length)) :
invalid 'times' argument

It seems to be coming from what I'm assuming is the function at lines 94-95, but I don't see why. Any suggestions? It's a relatively small dataset - only 1,000 observations - though the text for each is about a paragraph per observation.
