R: SVM example to predict "crude" topic in Reuters21578 Corpus
# This gist samples positive and negative examples of a topic
# in the Reuters21578 corpus using R's "tm" package to manage
# the data. After some simple transformations to the text,
# the data are extracted to a document-term matrix and a simple
# SVM model is fit to classify positive examples of the topic.
#
# author: R. Walker (r_walker@zoho.com)
# LICENSE: MIT
#
# NOTE: Download the full Reuters21578 corpus from
# http://modnlp.berlios.de/reuters21578.html
#
# libraries
library("tm")
library("e1071")
# construct the list of corpus XML files; set path2data to your local copy of the corpus
path2data <- 'apath/reuters21578/xml/'
fl1 <- paste0(path2data, 'reut2-00' , 0:9, '.xml')
fl2 <- paste0(path2data, 'reut2-0',10:21, '.xml')
file_list <- c(fl1,fl2)
# what is the positive category?
positive <- "crude"
# Process corpus with multicore? Requires "parallel" package if nCores>1.
nCores <- 1
####################################################################################################################
# helper functions
####################################################################################################################
# function to sample equal number of negative and positive
# cases from the reuters data
sampleReuters <- function(file, positive){
  tryCatch({
    # build a corpus using tm's Reuters source reader
    # Note the encoding setting!
    corp <- Corpus(ReutersSource(file, encoding = "latin1"))
    # build a logical vector indicating whether each document carries the positive topic
    has_label <- sapply(corp, function(x) length(intersect(c(positive), LocalMetaData(x)$Topics)) > 0)
    # sample documents so that an equal number of positive and negative examples are returned
    positiveCases <- lapply(which(has_label), function(x) corp[[x]])
    negativeCases <- lapply(sample(which(!has_label), size = sum(has_label), replace = FALSE), function(x) corp[[x]])
    # bind the selected documents into a subcorpus
    res <- c(positiveCases, negativeCases)
    # return the sampled documents
    return(res)
  }, error = function(e) print(paste(e, " | Couldn't process ", file)))
}
####################################################################################################################
# build the corpus
####################################################################################################################
if(nCores > 1){
  # parallel
  require(parallel)
  ptm <- proc.time()
  corpus <- mclapply(file_list, function(f) sampleReuters(f, positive), mc.cores = nCores)
  proc.time() - ptm
} else {
  # nonparallel
  ptm <- proc.time()
  corpus <- lapply(file_list, function(f) sampleReuters(f, positive))
  proc.time() - ptm
}
# combine the per-file lists of sampled documents into a single corpus
corpus <- do.call(c, do.call(c, corpus))
# reduce dimensionality of text data by text transformations
corpus <- tm_map(corpus, as.PlainTextDocument)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeWords , stopwords())
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, stemDocument)
# bag of words modeling
DTM <- DocumentTermMatrix(corpus,
                          control = list(weighting = function(x) weightBin(x)))
DTM <- removeSparseTerms(DTM, .98)
whichDocs <- rownames(DTM)
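# Optional check (illustrative): look at the size of the document-term matrix and
# list terms appearing in at least 100 documents after sparse-term removal.
dim(DTM)
findFreqTerms(DTM, lowfreq = 100)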
####################################################################################################################
# SVM Model
####################################################################################################################
y <- ifelse(sapply(corpus, function(x) positive %in% LocalMetaData(x)$Topics), 1, -1)
X <- as.matrix(DTM)
data <- as.data.frame(cbind(y,X))
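# Quick sanity check (illustrative): the per-file sampling above draws equal numbers of
# positive and negative documents, so the two classes should have roughly equal counts.
table(y)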
# split into test and train
train.index <- sample(1:length(y), size=floor(.8*length(y)), replace=FALSE)
train <- data[train.index,]
test <- data[-train.index,]
# fit the SVM and run a simple hold-out validation; the cost parameter should be tuned (see the tuning sketch below)
sv <- svm(y~., train, type="C-classification", kernel="linear", cost=1)
table(Pred=predict(sv, test[,-1]) , True=test$y)
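# A minimal sketch of tuning the cost parameter with e1071's tune(); the grid below is
# illustrative only. Converting y to a factor makes tune() report classification error
# rather than squared error.
tuneData <- train
tuneData$y <- factor(tuneData$y)
tuned <- tune(svm, y ~ ., data = tuneData, kernel = "linear",
              ranges = list(cost = c(0.01, 0.1, 1, 10, 100)))
summary(tuned)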