rwalk/svm_tm_example.R

## svm_tm_example.R
#  This gist samples positive and negative examples of a topic
#  in the Reuters21578 corpus using R's "tm" package to manage
#  the data.  After some simple transformations to the text,
#  the data are extracted to a document-term matrix and a simple
#  SVM model is fit to classify positive examples of the topic.
#
#  author: R. Walker  (r_walker@zoho.com)
#  LICENSE: MIT
#
# NOTE: Download the full Reuters21578 corpus from
# http://modnlp.berlios.de/reuters21578.html
#

# libraries
library("tm")
library("e1071")

# construct file list
path2data <- 'apath/reuters21578/xml/'
fl1 <- paste0(path2data, 'reut2-00' , 0:9, '.xml')
fl2 <- paste0(path2data, 'reut2-0',10:21, '.xml')
file_list <- c(fl1,fl2)

# what is the positive category?
positive = "crude"

# Process corpus with multicore? Requires "parallel" package if nCores>1.
nCores <- 1

####################################################################################################################
# helper functions
####################################################################################################################
# function to sample equal number of negative and positive
# cases from the reuters data
sampleReuters <- function(file, positive){
  tryCatch({

    # build corpus using TM's Reuter's source reader
    # Note the encoding setting!
    corp     <- Corpus(ReutersSource(file, encoding = "latin1"))

    # build a vector indicating whether or not the
    # document has the tag
    has_label <- sapply(corp, function(x) length(intersect(c(positive),LocalMetaData(x)$Topics))>0)

    # sample document so that an equal number of positive and negative examples are returned
    positiveCases <- lapply(which(has_label), function(x) corp[[x]])
    negativeCases <- lapply(sample(which(!has_label), size = sum(has_label), replace=FALSE), function(x) corp[[x]])

    # bind selected documents into a subcorpus
    res <- c(positiveCases, negativeCases)

    # return corpus
    return(res)
  }, error = function(e) print(paste(e," | Couldn't process ",file)))
}

####################################################################################################################
# build the corpus
####################################################################################################################
if(nCores>1){
  # parallel
  require(parallel)
  ptm <- proc.time()
  corpus <- mclapply(file_list, function(f) sampleReuters(f,positive), mc.cores = nCores)
  proc.time() - ptm
} else{
  # nonparallel
  ptm <- proc.time()
  corpus <- lapply(file_list, function(f) sampleReuters(f,positive))
  proc.time() - ptm
}

# combine the lists of corpora into a single corpus
corpus   <- do.call(c, do.call(c,corpus))

# reduce dimensionality of text data by text transformations
corpus <- tm_map(corpus, as.PlainTextDocument)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeWords , stopwords())
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, stemDocument)

# bag of words modeling
DTM <- DocumentTermMatrix(corpus,
                          control = list(weighting= function(x) weightBin(x)))
DTM <- removeSparseTerms(DTM, .98)
whichDocs <- rownames(DTM)

####################################################################################################################
# SVM Model
####################################################################################################################
y <- ifelse(sapply(corpus, function(x) positive %in% LocalMetaData(x)$Topics), 1, -1)
X <- as.matrix(DTM)
data <- as.data.frame(cbind(y,X))

# split into test and train
train.index <- sample(1:length(y), size=floor(.8*length(y)), replace=FALSE)
train <- data[train.index,]
test <- data[-train.index,]

# fit the svm and do a simple validation test. Cost parameter should be tuned.
sv <- svm(y~., train, type="C-classification", kernel="linear", cost=1)
table(Pred=predict(sv, test[,-1]) , True=test$y)
	# This gist samples positive and negative examples of a topic
	# in the Reuters21578 corpus using R's "tm" package to manage
	# the data. After some simple transformations to the text,
	# the data are extracted to a document-term matrix and a simple
	# SVM model is fit to classify positive examples of the topic.
	#
	# author: R. Walker (r_walker@zoho.com)
	# LICENSE: MIT
	#
	# NOTE: Download the full Reuters21578 corpus from
	# http://modnlp.berlios.de/reuters21578.html
	#

	# libraries
	library("tm")
	library("e1071")

	# construct file list
	path2data <- 'apath/reuters21578/xml/'
	fl1 <- paste0(path2data, 'reut2-00' , 0:9, '.xml')
	fl2 <- paste0(path2data, 'reut2-0',10:21, '.xml')
	file_list <- c(fl1,fl2)

	# what is the positive category?
	positive = "crude"

	# Process corpus with multicore? Requires "parallel" package if nCores>1.
	nCores <- 1

	####################################################################################################################
	# helper functions
	####################################################################################################################
	# function to sample equal number of negative and positive
	# cases from the reuters data
	sampleReuters <- function(file, positive){
	tryCatch({

	# build corpus using TM's Reuter's source reader
	# Note the encoding setting!
	corp <- Corpus(ReutersSource(file, encoding = "latin1"))

	# build a vector indicating whether or not the
	# document has the tag
	has_label <- sapply(corp, function(x) length(intersect(c(positive),LocalMetaData(x)$Topics))>0)

	# sample document so that an equal number of positive and negative examples are returned
	positiveCases <- lapply(which(has_label), function(x) corp[[x]])
	negativeCases <- lapply(sample(which(!has_label), size = sum(has_label), replace=FALSE), function(x) corp[[x]])

	# bind selected documents into a subcorpus
	res <- c(positiveCases, negativeCases)

	# return corpus
	return(res)
	}, error = function(e) print(paste(e," \| Couldn't process ",file)))
	}

	####################################################################################################################
	# build the corpus
	####################################################################################################################
	if(nCores>1){
	# parallel
	require(parallel)
	ptm <- proc.time()
	corpus <- mclapply(file_list, function(f) sampleReuters(f,positive), mc.cores = nCores)
	proc.time() - ptm
	} else{
	# nonparallel
	ptm <- proc.time()
	corpus <- lapply(file_list, function(f) sampleReuters(f,positive))
	proc.time() - ptm
	}

	# combine the lists of corpora into a single corpus
	corpus <- do.call(c, do.call(c,corpus))

	# reduce dimensionality of text data by text transformations
	corpus <- tm_map(corpus, as.PlainTextDocument)
	corpus <- tm_map(corpus, removeNumbers)
	corpus <- tm_map(corpus, tolower)
	corpus <- tm_map(corpus, removeWords , stopwords())
	corpus <- tm_map(corpus, removePunctuation)
	corpus <- tm_map(corpus, stripWhitespace)
	corpus <- tm_map(corpus, stemDocument)

	# bag of words modeling
	DTM <- DocumentTermMatrix(corpus,
	control = list(weighting= function(x) weightBin(x)))
	DTM <- removeSparseTerms(DTM, .98)
	whichDocs <- rownames(DTM)

	####################################################################################################################
	# SVM Model
	####################################################################################################################
	y <- ifelse(sapply(corpus, function(x) positive %in% LocalMetaData(x)$Topics), 1, -1)
	X <- as.matrix(DTM)
	data <- as.data.frame(cbind(y,X))

	# split into test and train
	train.index <- sample(1:length(y), size=floor(.8*length(y)), replace=FALSE)
	train <- data[train.index,]
	test <- data[-train.index,]

	# fit the svm and do a simple validation test. Cost parameter should be tuned.
	sv <- svm(y~., train, type="C-classification", kernel="linear", cost=1)
	table(Pred=predict(sv, test[,-1]) , True=test$y)