Created
November 29, 2013 22:45
-
-
Save chrishanretty/7713010 to your computer and use it in GitHub Desktop.
Classifier for Times headlines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Classifier for Times headlines
### Trains SVM and maximum-entropy models on headline text (Title + Subtitle)
### to predict Major_Topic, then summarises and exports the results.

### Load libraries
library(RTextTools)

### Make sure .xlsx file has been converted to CSV properly
media <- read.csv("media1960-2008_websiteversion_111007.csv",
                  header = TRUE, as.is = TRUE)

### Dates are stored as day/month/year; keep only rows dated strictly after
### 1 January 1996 (subset() also drops rows whose Date failed to parse).
media$Date <- as.Date(media$Date, "%d/%m/%Y")
media <- subset(media, Date > as.Date("1996-01-01"))
media <- media[, c("Title", "Subtitle", "Major_Topic")]

### Build the document-term matrix from the two text columns.
### weightTfIdf is a tm function; it is namespace-qualified so that it
### resolves whether or not RTextTools attaches tm to the search path.
media_matrix <- create_matrix(cbind(media["Title"], media["Subtitle"]),
                              language = "english", removeNumbers = TRUE,
                              stemWords = TRUE, weighting = tm::weightTfIdf)

### Split by row order: the first `train_n` rows train the models and the
### remaining rows are held out for testing.
train_n <- 5700L
if (nrow(media) <= train_n) {
  # Without this check, (train_n + 1):nrow(media) would count backwards and
  # index rows that do not exist.
  stop("Need more than ", train_n, " rows after filtering to form a test ",
       "set; found ", nrow(media), ".", call. = FALSE)
}
train_rows <- seq_len(train_n)
test_rows <- seq(train_n + 1L, nrow(media))

### virgin = FALSE: the held-out rows are passed with their Major_Topic labels
### so that create_analytics() can score the predictions against them.
corpus <- create_container(media_matrix, media$Major_Topic,
                           trainSize = train_rows, testSize = test_rows,
                           virgin = FALSE)
names(attributes(corpus)) #class matrix_container

models <- train_models(corpus, algorithms = c("SVM", "MAXENT"))
results <- classify_models(corpus, models)

##########################################
# VIEW THE RESULTS BY CREATING ANALYTICS #
##########################################
analytics <- create_analytics(corpus, results)
head(analytics@algorithm_summary)
head(analytics@label_summary)
head(analytics@document_summary)
analytics@ensemble_summary

# WRITE OUT THE DATA TO A CSV --- look in your working directory
write.csv(analytics@algorithm_summary, "times_AlgorithmSummary.csv")
write.csv(analytics@label_summary, "times_LabelSummary.csv")
write.csv(analytics@document_summary, "times_DocumentSummary.csv")
write.csv(analytics@ensemble_summary, "times_EnsembleSummary.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment