Skip to content

Instantly share code, notes, and snippets.

@not-for-me
Created June 17, 2014 07:22
Show Gist options
  • Save not-for-me/f530c391dae203ea600d to your computer and use it in GitHub Desktop.
Save not-for-me/f530c391dae203ea600d to your computer and use it in GitHub Desktop.
Document Classification with whole Term
# Load libraries --------------------------------------------------------------
# tm: text mining (Corpus, tm_map, DocumentTermMatrix)
library(tm)
# party: conditional-inference trees (ctree)
library(party)
# rpart: CART decision trees
library(rpart)
# nnet: single-hidden-layer neural network classifier
library(nnet)
# NOTE(review): randomForest is loaded but never used in the visible code —
# confirm whether it is still needed before removing.
library(randomForest)
# Record the current R environment for reproducibility.
sessionInfo()
# Directories containing the plain-text training and test documents.
trainFilePath <- "~/Documents/mining/train_txt"
testFilePath <- "~/Documents/mining/test_txt"
# Read the plain-text documents into corpora ----------------------------------
# Both corpora share the same reader configuration.
plain_reader <- list(reader = readPlain, language = "en")

trainTextCorpus <- Corpus(DirSource(trainFilePath), readerControl = plain_reader)
trainTextCorpus

testTextCorpus <- Corpus(DirSource(testFilePath), readerControl = plain_reader)
testTextCorpus

# Concatenate train + test so the same preprocessing runs over every document.
totalTextCorpus <- c(trainTextCorpus, testTextCorpus)
totalTextCorpus
# Convert text encoding to UTF-8, then normalise the documents.
# NOTE: in tm >= 0.6 any transformation that returns a plain character vector
# (iconv, tolower) must be wrapped in content_transformer(); otherwise the
# corpus elements are silently replaced by bare strings and a later
# DocumentTermMatrix() call fails.
# NOTE(review): 'UTF-8-MAC' is a macOS-specific iconv encoding — confirm the
# script only runs on macOS, or switch to plain 'UTF-8'.
totalTextCorpus <- tm_map(
  totalTextCorpus,
  content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte'))
)
# Preprocessing: lower-case, drop stopwords/numbers/punctuation, collapse
# runs of whitespace, then stem each document.
totalTextCorpus <- tm_map(totalTextCorpus, content_transformer(tolower))
totalTextCorpus <- tm_map(totalTextCorpus, removeWords, stopwords("english"))
totalTextCorpus <- tm_map(totalTextCorpus, removeNumbers)
totalTextCorpus <- tm_map(totalTextCorpus, removePunctuation)
totalTextCorpus <- tm_map(totalTextCorpus, stripWhitespace)
totalTextCorpus <- tm_map(totalTextCorpus, stemDocument)
# Build a document-term matrix keeping only terms of length 3 to 12 characters.
dtm <- DocumentTermMatrix(totalTextCorpus, control = list(wordLengths = c(3, 12)))
dim(dtm)
# Drop terms that are absent from more than 80% of the documents.
stm <- removeSparseTerms(dtm, 0.8)
dim(stm)
# Materialise the sparse matrix as a data frame. inspect() is a printing
# helper (and in recent tm versions returns only a preview), so convert via
# as.matrix() instead of relying on inspect()'s return value.
df <- as.data.frame(as.matrix(stm))
# Attach the document category label.
# NOTE(review): this assumes DirSource() listed the documents in the order
# train docs first (BigData, Cloud, DB, Multi, SDN, WSN), then the test docs
# (BigData, DB, Multi, WSN) — confirm against the directory contents.
ncol(df)
train_category <- c(rep("BigData", 101), rep("Cloud", 97), rep("DB", 102),
                    rep("Multi", 100), rep("SDN", 99), rep("WSN", 99))
test_category <- c(rep("BigData", 10), rep("DB", 9),
                   rep("Multi", 10), rep("WSN", 9))
category <- c(train_category, test_category)
df <- cbind(df, category)
ncol(df)
# Split back into train/test using the known label counts instead of the
# original hard-coded magic numbers (rows 1:598 / 599:636, columns 1:1008),
# so the split survives a change in vocabulary size.
n_train <- length(train_category)
train_df <- df[seq_len(n_train), ]
test_df <- df[(n_train + 1):nrow(df), ]
#----------------------------------------------------------------
# Algorithm: conditional-inference decision tree (party::ctree)
# Predict `category` from every term column.
myFormula <- category ~ .
party_tree <- ctree(myFormula, data = train_df)
party_tree
plot(party_tree)
plot(party_tree, type = "simple")
# Confusion matrix on the training data (predicted x actual).
train_result_table <- table(predict(party_tree), train_category)
train_result_table
# Train data accuracy: fraction of documents on the diagonal.
sum(diag(train_result_table)) / sum(train_result_table)
test_result_table <- table(predict(party_tree, newdata = test_df), test_category)
test_result_table
# Test data accuracy. The test set has no Cloud/SDN documents, so the matrix
# is 6x4 and diag() does not line up; match the correct cells by class name
# instead of the original hard-coded indices [1,1], [3,2], [4,3], [6,4].
correct_cells <- test_result_table[cbind(
  match(colnames(test_result_table), rownames(test_result_table)),
  seq_len(ncol(test_result_table))
)]
sum(correct_cells) / sum(test_result_table)
#----------------------------------------------------------------
# Algorithm: CART decision tree (rpart)
# minsplit = 10: do not attempt a split on nodes with fewer than 10 documents.
dt <- rpart(category ~ ., data = train_df, control = rpart.control(minsplit = 10))
attributes(dt)
print(dt)
plot(dt)
text(dt, use.n = TRUE)
# Confusion matrix on the training data (predicted x actual).
train_result_table <- table(predict(dt, train_df, type = "class"), train_category)
train_result_table
# Train data accuracy: fraction of documents on the diagonal.
sum(diag(train_result_table)) / sum(train_result_table)
test_result_table <- table(predict(dt, test_df, type = "class"), test_category)
test_result_table
# Test data accuracy. The confusion matrix is 6x4 (test set has no Cloud/SDN),
# so match correct cells by class name instead of the original hard-coded
# indices [1,1], [3,2], [4,3], [6,4].
correct_cells <- test_result_table[cbind(
  match(colnames(test_result_table), rownames(test_result_table)),
  seq_len(ncol(test_result_table))
)]
sum(correct_cells) / sum(test_result_table)
#----------------------------------------------------------------
# Algorithm: single-hidden-layer neural network (nnet)
# Hyperparameters below were chosen after several runs with different values,
# keeping the settings that performed well.
nnet.classifier <- nnet(category ~ ., data = train_df, size = 5, rang = 0.1,
                        decay = 5e-5, maxit = 1500, MaxNWts = 5100)
# Confusion matrix on the training data (predicted x actual).
train_result_table <- table(predict(nnet.classifier, train_df, type = "class"), train_category)
train_result_table
# Train data accuracy: fraction of documents on the diagonal.
sum(diag(train_result_table)) / sum(train_result_table)
test_result_table <- table(predict(nnet.classifier, test_df, type = "class"), test_category)
test_result_table
# Test data accuracy. The confusion matrix is 6x4 (test set has no Cloud/SDN),
# so match correct cells by class name instead of the original hard-coded
# indices [1,1], [3,2], [4,3], [6,4].
correct_cells <- test_result_table[cbind(
  match(colnames(test_result_table), rownames(test_result_table)),
  seq_len(ncol(test_result_table))
)]
sum(correct_cells) / sum(test_result_table)
# Miscellaneous notes
# Variant of the DTM call with an added document-frequency bound:
# train_dtm <- DocumentTermMatrix(trainTextCorpus, control = list(wordLengths = c(3, 12), bounds = list(global = c(100, Inf))))
# To use stemDocument, install.packages("SnowballC") is required.
# Sys.setenv(LANG="EN") to display error messages in English.
# http://web.letras.up.pt/bhsmaia/EDV/apresentacoes/Bradzil_Classif_withTM.pdf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment