Created
June 17, 2014 07:22
-
-
Save not-for-me/f530c391dae203ea600d to your computer and use it in GitHub Desktop.
Document classification with whole terms (R: tm, party, rpart, nnet)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load required libraries ----
library(tm)            # text mining: Corpus, DocumentTermMatrix
library(party)         # conditional inference trees (ctree)
library(rpart)         # CART decision trees
library(nnet)          # single-hidden-layer neural network
library(randomForest)  # random forests (loaded here; not used below)

# Print the current R session info for reproducibility
sessionInfo()
# File paths and corpus import ----
trainFilePath <- "~/Documents/mining/train_txt"
testFilePath <- "~/Documents/mining/test_txt"

# Shared reader settings: plain-text reader, English language tag.
readerCtrl <- list(reader = readPlain, language = "en")

# Read every plain-text file in each directory into a tm corpus.
trainTextCorpus <- Corpus(DirSource(trainFilePath), readerControl = readerCtrl)
trainTextCorpus

testTextCorpus <- Corpus(DirSource(testFilePath), readerControl = readerCtrl)
testTextCorpus

# Concatenate so preprocessing is applied uniformly to both sets.
# Training documents come first, then test documents -- the row-wise
# train/test split later relies on this ordering.
totalTextCorpus <- c(trainTextCorpus, testTextCorpus)
totalTextCorpus
# Preprocessing ----
# NOTE(review): with tm >= 0.6, non-tm functions passed to tm_map() must be
# wrapped in content_transformer(); passing bare functions (as the original
# did for iconv and tolower) corrupts the corpus structure and makes
# DocumentTermMatrix() fail downstream.

# Normalize text encoding to UTF-8 (Mac variant); invalid bytes are
# escaped (sub = 'byte') rather than silently dropped.
totalTextCorpus <- tm_map(
  totalTextCorpus,
  content_transformer(function(x) iconv(x, to = "UTF-8-MAC", sub = "byte"))
)

# Standard cleanup pipeline: case-fold, remove stopwords/numbers/punctuation,
# collapse whitespace, then stem. Lower-casing must precede stopword removal
# because stopwords("english") is lower-case.
totalTextCorpus <- tm_map(totalTextCorpus, content_transformer(tolower))
totalTextCorpus <- tm_map(totalTextCorpus, removeWords, stopwords("english"))
totalTextCorpus <- tm_map(totalTextCorpus, removeNumbers)
totalTextCorpus <- tm_map(totalTextCorpus, removePunctuation)
totalTextCorpus <- tm_map(totalTextCorpus, stripWhitespace)
# stemDocument requires the SnowballC package to be installed.
totalTextCorpus <- tm_map(totalTextCorpus, stemDocument)

# Build a document-term matrix keeping only terms 3-12 characters long.
dtm <- DocumentTermMatrix(totalTextCorpus, control = list(wordLengths = c(3, 12)))
dim(dtm)
# Sparse-term removal and data-frame construction ----
# Drop terms that are absent from at least 80% of documents.
stm <- removeSparseTerms(dtm, 0.8)
dim(stm)

# Convert the DTM to a data frame. as.matrix() is the correct coercion;
# inspect() (used in the original) is an interactive display helper that
# prints the entire matrix as a side effect and should not be used here.
df <- as.data.frame(as.matrix(stm))
ncol(df)

# Class labels in corpus order: training documents first, then test
# documents (matching totalTextCorpus <- c(train, test) above).
train_category <- c(rep("BigData", 101), rep("Cloud", 97), rep("DB", 102),
                    rep("Multi", 100), rep("SDN", 99), rep("WSN", 99))
test_category <- c(rep("BigData", 10), rep("DB", 9), rep("Multi", 10),
                   rep("WSN", 9))
category <- c(train_category, test_category)

# Attach the label as an explicit factor column: stringsAsFactors defaults
# to FALSE since R 4.0, and ctree()/rpart()/nnet() need a factor target.
df <- cbind(df, category = factor(category))
ncol(df)

# Split back into train/test by label-vector length instead of the
# original hard-coded indices (1:598 / 599:636, columns 1:1008), which
# silently break whenever the corpus or vocabulary size changes.
n_train <- length(train_category)
train_df <- df[seq_len(n_train), ]
test_df <- df[(n_train + 1):nrow(df), ]
#----------------------------------------------------------------
# Algorithm: conditional inference tree (party::ctree) ----
# Predict the document category from all term-frequency columns.
myFormula <- category ~ .

party_tree <- ctree(myFormula, data = train_df)
party_tree
plot(party_tree)
plot(party_tree, type = "simple")

# Training-set confusion matrix and accuracy (square table, so the
# diagonal holds the correctly classified counts).
train_result_table <- table(predict(party_tree), train_category)
train_result_table
sum(diag(train_result_table)) / sum(train_result_table)

# Test-set confusion matrix and accuracy. Compare predictions with the
# true labels directly instead of summing hard-coded table cells: the
# original [1,1]/[3,2]/[4,3]/[6,4] indexing only works because the test
# set happens to lack the Cloud and SDN classes, and breaks silently if
# the class set or its ordering changes.
party_test_pred <- predict(party_tree, newdata = test_df)
test_result_table <- table(party_test_pred, test_category)
test_result_table
mean(as.character(party_test_pred) == test_category)
#----------------------------------------------------------------
# Algorithm: CART decision tree (rpart) ----
# minsplit = 10: do not attempt a split on nodes with fewer than 10 docs.
dt <- rpart(myFormula, data = train_df, control = rpart.control(minsplit = 10))
attributes(dt)
print(dt)
plot(dt)
text(dt, use.n = TRUE)

# Training-set confusion matrix and accuracy.
rpart_train_pred <- predict(dt, train_df, type = "class")
train_result_table <- table(rpart_train_pred, train_category)
train_result_table
sum(diag(train_result_table)) / sum(train_result_table)

# Test-set accuracy by direct label comparison -- robust to classes
# missing from the test set, unlike the original hard-coded cell sum.
rpart_test_pred <- predict(dt, test_df, type = "class")
test_result_table <- table(rpart_test_pred, test_category)
test_result_table
mean(as.character(rpart_test_pred) == test_category)
#----------------------------------------------------------------
# Algorithm: single-hidden-layer neural network (nnet) ----
# Hyperparameters below were chosen by trying several combinations and
# keeping the best-performing one. MaxNWts must exceed the weight count
# implied by the input dimension and size = 5 hidden units.
nnet.classifier <- nnet(myFormula, data = train_df, size = 5, rang = 0.1,
                        decay = 5e-5, maxit = 1500, MaxNWts = 5100)

# Training-set confusion matrix and accuracy.
nnet_train_pred <- predict(nnet.classifier, train_df, type = "class")
train_result_table <- table(nnet_train_pred, train_category)
train_result_table
sum(diag(train_result_table)) / sum(train_result_table)

# Test-set accuracy by direct label comparison -- robust to classes
# missing from the test set, unlike the original hard-coded cell sum.
nnet_test_pred <- predict(nnet.classifier, test_df, type = "class")
test_result_table <- table(nnet_test_pred, test_category)
test_result_table
mean(as.character(nnet_test_pred) == test_category)
# Miscellaneous notes
# Variant with a document-frequency lower bound (term must appear in >= 100 docs):
# train_dtm <-DocumentTermMatrix(trainTextCorpus,control=list(wordLengths = c(3,12), bounds=list(global=c(100,Inf))))
# stemDocument (stemming) requires: install.packages("SnowballC")
# Sys.setenv(LANG="EN")  # show error messages in English
# Reference: http://web.letras.up.pt/bhsmaia/EDV/apresentacoes/Bradzil_Classif_withTM.pdf
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment