Skip to content

Instantly share code, notes, and snippets.

@Fishbiscuit
Last active December 13, 2019 05:01
Show Gist options
  • Save Fishbiscuit/a64b494f6ddd6a4de9823644933dca42 to your computer and use it in GitHub Desktop.
Save Fishbiscuit/a64b494f6ddd6a4de9823644933dca42 to your computer and use it in GitHub Desktop.
library(tm)
library(SnowballC)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
library(dplyr)
library(adabag)
library(stringr)
# Preprocess ----
# `twitter` holds the labelled training tweets (its `sentiment` column is
# used further down); `all` is read from the combined file and supplies the
# raw tweet text that the cleaning steps below operate on.
twitter <- read.csv(file = "train.csv", stringsAsFactors = FALSE)
all <- read.csv(
  file = "Test & train combines tgt.csv",
  stringsAsFactors = FALSE
)
# Untouched tweet text; the author's note for this baseline: CART 0.81007
text <- all$tweet
#text <- gsub("RT","", text) #0.8099259
#text <- gsub("weather", " neutral ", text)
#text <- gsub("Current Conditions", " neutral ", text) #0.9099259
#text <- gsub("current conditions", " neutral ", text)
#text <- gsub("Special Weather Statement", " neutral ", text) #0.8100741
#text <- gsub("M0ZIF QTH Weather", " neutral ", text)
#text <- gsub("San Antonio, Texas", " neutral ", text)
# first few combined 0.8097778
#text <- gsub("@mention:", " ", text)
#text <- gsub("#WEATHER:", " neutral ", text)
#text <- gsub("Anthem,", text)
# Token substitution ----
# Replace each entry of `patterns` with `token`, strictly in the order given.
# Every pattern is handed to gsub() as a regular expression, exactly as
# written. Order matters: an earlier substitution can consume text that a
# later pattern (in the same or a later list) would otherwise have matched.
substitute_all <- function(x, patterns, token) {
  Reduce(function(acc, pattern) gsub(pattern, token, acc), patterns, x)
}

# Locations
location_patterns <- c('San Diego', 'san diego')
text <- substitute_all(text, location_patterns, ' location ')

# Decode the HTML entities for "<" and ">" so emoticons such as ":<" can match
text <- gsub('&lt;', "<", text)
text <- gsub('&gt;', ">", text)

# Neutral markers: ":<digit>" fragments, report/forecast boilerplate, hashtags
neutral_patterns <- c(
  ':1', ':2', ':3', ':4', ':5', ':6', ':7', ':8', ':9', ':0',
  '=B', '=R', '=M', '=C', 'inches', 'Full Forecast',
  'BULLETIN', 'bulletin', 'Bulletin', 'outlook', 'OUTLOOK', 'Outlook',
  'risk', 'pct', 'expir', 'spc', 'Baro', '#arwx', 'gardener', 'Feels F',
  '#tnwx', '#mowx', 'Current Conditions:', 'Special Weather Statement',
  'M0ZIF QTH Weather', 'San Antonio, Texas Weather', '#WEATHER:', 'Anthem,',
  'New event.', 'Special Weather Statement', 'Tonight-', 'Fair and Windy',
  'Overcast and Windy', 'Overcast and 5', 'Overcast and 6', 'Overcast and 4',
  'Overcast and 7'
)
text <- substitute_all(text, neutral_patterns, ' emoji_neut ')

# Positive markers: happy emoticons plus a few hand-picked tokens
positive_patterns <- c(
  ":-)", ':))', ':)', ';)', ':o)', ":-0)", ':]', '=]', ':D', 'xD', 'X-D',
  'XD', '=-D', '=D', '=3', ":')", ':-P', ':P', ':-p', ':p', '#=p', ':-b',
  ':b', ':s', ':@', '>:D', '80', 'allur', 'lounge', 'Terminal'
)
text <- substitute_all(text, positive_patterns, " emoji_pos ")

# Negative markers: sad/angry emoticons plus a few hand-picked tokens.
# NOTE(review): ':@' is also in the positive list, which runs first, so the
# entry here can no longer match anything - confirm which list it belongs to.
negative_patterns <- c(
  ':L', ':-/', ':S', ':@', ':\\[', '=L', ':<', ':-\\[', '=/', ':\\(',
  ":'\\(", ':c', ';\\(', '>:\\(', '):<', '>.<', 'blech', 'smh', '90', 'die',
  'moodkiller', 'nemesis', 'horribl', 'bitch', 'wth', 'wtf', 'depress',
  'crappy', 'shitty', 'bipolar', 'stupid', 'fuk', 'fuck this weather'
)
text <- substitute_all(text, negative_patterns, " emoji_neg ")

# Collapse any letter repeated three or more times in a row into a single,
# space-padded copy of that letter.
# Author's recorded accuracy: 0.8117037 + emoji = 0.8183704
text <- gsub("([[:alpha:]])\\1{2,}", " \\1 ", text)
#text <- gsub("#", "", text)
#text <- gsub("@", "", text)
#text <- gsub("not ", "not", text)
#text <- gsub("MPH", " neutral ", text) #0.809333
#text <- gsub("mph", " neutral ", text)
# Build the document-term matrix ----
# Wrap the cleaned tweets in a tm corpus and normalise them in this order:
# lower-case, remove English stop words, strip punctuation, stem.
corpus <- Corpus(VectorSource(text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stemDocument)

# One row per tweet, one column per stemmed term.
dtmall <- DocumentTermMatrix(corpus)
# Drop terms whose sparsity exceeds 0.995.
spdtmall <- removeSparseTerms(dtmall, 0.995)
allSparse <- as.data.frame(as.matrix(spdtmall))
# Terms must be syntactically valid column names for the model formulas.
# unique = TRUE guards against two different terms collapsing onto the same
# name (make.names() maps every invalid character to "."), which would leave
# duplicate columns and break the `label ~ .` model fits further down.
colnames(allSparse) <- make.names(colnames(allSparse), unique = TRUE)
# Slice, label and split ----
# The former `test <- read.csv("test.csv", ...)` was dropped: its result was
# never read and `test` is overwritten by the hold-out split below.

# The first nrow(twitter) rows of `allSparse` are taken to be the labelled
# training tweets and the remaining rows the unlabelled ones. This is the
# same layout the previous hard-coded 1:22500 / 22501:30000 slices assumed,
# now derived from the data.
n_labelled <- nrow(twitter)
stopifnot(n_labelled > 0, n_labelled <= nrow(allSparse))
trainset <- allSparse[seq_len(n_labelled), , drop = FALSE]
testset <- allSparse[
  n_labelled + seq_len(nrow(allSparse) - n_labelled), ,
  drop = FALSE
]

# add sentiments into train set
# `sentiment` is the multi-class label; the three logical factors are the
# one-vs-rest targets (3 = positive, 2 = neutral, 1 = negative).
trainset$sentiment <- as.factor(twitter$sentiment)
trainset$positive <- as.factor(twitter$sentiment == 3)
trainset$neutral <- as.factor(twitter$sentiment == 2)
trainset$negative <- as.factor(twitter$sentiment == 1)

#split training set
# 70/30 split of the labelled rows; the seed makes it reproducible.
set.seed(123)
spl <- sample.split(trainset$sentiment, SplitRatio = 0.7)
train <- subset(trainset, spl == TRUE)
test <- subset(trainset, spl == FALSE)
# One-vs-rest data sets ----
# Each binary model must see exactly one label column, otherwise the other
# labels would leak into the predictors through the `label ~ .` formulas.
# (A stray bare `text` expression that used to follow this section was
# removed: at top level it auto-printed the whole tweet vector.)
label_columns <- c("sentiment", "positive", "neutral", "negative")

# Return `data` without any label column other than `label`; column order
# and rows are left unchanged.
keep_only_label <- function(data, label) {
  unwanted <- setdiff(label_columns, label)
  data[, !(names(data) %in% unwanted), drop = FALSE]
}

#create positive set
trainPos <- keep_only_label(train, "positive")
testPos <- keep_only_label(test, "positive")
#create negative set
trainNeg <- keep_only_label(train, "negative")
testNeg <- keep_only_label(test, "negative")
#create neutral set
trainNeut <- keep_only_label(train, "neutral")
testNeut <- keep_only_label(test, "neutral")
# CARTs ----
# Positive-vs-rest classification tree. The very small cp lets the tree
# grow with almost no complexity penalty.
set.seed(123)
twitterCARTpos <- rpart(
  positive ~ .,
  data = trainPos,
  method = "class",
  cp = 10^-6
)
# printcp(twitterCARTpos)

# Hold-out predictions: class probabilities (reused when the three models
# are combined) and hard class labels (used for the accuracy below).
predictCARTposprob <- predict(twitterCARTpos, newdata = testPos, type = "prob")
predictCARTpos <- predict(twitterCARTpos, newdata = testPos, type = "class")

# Confusion table (rows = actual, columns = predicted); accuracy is the
# share of hold-out rows on its diagonal.
pos_accu <- table(testPos$positive, predictCARTpos)
accu_CART_pos <- sum(diag(pos_accu)) / sum(pos_accu)
# accu_CART_pos recorded by the author: 0.8856296
# Negative-vs-rest classification tree, fitted like the other CART models.
set.seed(123)
twitterCARTneg <- rpart(
  negative ~ .,
  data = trainNeg,
  method = "class",
  cp = 10^-6
)
# prp(twitterCARTneg)

# Hold-out predictions: class probabilities and hard class labels.
predictCARTnegprob <- predict(twitterCARTneg, newdata = testNeg, type = "prob")
predictCARTneg <- predict(twitterCARTneg, newdata = testNeg, type = "class")

# Confusion table (rows = actual, columns = predicted) and its accuracy.
neg_accu <- table(testNeg$negative, predictCARTneg)
accu_CART_neg <- sum(diag(neg_accu)) / sum(neg_accu)
# accu_CART_neg recorded by the author: 0.8352593
# Neutral-vs-rest classification tree, fitted like the other CART models.
set.seed(123)
twitterCARTneut <- rpart(
  neutral ~ .,
  data = trainNeut,
  method = "class",
  cp = 10^-6
)
# prp(twitterCARTneut)

# Hold-out predictions: class probabilities and hard class labels.
predictCARTneutprob <- predict(twitterCARTneut, newdata = testNeut, type = "prob")
predictCARTneut <- predict(twitterCARTneut, newdata = testNeut, type = "class")

# Confusion table (rows = actual, columns = predicted) and its accuracy.
neut_accu <- table(testNeut$neutral, predictCARTneut)
accu_CART_neut <- sum(diag(neut_accu)) / sum(neut_accu)
# accu_CART_neut recorded by the author: 0.890963 0.886963
# Combine the three one-vs-rest CART models ----
# The binary targets are factors of a logical, so in every *prob matrix
# column 1 is P(label is FALSE) and column 2 is P(label is TRUE). A class
# score adds its own model's "TRUE" probability to the "FALSE" probabilities
# of the other two models.
allprobability <- data.frame(
  neg = predictCARTnegprob[, 2] + predictCARTneutprob[, 1] +
    predictCARTposprob[, 1],
  neut = predictCARTneutprob[, 2] + predictCARTnegprob[, 1] +
    predictCARTposprob[, 1],
  pos = predictCARTposprob[, 2] + predictCARTnegprob[, 1] +
    predictCARTneutprob[, 1]
)
cart_score_matrix <- as.matrix(allprobability)
allprobability$max <- apply(cart_score_matrix, 1, max)
# Predicted class = column with the highest score (1 = negative,
# 2 = neutral, 3 = positive); ties go to the lowest class, as before.
allprobability$sentiment <- max.col(cart_score_matrix, ties.method = "first")

# Fixing the predicted levels keeps the confusion table three columns wide
# even when a class is never predicted; indexing [3, 3] on a narrower table
# would otherwise fail with "subscript out of bounds".
cart_predicted <- factor(allprobability$sentiment, levels = 1:3)
combine_accu <- table(test$sentiment, cart_predicted,
                      dnn = c("actual", "predicted"))
#combine_accu
accu_CART_combine <- sum(diag(combine_accu)) / sum(combine_accu)
accu_CART_combine #0.8382222 without weights - 0.8379259
# Accuracies noted by the author during experiments:
# w emoji [1] 0.8214815, 0.8100741
#0.8205
#819407
#8195556
#0.8184
# RF ----
# Positive-vs-rest random forest with default hyper-parameters.
# The former `method = "class"` and `cp = 10^-6` arguments were removed:
# they are rpart options that randomForest() swallows through `...` without
# using them, so they only suggested tuning that never took place.
set.seed(123)
twitterRFpos <- randomForest(positive ~ ., data = trainPos)

# Hold-out predictions. "response" is the documented name for hard class
# labels; the legacy "class" spelling is converted to it internally.
predictRFpos <- predict(twitterRFpos, newdata = testPos, type = "response")
predictRFposprob <- predict(twitterRFpos, newdata = testPos, type = "prob")

# Confusion table (rows = actual, columns = predicted) and its accuracy.
pos_rf_accu <- table(testPos$positive, predictRFpos)
accu_RF_pos <- sum(diag(pos_rf_accu)) / sum(pos_rf_accu)
# accu_RF_pos recorded by the author: 0.9017777
# Negative-vs-rest random forest with default hyper-parameters.
# The former `method = "class"` and `cp = 10^-6` arguments were removed:
# they are rpart options that randomForest() swallows through `...` without
# using them.
set.seed(123)
twitterRFneg <- randomForest(negative ~ ., data = trainNeg)

# Hold-out predictions. "response" is the documented name for hard class
# labels; the legacy "class" spelling is converted to it internally.
predictRFneg <- predict(twitterRFneg, newdata = testNeg, type = "response")
predictRFnegprob <- predict(twitterRFneg, newdata = testNeg, type = "prob")

# Confusion table (rows = actual, columns = predicted) and its accuracy.
neg_rf_accu <- table(testNeg$negative, predictRFneg)
accu_RF_neg <- sum(diag(neg_rf_accu)) / sum(neg_rf_accu)
# accu_RF_neg recorded by the author: 0.861333
# Neutral-vs-rest random forest with default hyper-parameters.
# The former `method = "class"` and `cp = 10^-6` arguments were removed:
# they are rpart options that randomForest() swallows through `...` without
# using them.
set.seed(123)
twitterRFneut <- randomForest(neutral ~ ., data = trainNeut)

# Hold-out predictions. "response" is the documented name for hard class
# labels; the legacy "class" spelling is converted to it internally.
predictRFneut <- predict(twitterRFneut, newdata = testNeut, type = "response")
predictRFneutprob <- predict(twitterRFneut, newdata = testNeut, type = "prob")

# Confusion table (rows = actual, columns = predicted) and its accuracy.
neut_rf_accu <- table(testNeut$neutral, predictRFneut)
accu_RF_neut <- sum(diag(neut_rf_accu)) / sum(neut_rf_accu)
# accu_RF_neut recorded by the author: 0.87 0.9084444
# Report the three random-forest accuracies together. print() shows only its
# first argument (further positional arguments are taken as `digits` and
# `quote`), so the values are bundled into one named vector.
print(c(neg = accu_RF_neg, neut = accu_RF_neut, pos = accu_RF_pos))
# Combine ----
# The script used to assign `allprobability` four times in a row, so only
# the last variant ever reached the accuracy calculation. The three that
# were overwritten are kept below, disabled, as a record of what was tried.
#
# Weighted CART:
#   neg  <- predictCARTnegprob[,2]*0.8321 + predictCARTneutprob[,1]*0.9017 + predictCARTposprob[,1]*0.8893
#   neut <- predictCARTneutprob[,2]*0.9017 + predictCARTnegprob[,1]*0.8321 + predictCARTposprob[,1]*0.8893
#   pos  <- predictCARTposprob[,2]*0.8893 + predictCARTnegprob[,1]*0.8321 + predictCARTneutprob[,1]*0.9017
# Weighted RF:
#   neg  <- predictRFnegprob[,2]*0.8635 + predictRFneutprob[,1]*0.9074 + predictRFposprob[,1]*0.9034
#   neut <- predictRFneutprob[,2]*0.9074 + predictRFnegprob[,1]*0.8635 + predictRFposprob[,1]*0.9034
#   pos  <- predictRFposprob[,2]*0.9034 + predictRFnegprob[,1]*0.8635 + predictRFneutprob[,1]*0.9074
# Unweighted RF, all three models:
#   neg  <- predictRFnegprob[,2] + predictRFneutprob[,1] + predictRFposprob[,1]
#   neut <- predictRFneutprob[,2] + predictRFnegprob[,1] + predictRFposprob[,1]
#   pos  <- predictRFposprob[,2] + predictRFnegprob[,1] + predictRFneutprob[,1]

# Active variant: only the negative and positive forests are used. Column 1
# of a *prob matrix is P(label is FALSE) and column 2 is P(label is TRUE),
# so "neutral" is scored as "neither negative nor positive".
allprobability <- data.frame(
  neg = predictRFnegprob[, 2] + predictRFposprob[, 1],
  neut = predictRFnegprob[, 1] + predictRFposprob[, 1],
  pos = predictRFposprob[, 2] + predictRFnegprob[, 1]
)
rf_score_matrix <- as.matrix(allprobability)
# The per-row maximum is stored but no longer printed; the bare
# `allprobability$max` expression dumped one value per hold-out row.
allprobability$max <- apply(rf_score_matrix, 1, max)
# Predicted class = column with the highest score (1 = negative,
# 2 = neutral, 3 = positive); ties go to the lowest class, as before.
allprobability$sentiment <- max.col(rf_score_matrix, ties.method = "first")

# Fixing the predicted levels keeps the confusion table three columns wide
# even when a class is never predicted; indexing [3, 3] on a narrower table
# would otherwise fail with "subscript out of bounds".
rf_predicted <- factor(allprobability$sentiment, levels = 1:3)
combine_accu <- table(test$sentiment, rf_predicted,
                      dnn = c("actual", "predicted"))
combine_accu
accu_RF_combine <- sum(diag(combine_accu)) / sum(combine_accu)
# Legacy name: the script has always stored this random-forest result in
# `accu_CART_combine` (replacing the CART value), so that is preserved.
accu_CART_combine <- accu_RF_combine
accu_RF_combine
# NOTE(review): the trailing note on the original line read
# "0.8382222 without weights - 0.8379259"; it looks copied from the CART
# section - confirm which model those figures belong to.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment