Skip to content

Instantly share code, notes, and snippets.

@zunman
Last active March 16, 2017 02:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zunman/93a025f62f1161f86d9c7d10564519f5 to your computer and use it in GitHub Desktop.
Save zunman/93a025f62f1161f86d9c7d10564519f5 to your computer and use it in GitHub Desktop.
## Project helper functions used throughout this script.
source("~/experiments/lib/functions.R")
## Cached datasets: training, validation and test splits (v1).
load(file = "~/experiments/cache/socialMed.training-v1.RData")
load(file = "~/experiments/cache/socialMed.validation-v1.RData")
load(file = "~/experiments/cache/socialMed.test-v1.RData")
#' Run the full feature-preparation pipeline on a dataset.
#'
#' Applies, in this order: cleanData, normalizeData,
#' addPolarityWordsAsFeatures, addDepressionWordsCountAsFeature,
#' pronounsAsFeature, getTweetClass and checkAttributeTypes. Each step
#' receives the result of the previous one. The step functions are not
#' defined in this file (presumably they come from lib/functions.R -- confirm).
#'
#' @param dataset Dataset to process; handed first to cleanData().
#' @return The value produced by the last step, checkAttributeTypes(),
#'   returned visibly.
prepareFeatures <- function(dataset) {
  dataset <- cleanData(dataset)
  dataset <- normalizeData(dataset)
  dataset <- addPolarityWordsAsFeatures(dataset)
  dataset <- addDepressionWordsCountAsFeature(dataset)
  dataset <- pronounsAsFeature(dataset)
  dataset <- getTweetClass(dataset)
  # Return the last step directly: finishing the function on an assignment
  # would hand the result back invisibly.
  checkAttributeTypes(dataset)
}
# Feature preparation. The validation rows are row-bound onto the test rows,
# so the combined set is what this script later treats as test data.
socialMed.training <- prepareFeatures(socialMed.training)
socialMed.test <- prepareFeatures(
  rbind(socialMed.validation, socialMed.test)
)

# Merge the prepared rows with mergeTweets(), then derive the user class.
UserLevel.train <- getUserClass(mergeTweets(socialMed.training))
UserLevel.test <- getUserClass(mergeTweets(socialMed.test))

# Optional caching of the user-level data (disabled):
# l <- c("UserLevel.test", "UserLevel.train")
# save(list = l, file="results/UserLevelData.RData")
#' Remove the "nodeID" and "text" columns from a dataset.
#'
#' @param dataset Data frame to strip. Columns that are absent are
#'   simply skipped.
#' @return `dataset` without the two columns; everything else unchanged.
dropColumns <- function(dataset) {
  for (column in c("nodeID", "text")) {
    dataset[[column]] <- NULL
  }
  dataset
}
# Shortcut to reuse previously saved user-level data (disabled):
# load("~/experiments/results/UserLevelData.RData")
library(caret)

# Modelling tables: user-level data without the nodeID/text columns.
# NOTE(review): revalue() is neither defined nor attached in this file;
# presumably it is plyr::revalue made available elsewhere -- confirm.
imbal_train <- dropColumns(UserLevel.train)
imbal_train$UserClass <- revalue(
  imbal_train$UserClass,
  replace = c("self-reported" = "no")
)

imbal_test <- dropColumns(UserLevel.test)
imbal_test$UserClass <- revalue(
  imbal_test$UserClass,
  replace = c("self-reported" = "no")
)

# Optional: SMOTE-resampled copy of the training table.
library(DMwR)
set.seed(9560)  # fixed seed so the resampled table is reproducible
smote_train <- SMOTE(form = UserClass ~ ., data = imbal_train)
# Class counts after resampling.
table(smote_train$UserClass)
# Resampling scheme: 10-fold cross-validation, repeated 10 times.
# classProbs + twoClassSummary are needed so that "ROC" can be used as the
# model-selection metric in train() below.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

## TRAINING
# The seed is reset before each fit so each model is reproducible on its own.

# Linear SVM on the original (imbalanced) training table.
set.seed(5627)
orig_fit <- train(
  UserClass ~ .,
  data = imbal_train,
  method = "svmLinear",
  metric = "ROC",
  # Full argument name; "preProc" only worked through partial matching.
  preProcess = c("center", "scale"),
  trControl = ctrl
)

# Linear SVM on the SMOTE-resampled table. SMOTE was applied once, before
# (outside of) the resampling loop, hence the name.
set.seed(5627)
smote_outside <- train(
  UserClass ~ .,
  data = smote_train,
  method = "svmLinear",
  metric = "ROC",
  preProcess = c("center", "scale"),
  trControl = ctrl
)
## PERFORMANCE
# Each confusion matrix treats "yes" as the positive class and reports
# precision/recall statistics. Predictions are stored as extra columns on
# the table they were computed from.

## Confusion matrix - training data (each model on its own training table)
imbal_train$pred <- predict(orig_fit, newdata = imbal_train)
confusionMatrix(
  data = imbal_train$pred,
  reference = imbal_train$UserClass,
  positive = "yes",
  mode = "prec_recall"
)

smote_train$pred <- predict(smote_outside, newdata = smote_train)
confusionMatrix(
  data = smote_train$pred,
  reference = smote_train$UserClass,
  positive = "yes",
  mode = "prec_recall"
)

## Confusion matrix - test data (both models on the same imbalanced table)
imbal_test$pred <- predict(orig_fit, newdata = imbal_test)
confusionMatrix(
  data = imbal_test$pred,
  reference = imbal_test$UserClass,
  positive = "yes",
  mode = "prec_recall"
)

imbal_test$predSmote <- predict(smote_outside, newdata = imbal_test)
confusionMatrix(
  data = imbal_test$predSmote,
  reference = imbal_test$UserClass,
  positive = "yes",
  mode = "prec_recall"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment