Created
May 13, 2016 05:44
-
-
Save Lanbig/9c365cbd372c0d7aed81011338893e91 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load data ----
# Switch to the project folder, then read the raw CSV; its first row holds
# the column names.
# NOTE(review): setwd() pins the script to one machine's directory layout.
setwd("~/Dropbox/DePaul/OnlineNewsPopularity")
OnlineNewsPopularity <- read.csv(file = "OnlineNewsPopularity.csv", header = TRUE)
# Data preprocessing ----
# Work on a copy so the raw data frame read from disk stays untouched.
df.onNews <- OnlineNewsPopularity

# Drop the columns the script treats as identifiers rather than predictors.
id_cols <- c("url", "timedelta")
df.onNews <- df.onNews[, setdiff(names(df.onNews), id_cols), drop = FALSE]

# Collapse the 0/1 weekday indicator columns into one factor.
# The first indicator that equals 1 wins; a row matching none of
# monday..saturday is labelled "sunday".
# NOTE(review): assumes every row has exactly one weekday flag set - confirm.
df.onNews$pubDay <- with(
  OnlineNewsPopularity,
  ifelse(weekday_is_monday == 1, "monday",
  ifelse(weekday_is_tuesday == 1, "tuesday",
  ifelse(weekday_is_wednesday == 1, "wednesday",
  ifelse(weekday_is_thursday == 1, "thursday",
  ifelse(weekday_is_friday == 1, "friday",
  ifelse(weekday_is_saturday == 1, "saturday", "sunday"))))))
)
df.onNews$pubDay <- as.factor(df.onNews$pubDay)

# The indicator columns are now redundant with pubDay.
weekday_cols <- paste0(
  "weekday_is_",
  c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
)
df.onNews <- df.onNews[, setdiff(names(df.onNews), weekday_cols), drop = FALSE]

# Collapse the 0/1 data-channel indicator columns into one factor.
# data_channel_is_world is tested explicitly: previously any row with none of
# the other five flags set was labelled "world" without checking its own flag,
# so rows belonging to no channel were silently counted as "world". Such rows
# are now labelled "other".
df.onNews$dataChannel <- with(
  OnlineNewsPopularity,
  ifelse(data_channel_is_lifestyle == 1, "lifestyle",
  ifelse(data_channel_is_entertainment == 1, "entertainment",
  ifelse(data_channel_is_bus == 1, "business",
  ifelse(data_channel_is_socmed == 1, "Social",
  ifelse(data_channel_is_tech == 1, "tech",
  ifelse(data_channel_is_world == 1, "world", "other"))))))
)

# The indicator columns are now redundant with dataChannel.
channel_cols <- paste0(
  "data_channel_is_",
  c("lifestyle", "entertainment", "bus", "socmed", "tech", "world")
)
df.onNews <- df.onNews[, setdiff(names(df.onNews), channel_cols), drop = FALSE]
df.onNews$dataChannel <- as.factor(df.onNews$dataChannel)

# Bin the share count into the two-level class label used as the model target:
# more than 1300 shares is "high", everything else is "low".
df.onNews$share2bins <- ifelse(OnlineNewsPopularity$shares > 1300, "high", "low")
df.onNews$share2bins <- as.factor(df.onNews$share2bins)
# Exploratory plots ----
library(ggplot2)
theme_set(theme_bw(base_size = 28))

# dataChannel is a factor, so per-level counts are drawn with a bar chart;
# a binned histogram (and a binwidth) only applies to continuous variables.
ggplot(df.onNews, aes(x = dataChannel, fill = share2bins)) +
  geom_bar() +
  labs(title = "Histogram for data channel", x = "Channel", y = "count")
table(df.onNews$dataChannel, df.onNews$share2bins)

# Same treatment for the publication-day factor.
ggplot(df.onNews, aes(x = pubDay, fill = share2bins)) +
  geom_bar() +
  labs(title = "Histogram for publication day", x = "Day", y = "count")
table(df.onNews$pubDay, df.onNews$share2bins)

# Continuous predictors: binned histograms restricted to the 3.8-8000 range.
ggplot(df.onNews, aes(x = self_reference_min_shares, fill = share2bins)) +
  geom_histogram() +
  xlim(3.8, 8000) +
  labs(
    title = "Histogram for self_reference_min_shares",
    x = "Min. shares of referenced articles in Mashable",
    y = "count"
  )
# Cross-tabulate against the class label (the column is share2bins; the
# previous name share2bins0 does not exist in df.onNews).
table(df.onNews$self_reference_min_shares, df.onNews$share2bins)

ggplot(df.onNews, aes(x = kw_avg_avg, fill = share2bins)) +
  geom_histogram() +
  xlim(3.8, 8000) +
  labs(title = "Histogram for kw_avg_avg", x = "Avg. keyword", y = "count")

# Base-graphics histogram of the raw share counts.
hist(df.onNews$shares, xlim = c(3.8, 500000))

# Drop the raw share count now that plotting is done: the class label
# share2bins is derived from it, so leaving it in would hand the models the
# answer through the `share2bins ~ .` formula.
df.onNews$shares <- NULL
###########Required Library########## | |
library(rpart.plot) | |
library(caret) | |
library(e1071) | |
library(klaR) | |
library(rattle) | |
library(rpart.plot) | |
library(RColorBrewer) | |
# Classification modelling: train/test split ----
# Fix the RNG seed so the partition drawn below is repeatable.
set.seed(415)

# 66% / 34% split on the class label. list = FALSE makes
# createDataPartition() return the selected row numbers as a matrix that can
# be used directly as a row index.
trainIndex <- createDataPartition(y = df.onNews$share2bins, p = 0.66, list = FALSE)
data_test <- df.onNews[-trainIndex, ]
data_train <- df.onNews[trainIndex, ]
# Decision tree (rpart) ----
Sys.time()  # start timestamp

# One 10-fold cross-validation specification shared by every rpart fit.
ctrl.rpart <- trainControl(method = "cv", number = 10)

# Fit the same model with progressively larger tuning grids
# (tuneLength = number of candidate tuning values caret evaluates).
fit.rpart5 <- train(share2bins ~ ., data = data_train, method = "rpart",
                    tuneLength = 5, trControl = ctrl.rpart)
fit.rpart8 <- train(share2bins ~ ., data = data_train, method = "rpart",
                    tuneLength = 8, trControl = ctrl.rpart)
fit.rpart13 <- train(share2bins ~ ., data = data_train, method = "rpart",
                     tuneLength = 13, trControl = ctrl.rpart)
fit.rpart17 <- train(share2bins ~ ., data = data_train, method = "rpart",
                     tuneLength = 17, trControl = ctrl.rpart)
fit.rpart20 <- train(share2bins ~ ., data = data_train, method = "rpart",
                     tuneLength = 20, trControl = ctrl.rpart)

summary(fit.rpart5$finalModel)

# Score the held-out set. The whole test frame is passed instead of a
# hard-coded column range: the model was fitted through a formula, so predict()
# picks the predictors it needs by name and the call no longer depends on how
# many columns preprocessing leaves behind.
predictions <- predict(fit.rpart8, newdata = data_test)
confusionMatrix(predictions, data_test$share2bins)

# NOTE(review): the plots/printouts below mix fit.rpart5 and fit.rpart8 -
# confirm that is intentional.
fancyRpartPlot(fit.rpart8$finalModel)
prp(fit.rpart5$finalModel)
print(fit.rpart8)
print(fit.rpart8$finalModel)
Sys.time()  # end timestamp
##################################################
# Naive Bayes ----
Sys.time()  # start timestamp

# 10-fold cross-validated Naive Bayes on all remaining predictors.
fit.nb <- train(share2bins ~ ., data = data_train, method = "nb",
                trControl = trainControl(method = "cv", number = 10))

# Score the held-out set. The previous call indexed data_test[, 1:58], which
# disagrees with the 1:48 range used for the other models on the same frame
# and fails with "undefined columns selected" when the frame has fewer than 58
# columns. The model was fitted through a formula, so predict() selects the
# predictors by name from the full test frame.
predictions <- predict(fit.nb, newdata = data_test)
confusionMatrix(predictions, data_test$share2bins)

print(fit.nb$finalModel)
print(fit.nb)
Sys.time()  # end timestamp
##################################################
# Random forest ----
# 10-fold cross-validated random forest with a single fixed tuning value
# (mtry = 4); ntree = 501 is forwarded to the underlying forest fit.
# allowParallel is an option of trainControl(), not of train(), so it is set
# there instead of being forwarded to the model-fitting function.
fit.rf <- train(share2bins ~ ., data = data_train, method = "rf",
                ntree = 501, tuneGrid = data.frame(mtry = 4),
                trControl = trainControl(method = "cv", number = 10,
                                         allowParallel = TRUE))

# Score the held-out set with the model fitted above (the previous code
# referenced an undefined object, fit.rf2). The full test frame is passed;
# predict() selects the predictors by name.
predictions <- predict(fit.rf, newdata = data_test)
confusionMatrix(predictions, data_test$share2bins)

print(fit.rf)
print(fit.rf$finalModel)

# Variable importance of the final forest, as a table and as a plot.
# NOTE(review): importance() and varImpPlot() presumably come from the
# randomForest package - confirm it is attached when this section runs.
importance(fit.rf$finalModel)
varImpPlot(fit.rf$finalModel)
##################################################
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment