# ---- Load data ----
# NOTE(review): setwd() ties the script to one machine's directory layout.
# It is kept so that every later step sees the same working directory as
# before; consider passing the data directory in instead.
setwd("~/Dropbox/DePaul/OnlineNewsPopularity")

# Fail early with a readable message instead of read.csv()'s connection error.
if (!file.exists("OnlineNewsPopularity.csv")) {
  stop("OnlineNewsPopularity.csv not found in ", getwd(), call. = FALSE)
}
OnlineNewsPopularity <- read.csv("OnlineNewsPopularity.csv", header = TRUE)
# ---- Data preprocessing ----
# Work on a copy: the raw OnlineNewsPopularity frame is still read below when
# the derived columns (pubDay, dataChannel, share2bins) are built.
df.onNews <- OnlineNewsPopularity

# Remove the ID-style columns so they are not offered to the models.
df.onNews$url <- NULL
df.onNews$timedelta <- NULL
# ---- Change dummy variables to factors ----

# Collapse a set of 0/1 indicator columns into one character label per row.
#
# data:         data frame holding the indicator columns.
# dummy_labels: named character vector; names are indicator column names,
#               values are the labels to emit, listed in priority order
#               (the first listed column equal to 1 wins).
# fallback:     label used when none of the listed columns equals 1.
#
# Returns one label per row of `data`. This is the nested ifelse() chain
# written as a loop: the chain is built from the innermost (last) column
# outwards, so an NA indicator yields NA unless an earlier-listed column
# already matched.
collapse_dummies <- function(data, dummy_labels, fallback) {
  labels <- rep(fallback, nrow(data))
  for (column in rev(names(dummy_labels))) {
    labels <- ifelse(data[[column]] == 1, dummy_labels[[column]], labels)
  }
  labels
}

# Publication day. Sunday is the fallback, so its own indicator is never read.
# NOTE(review): a row with no weekday flag set is also labelled "sunday" --
# confirm that every row has exactly one flag.
weekday_labels <- c(
  weekday_is_monday    = "monday",
  weekday_is_tuesday   = "tuesday",
  weekday_is_wednesday = "wednesday",
  weekday_is_thursday  = "thursday",
  weekday_is_friday    = "friday",
  weekday_is_saturday  = "saturday"
)
df.onNews$pubDay <- as.factor(
  collapse_dummies(OnlineNewsPopularity, weekday_labels, fallback = "sunday")
)
# The factor replaces all seven indicator columns.
for (dummy_column in c(names(weekday_labels), "weekday_is_sunday")) {
  df.onNews[[dummy_column]] <- NULL
}

# Data channel. "world" is the fallback, so its own indicator is never read.
# NOTE(review): a row with no channel flag set is also labelled "world" --
# confirm that this is intended rather than a separate "none" level.
channel_labels <- c(
  data_channel_is_lifestyle     = "lifestyle",
  data_channel_is_entertainment = "entertainment",
  data_channel_is_bus           = "business",
  data_channel_is_socmed        = "Social",
  data_channel_is_tech          = "tech"
)
df.onNews$dataChannel <- as.factor(
  collapse_dummies(OnlineNewsPopularity, channel_labels, fallback = "world")
)
# The factor replaces all six indicator columns.
for (dummy_column in c(names(channel_labels), "data_channel_is_world")) {
  df.onNews[[dummy_column]] <- NULL
}
# ---- Class label: bin the share count ----
# Articles with strictly more shares than the cut-off are "high", the rest
# "low". The resulting two-level factor is the classification target.
share_cutoff <- 1300
df.onNews$share2bins <- as.factor(
  ifelse(OnlineNewsPopularity$shares > share_cutoff, "high", "low")
)
# ---- Exploratory plots ----
library(ggplot2)
theme_set(theme_bw(base_size = 28))

# Plots and tables are wrapped in print() so they are also produced when the
# script is run through source(), where top-level values are not auto-printed.

# Categorical predictors: geom_bar() counts rows per factor level. A histogram
# stat needs a continuous x, so it cannot be used on these factor columns.
print(
  ggplot(df.onNews, aes(x = dataChannel, fill = share2bins)) +
    geom_bar() +
    labs(title = "Histogram for data channel", x = "Channel", y = "count")
)
print(table(df.onNews$dataChannel, df.onNews$share2bins))

print(
  ggplot(df.onNews, aes(x = pubDay, fill = share2bins)) +
    geom_bar() +
    labs(title = "Histogram for publication day", x = "Day", y = "count")
)
print(table(df.onNews$pubDay, df.onNews$share2bins))

# Continuous predictors: 30-bin histograms restricted to the x range
# [3.8, 8000]; rows outside that range are dropped from the plot by xlim().
print(
  ggplot(df.onNews, aes(x = self_reference_min_shares, fill = share2bins)) +
    geom_histogram(bins = 30) +
    xlim(3.8, 8000) +
    labs(
      title = "Histogram for self_reference_min_shares",
      x = "Min. shares of referenced articles in Mashable",
      y = "count"
    )
)
# Cross-tabulate against the class label (column name is share2bins).
print(table(df.onNews$self_reference_min_shares, df.onNews$share2bins))

print(
  ggplot(df.onNews, aes(x = kw_avg_avg, fill = share2bins)) +
    geom_histogram(bins = 30) +
    xlim(3.8, 8000) +
    labs(title = "Histogram for kw_avg_avg", x = "Avg. keyword", y = "count")
)

hist(df.onNews$shares, xlim = c(3.8, 500000))

# share2bins was derived from the share count, so the raw count is removed
# before modelling; otherwise `share2bins ~ .` would use it as a predictor.
df.onNews$shares <- NULL
# ---- Required libraries ----
library(rpart.plot)
library(caret)
library(e1071)
library(klaR)
library(rattle)
library(RColorBrewer)
library(randomForest)
# ---- Classification modelling ----
# Fixed seed so the partition (and the resampling that follows) is repeatable.
set.seed(415)

# 66%/34% train/test split of the dataset. createDataPartition() draws the
# training rows within each level of share2bins; list = FALSE makes it return
# row indices as a matrix usable directly for row subsetting.
trainIndex <- createDataPartition(df.onNews$share2bins, p = 0.66, list = FALSE)
data_train <- df.onNews[trainIndex, ]
data_test <- df.onNews[-trainIndex, ]
# ---- Decision tree ----
rpart_started <- Sys.time()
print(rpart_started)

# 10-fold cross-validation, shared by every rpart fit below.
cv_control <- trainControl(method = "cv", number = 10)

# Fit an rpart classifier on data_train; tune_length is the number of
# candidate tuning-parameter values caret evaluates.
fit_rpart <- function(tune_length) {
  train(
    share2bins ~ .,
    data = data_train,
    method = "rpart",
    tuneLength = tune_length,
    trControl = cv_control
  )
}

# Fits run in this fixed order so they consume the random stream exactly as
# five consecutive train() calls would.
fit.rpart5 <- fit_rpart(5)
fit.rpart8 <- fit_rpart(8)
fit.rpart13 <- fit_rpart(13)
fit.rpart17 <- fit_rpart(17)
fit.rpart20 <- fit_rpart(20)

# NOTE(review): the summary and prp() plot use fit.rpart5 while the
# predictions and the other outputs use fit.rpart8 -- confirm this is intended.
print(summary(fit.rpart5$finalModel))

# Select predictors by name rather than by a hard-coded column range, so the
# call keeps working if the number of columns changes.
test_predictors <- data_test[
  , setdiff(names(data_test), "share2bins"), drop = FALSE
]
predictions <- predict(fit.rpart8, newdata = test_predictors)
print(confusionMatrix(predictions, data_test$share2bins))

fancyRpartPlot(fit.rpart8$finalModel)
prp(fit.rpart5$finalModel)
print(fit.rpart8)
print(fit.rpart8$finalModel)

rpart_finished <- Sys.time()
print(rpart_finished)
print(rpart_finished - rpart_started)
##################################################
# ---- Naive Bayes ----
nb_started <- Sys.time()
print(nb_started)

# 10-fold cross-validation for the naive Bayes fit.
cv_control <- trainControl(method = "cv", number = 10)
fit.nb <- train(
  share2bins ~ .,
  data = data_train,
  method = "nb",
  trControl = cv_control
)

# Select predictors by name: a fixed range such as 1:58 fails with
# "undefined columns selected" when the frame has fewer columns than that.
test_predictors <- data_test[
  , setdiff(names(data_test), "share2bins"), drop = FALSE
]
predictions <- predict(fit.nb, newdata = test_predictors)
print(confusionMatrix(predictions, data_test$share2bins))

print(fit.nb$finalModel)
print(fit.nb)

nb_finished <- Sys.time()
print(nb_finished)
print(nb_finished - nb_started)
##################################################
# ---- Random forest ----
# 501 trees with mtry fixed at 4, evaluated with 10-fold cross-validation.
# allowParallel is an option of trainControl(); arguments given to train()
# itself are forwarded to the model-fitting function instead.
fit.rf <- train(
  share2bins ~ .,
  data = data_train,
  method = "rf",
  ntree = 501,
  tuneGrid = data.frame(mtry = 4),
  trControl = trainControl(method = "cv", number = 10, allowParallel = TRUE)
)

# Select predictors by name rather than by a hard-coded column range.
test_predictors <- data_test[
  , setdiff(names(data_test), "share2bins"), drop = FALSE
]
# The fitted model is fit.rf; no object named fit.rf2 is created in this file.
predictions <- predict(fit.rf, newdata = test_predictors)
print(confusionMatrix(predictions, data_test$share2bins))

print(fit.rf)
print(fit.rf$finalModel)

# Namespaced so these resolve to the randomForest implementations regardless
# of which packages are attached or in what order.
print(randomForest::importance(fit.rf$finalModel))
randomForest::varImpPlot(fit.rf$finalModel)
##################################################
# End of script.