#http://www3.dsi.uminho.pt/pcortez/forestfires/

#Step 1 - Load Data and check summary details
# The input path is built explicitly rather than via setwd(), so the
# session's working directory is left untouched.
data_dir <- "E:/RNotes/RData"
data_file <- file.path(data_dir, "forestfires.csv")
if (!file.exists(data_file)) {
  stop("Input file not found: ", data_file, call. = FALSE)
}
forestfires <- read.csv(file = data_file)
head(forestfires)
summary(forestfires)

#Step 2 - Visual Correlation between variables
#library(psych)
#pairs.panels(forestfires)

#Step 3 - Filter to burned areas, transform the response, derive predictors
# Earlier experiments (log of DC / DMC, min-max scaling of area) are
# intentionally not applied.

# Min-max scaling: (v - min) / (max - min).
# Yields NaN when every value of v is identical (zero range).
rescale01 <- function(v) {
  lo <- min(v)
  (v - lo) / (max(v) - lo)
}

# Keep only records with a strictly positive burned area.
forestfires <- subset(forestfires, area > 0)
nrow(forestfires)

# Response on the log scale (well defined because area > 0 here).
forestfires$area <- log(forestfires$area)

# Scaled coordinates, then their interaction terms (built from the scaled
# X and Y and rescaled again).
forestfires$X <- rescale01(forestfires$X)
forestfires$Y <- rescale01(forestfires$Y)
forestfires$XY <- rescale01(forestfires$X * forestfires$Y)
forestfires$XXYY <- rescale01(
  forestfires$X * forestfires$X * forestfires$Y * forestfires$Y
)
forestfires$FFMC <- rescale01(forestfires$FFMC)

# Interaction features; FFMCDMCDC uses the already-scaled FFMC.
forestfires$DMCDC <- forestfires$DMC * forestfires$DC
forestfires$tempwind <- forestfires$temp * forestfires$wind
forestfires$FFMCDMCDC <- forestfires$FFMC * forestfires$DMC * forestfires$DC

#print(unique(forestfires$month))
#print(unique(forestfires$day))

# Print a handful of randomly chosen rows as a sanity check of the derived
# columns. (sample() applied to the data frame itself would shuffle its
# columns and print every row.)
forestfires[sample(nrow(forestfires), min(6L, nrow(forestfires))), ]

#Step 4 - Replace month and day text with numeric values
# month1: jan = 1, ..., dec = 12
# day1:   sun = 0, ..., sat = 6
# match() returns NA for labels that are not in the lookup vector and works
# even when a label never occurs in the data.
month_labels <- c("jan", "feb", "mar", "apr", "may", "jun",
                  "jul", "aug", "sep", "oct", "nov", "dec")
day_labels <- c("sun", "mon", "tue", "wed", "thu", "fri", "sat")

forestfires$month1 <- as.numeric(
  match(as.character(forestfires$month), month_labels)
)
forestfires$day1 <- as.numeric(
  match(as.character(forestfires$day), day_labels)
) - 1

#Step 5 - Remove original columns
# Keep every column except the text-valued day and month.
forestfires <- forestfires[
  , !(names(forestfires) %in% c("day", "month")), drop = FALSE
]

#Step 6 - Run K means

library(NbClust)
set.seed(1)
# Evaluate k-means partitions with 2 to 15 clusters across NbClust's indices.
# NOTE(review): every column of forestfires is passed in, including the
# response (area) and columns on very different scales; confirm whether the
# inputs should be restricted and/or standardised before clustering.
numberofclusters <- NbClust(
  forestfires, min.nc = 2, max.nc = 15, method = "kmeans"
)

#Step 7 - Print results
# Tally the number of clusters proposed by each index (first row of
# Best.nc). The element is named in full; `$Best.n` only resolved through
# partial matching.
table(numberofclusters$Best.nc[1, ])

# Final partition with a fixed k = 3 and 10 random starts.
grpForest <- kmeans(forestfires, centers = 3, nstart = 10)
grpForest$cluster
grpForest$centers
grpForest$withinss
grpForest$size

# Split the data into one data frame per k-means cluster using vectorised
# row selection (no row-by-row rbind, no per-row progress printing).
clusterresults <- grpForest$cluster

# The cluster vector must line up one-to-one with the rows of the data.
stopifnot(length(clusterresults) == nrow(forestfires))

forestfirescluster1 <- forestfires[which(clusterresults == 1), , drop = FALSE]
forestfirescluster2 <- forestfires[which(clusterresults == 2), , drop = FALSE]
forestfirescluster3 <- forestfires[which(clusterresults == 3), , drop = FALSE]

# Rows per cluster. Rows containing NA are kept: no complete.cases()
# filtering is applied to any cluster.
nrow(forestfirescluster1)
nrow(forestfirescluster2)
nrow(forestfirescluster3)

# Linear model of area for cluster 1 on the raw and derived predictors,
# with month1 and day1 entered as factors. Columns are looked up through
# `data =` so the coefficients carry plain column names and predict() can
# be used with new data. Rows with NA are excluded from the fit
# (na.exclude).
model1 <- lm(
  area ~ DMCDC + tempwind + FFMCDMCDC + XY + XXYY + X + Y + FFMC + DMC + DC +
    ISI + temp + RH + wind + rain + factor(month1) + factor(day1),
  data = forestfirescluster1,
  na.action = na.exclude
)
model1
summary(model1)
# Residuals against fitted values. An lm object stores its residuals in
# `residuals`; a NULL second argument would make plot() draw the fitted
# values against their index instead.
plot(model1$fitted.values, model1$residuals)
hist(model1$residuals)

library(leaps)
# Best-subset search over the same predictors for cluster 1, keeping the 8
# best models of each size. The result is stored under its own name so the
# lm fit held in model1 is not overwritten.
subsets1 <- regsubsets(
  area ~ DMCDC + tempwind + FFMCDMCDC + XY + XXYY + X + Y + FFMC + DMC + DC +
    ISI + temp + RH + wind + rain + factor(month1) + factor(day1),
  data = forestfirescluster1,
  na.action = na.exclude,
  nbest = 8
)
subsets1
summary(subsets1)

# A regsubsets object has no fitted values or residuals, so residual plots
# do not apply; show the ranking of the candidate subsets instead.
plot(subsets1)


# Linear model of area for cluster 2 on the raw and derived predictors,
# with month1 and day1 entered as factors. Columns are looked up through
# `data =` so the coefficients carry plain column names and predict() can
# be used with new data. Rows with NA are excluded from the fit
# (na.exclude).
model2 <- lm(
  area ~ DMCDC + tempwind + FFMCDMCDC + XY + XXYY + X + Y + FFMC + DMC + DC +
    ISI + temp + RH + wind + rain + factor(month1) + factor(day1),
  data = forestfirescluster2,
  na.action = na.exclude
)
model2
summary(model2)
# Residuals against fitted values. An lm object stores its residuals in
# `residuals`; a NULL second argument would make plot() draw the fitted
# values against their index instead.
plot(model2$fitted.values, model2$residuals)
hist(model2$residuals)

# Linear model of area for cluster 3 on the raw and derived predictors,
# with month1 and day1 entered as factors. Columns are looked up through
# `data =` so the coefficients carry plain column names and predict() can
# be used with new data. Rows with NA are excluded from the fit
# (na.exclude).
model3 <- lm(
  area ~ DMCDC + tempwind + FFMCDMCDC + XY + XXYY + X + Y + FFMC + DMC + DC +
    ISI + temp + RH + wind + rain + factor(month1) + factor(day1),
  data = forestfirescluster3,
  na.action = na.exclude
)
model3
summary(model3)
# Residuals against fitted values. An lm object stores its residuals in
# `residuals`; a NULL second argument would make plot() draw the fitted
# values against their index instead.
plot(model3$fitted.values, model3$residuals)
hist(model3$residuals)