# Forest fires data set: http://www3.dsi.uminho.pt/pcortez/forestfires/

library(NbClust)

# Linearly rescale a numeric vector so that its minimum maps to 0 and its
# maximum maps to 1.
min_max_scale <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

# Step 1 - Load data and check summary details ----
# NOTE(review): setwd() changes the working directory for the whole session;
# kept because code outside this section may rely on it.
setwd("E:/RNotes/RData/")
forestfires <- read.csv(file = "forestfires.csv")
head(forestfires)
summary(forestfires)

# Step 2 - Visual correlation between variables (disabled) ----
# library(psych)
# pairs.panels(forestfires)

# Step 3 - Transform the response and engineer features ----
# Alternatives that were tried and left disabled:
# forestfires$DC <- log(forestfires$DC)
# forestfires$DMC <- log(forestfires$DMC)
# forestfires$area <- min_max_scale(forestfires$area)

# Keep only rows with a positive burned area so that log(area) is finite.
forestfires <- subset(forestfires, area > 0)
nrow(forestfires)
forestfires$area <- log(forestfires$area)

# Scale the spatial coordinates first; the interaction terms below are built
# from the scaled coordinates and then rescaled themselves.
forestfires$X <- min_max_scale(forestfires$X)
forestfires$Y <- min_max_scale(forestfires$Y)
forestfires$XY <- min_max_scale(forestfires$X * forestfires$Y)
forestfires$XXYY <- min_max_scale(
  forestfires$X * forestfires$X * forestfires$Y * forestfires$Y
)
forestfires$FFMC <- min_max_scale(forestfires$FFMC)

# Interaction features. FFMCDMCDC uses the already rescaled FFMC.
forestfires$DMCDC <- forestfires$DMC * forestfires$DC
forestfires$tempwind <- forestfires$temp * forestfires$wind
forestfires$FFMCDMCDC <- forestfires$FFMC * forestfires$DMC * forestfires$DC

# Step 4 - Replace month and day text with numeric codes ----
# month1: jan = 1 ... dec = 12; day1: sun = 0 ... sat = 6.
# Values not present in the lookup vectors become NA.
month_labels <- c(
  "jan", "feb", "mar", "apr", "may", "jun",
  "jul", "aug", "sep", "oct", "nov", "dec"
)
day_labels <- c("sun", "mon", "tue", "wed", "thu", "fri", "sat")
forestfires$month1 <- match(forestfires$month, month_labels)
forestfires$day1 <- match(forestfires$day, day_labels) - 1L

# Step 5 - Remove original text columns ----
forestfires$day <- NULL
forestfires$month <- NULL

# Step 6 - Run k-means ----
# NOTE(review): clustering runs on all columns, including the response
# `area` and features on very different scales - confirm this is intended.
set.seed(1)
numberofclusters <- NbClust(
  forestfires,
  min.nc = 2, max.nc = 15, method = "kmeans"
)

# Step 7 - Print results ----
# Number of indices voting for each candidate cluster count. The component
# is spelled out in full (Best.nc) instead of relying on partial matching.
table(numberofclusters$Best.nc[1, ])

grpForest <- kmeans(forestfires, centers = 3, nstart = 10)
grpForest$cluster
grpForest$centers
grpForest$withinss
grpForest$size

# Split the data by assigned cluster. Logical subsetting keeps the original
# row order and row names without growing data frames row by row.
clusterresults <- grpForest$cluster
length(clusterresults)
nrow(forestfires)

forestfirescluster1 <- forestfires[clusterresults == 1, ]
forestfirescluster2 <- forestfires[clusterresults == 2, ]
forestfirescluster3 <- forestfires[clusterresults == 3, ]

nrow(forestfirescluster1)
nrow(forestfirescluster2)
nrow(forestfirescluster3)

# Optional (disabled): keep only complete rows in clusters 1 and 3.
# forestfirescluster1 <- forestfirescluster1[complete.cases(forestfirescluster1), ]
# forestfirescluster3 <- forestfirescluster3[complete.cases(forestfirescluster3), ]
# Step 8 - Fit a linear model of log(area) within each cluster ----
library(leaps)

# Model formula shared by all clusters. Column names are resolved through the
# `data` argument of the fitting functions, so the same formula works for
# every cluster data frame.
area_formula <- area ~ DMCDC + tempwind + FFMCDMCDC + XY + XXYY + X + Y +
  FFMC + DMC + DC + ISI + temp + RH + wind + rain +
  factor(month1) + factor(day1)

# Fit the shared linear model on one cluster's data frame.
# na.exclude drops incomplete rows for fitting while fitted() / residuals()
# stay aligned with the rows of `cluster_data`.
fit_area_model <- function(cluster_data, formula = area_formula) {
  lm(formula, data = cluster_data, na.action = na.exclude)
}

# Draw a residuals-vs-fitted scatter plot and a residual histogram for a
# fitted lm. Uses the fitted() / residuals() accessors; an lm object has no
# `residual.values` component.
plot_residual_diagnostics <- function(model) {
  model_residuals <- residuals(model)
  plot(fitted(model), model_residuals)
  hist(model_residuals)
  invisible(model)
}

# Cluster 1
model1 <- fit_area_model(forestfirescluster1)
model1
summary(model1)
plot_residual_diagnostics(model1)

# Best-subset search for cluster 1 (8 best models per subset size).
# Stored under its own name so `model1` remains the lm fit; a regsubsets
# object has no fitted values or residuals, so the lm diagnostics are not
# applied to it.
subsets1 <- regsubsets(
  area_formula,
  data = forestfirescluster1,
  nbest = 8,
  na.action = na.exclude
)
subsets1
summary(subsets1)

# Cluster 2
model2 <- fit_area_model(forestfirescluster2)
model2
summary(model2)
plot_residual_diagnostics(model2)

# Cluster 3
model3 <- fit_area_model(forestfirescluster3)
model3
summary(model3)
plot_residual_diagnostics(model3)