# In this case the blank cells are to be marked as NA, so we have to tell R
train.data = read.csv("train.csv", na.strings=c("NA", ""))
# selecting rows from a data frame based on a condition: filtering data
train.data[which(train.data$Survived == 1),"Survived"]
length(train.data[which(train.data$Survived == 1 & train.data$Age > 50),"Survived"])
# OR
length(train.data$Survived[train.data$Survived == 1 & train.data$Age > 50])
> length(train.data[which(train.data$Survived == 1 & train.data$Sex == "female"),"Survived"])
[1] 233
> length(train.data[which(train.data$Survived == 1 & train.data$Sex == "male"),"Survived"])
[1] 109
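# The same filters can be written with base R's subset(), which avoids
# repeating the data frame name (a sketch, not part of the original notes):
nrow(subset(train.data, Survived == 1 & Sex == "female"))   # 233
nrow(subset(train.data, Survived == 1 & Sex == "male"))     # 109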
# which: fraction of Age values that are NA
> length(which(is.na(train.data$Age))) / length(train.data$Age)
[1] 0.1986532
# a data frame is a list with some restrictions, so we can use sapply over it
> sapply(train.data,FUN=function(df){length(which(is.na(df))) / length(df)})
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket
0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.198653199 0.000000000 0.000000000 0.000000000
Fare Cabin Embarked
0.000000000 0.771043771 0.002244669
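# A shorter equivalent (sketch): is.na() on a data frame yields a logical
# matrix, so colMeans gives the NA fraction per column directly
colMeans(is.na(train.data))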
# imputing values
# Since Embarked has only a few NAs, they can be imputed
> table(train.data$Embarked,useNA="always")
C Q S <NA>
168 77 644 2
> train.data$Embarked[which(is.na(train.data$Embarked))] <- "S"
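# The same imputation written generically, so it works for any factor column
# (a sketch; impute_mode is a hypothetical helper, not from the original notes):
impute_mode <- function(x) {
  x[is.na(x)] <- names(which.max(table(x)))   # most frequent level
  x
}
train.data$Embarked <- impute_mode(train.data$Embarked)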
---
# Generate plots for survival according to sex and fare (which seems intuitive)
> df <- train.data[train.data$Sex == "female",c("Fare","Survived")]
> farelevels <- cut(df$Fare, c(0,10,15,30,50,100,200,550), labels = letters[1:7])
> barplot(table(df$Survived,farelevels),beside=TRUE,legend=c("Perished","Survived"),col=c("red","green"))
> df.males <- train.data[train.data$Sex != "female",c("Fare","Survived")]
> farelevels.m <- cut(df.males$Fare, c(0,10,15,30,50,100,200,550), labels = letters[1:7])
> barplot(table(df.males$Survived,farelevels.m),beside=TRUE,legend=c("Perished","Survived"),col=c("red","green"))
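# A mosaic plot shows the same survival-by-fare relationship without manual
# bar placement (a sketch reusing the farelevels built above):
mosaicplot(table(farelevels, df$Survived), color = c("red", "green"),
           xlab = "fare band", ylab = "survived")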
# Generating descriptive statistics
# a histogram splits one variable into frequency bins
# a barplot can show frequencies, or the cross-tabulation of two variables
> barplot(table(train.data$Survived),names.arg=c("Perished","Survived"))
> plot(iris$Sepal.Length,iris$Sepal.Width,col= iris$Species)
> table(iris$Sepal.Length,iris$Sepal.Width)
# multiple plots on the same page
> library(car)   # the SLID data set ships with the car package
> data(SLID)
> par(mfrow=c(2,2))
> plot(SLID$wages ~ SLID$language)
> plot(SLID$wages ~ SLID$age)
> plot(SLID$wages ~ SLID$education)
> plot(SLID$wages ~ SLID$sex)
---
# survival rate by sex
> xtable <- table(train.data$Survived,train.data$Sex)
> barplot(xtable,legend=c("Perished","Survived"),col=c("red","green"))
# make a histogram and overlay a second one on top of the first
> hist(train.data$Age[which(train.data$Survived == 0)],col="blue")
> hist(train.data$Age[which(train.data$Survived == 1)],col="red",add = TRUE)
# Using R to split the data into training and test sets
percentage <- 0.7
set.seed(123)
training_index <- sample(seq_len(nrow(train.data)),floor(percentage * nrow(train.data)))
training_data <- train.data[training_index,]
test_data <- train.data[-training_index,]
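# caret can do the same split stratified on the outcome, which keeps the
# survival rate similar in both sets (sketch; assumes caret is installed):
library(caret)
set.seed(123)
idx <- createDataPartition(train.data$Survived, p = 0.7, list = FALSE)
training_data <- train.data[idx, ]
test_data <- train.data[-idx, ]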
# train a decision tree (ctree comes from the party package)
> library(party)
# a factor response makes ctree fit a classification tree
> training_data$Survived <- as.factor(training_data$Survived)
> train.ctree <- ctree(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked,data=training_data)
> plot(train.ctree)
# from the decision tree we can see which combinations of attributes are useful for classification
> ctree.predict <- predict(train.ctree,test_data)
# the caret package provides confusionMatrix to assess prediction accuracy
> library(caret)
> confusionMatrix(ctree.predict, as.factor(test_data$Survived))
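# Overall accuracy can also be read straight off the contingency table (sketch):
tb <- table(ctree.predict, test_data$Survived)
sum(diag(tb)) / sum(tb)   # correct predictions / all predictions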
##################
# Predictive Models
##################
Before applying any algorithm, look at the nature of the data (see the sketch after this list):
- variable types: factors, numbers, etc.
- data cleansing: NAs, bad values, etc.
-- read the data-cleaning-in-R book
- which variables are dependent and which are predictors
After creating the model, check its accuracy and verify that the model is reasonable.
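# A quick first look along those lines (sketch):
str(train.data)                                   # variable types
summary(train.data)                               # ranges, obvious bad values
sapply(train.data, function(col) sum(is.na(col))) # NA counts per column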
# linear regression example
# use linear regression for predicting numeric outcomes
> library(car)   # the Quartet (Anscombe) data set ships with the car package
> data(Quartet)
> str(Quartet)
'data.frame': 11 obs. of 6 variables:
$ x : int 10 8 13 9 11 14 6 4 12 7 ...
$ y1: num 8.04 6.95 7.58 8.81 8.33 ...
$ y2: num 9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ...
$ y3: num 7.46 6.77 12.74 7.11 7.81 ...
$ x4: int 8 8 8 8 8 8 8 19 8 8 ...
$ y4: num 6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ...
> plot(Quartet$x,Quartet$y1)
> lmfit <- lm(y1 ~ x, Quartet)
> abline(lmfit,col = "red")
> summary(lmfit)
# confint gives confidence intervals for the fitted coefficients
> confint(lmfit, level = 0.95)
2.5 % 97.5 %
(Intercept) 0.4557369 5.5444449
x 0.2333701 0.7668117
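# The same intervals can be reproduced from the coefficient table, which is
# what confint does under the hood (sketch): estimate +/- t-quantile * std. error
est <- coef(summary(lmfit))              # Estimate and Std. Error columns
tq  <- qt(0.975, df.residual(lmfit))     # t quantile for a 95% interval
cbind(lower = est[, 1] - tq * est[, 2],
      upper = est[, 1] + tq * est[, 2])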
# logistic regression
## best for binary (yes/no) outcomes
model <- glm(Survived ~ Age + Sex , data = training_data, family = "binomial")
pred <- predict(model,test_data,type = "response")
Class <- ifelse(pred > 0.5,1,0)
# regression diagnostics (residual plots for the linear model above)
> par(mfrow=c(2,2))
> plot(lmfit)
# build a contingency table and turn it into a confusion matrix to get the accuracy
ctb <- table(test_data$Survived,Class)
confusionMatrix(ctb)
# Naive Bayes classification (naiveBayes is in the e1071 package)
> library(e1071)
> model <- naiveBayes(Survived ~ Age + Sex, training_data)
> pred <- predict(model, test_data)   # the test set is the newdata argument (second positional)
> confusionMatrix(table(test_data$Survived,pred))
# SVM classification (svm is also in the e1071 package)
> model = svm(Survived ~ Age + Sex, training_data)
# replace NA ages with the mean value; the same imputation should be done on the training set too
> test_data$Age[which(is.na(test_data$Age))] = 28
# with a factor response svm fits a classifier, so predict returns class labels directly
> pred = predict(model, newdata = test_data)
> confusionMatrix(table(pred, test_data$Survived))
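# The note above says the imputation should be done in training also; with
# that in place, e1071's tune.svm can grid-search the cost parameter
# (a sketch; the candidate values are arbitrary):
training_data$Age[is.na(training_data$Age)] <- 28
tuned <- tune.svm(Survived ~ Age + Sex, data = training_data, cost = c(0.1, 1, 10))
summary(tuned)             # cross-validated error for each cost value
model <- tuned$best.model  # refit with the best cost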