# In this case the blank cells are to be marked as NA, so we have to tell R
train.data = read.csv("train.csv", na.strings=c("NA", ""))
# selecting rows from a data frame based on a condition: filtering data
train.data[which(train.data$Survived == 1),"Survived"]
length(train.data[which(train.data$Survived == 1 & train.data$Age > 50),"Survived"])
# OR
length(train.data$Survived[train.data$Survived == 1 & train.data$Age > 50])
> length(train.data[which(train.data$Survived == 1 & train.data$Sex == "female"),"Survived"])
[1] 233
> length(train.data[which(train.data$Survived == 1 & train.data$Sex == "male"),"Survived"])
[1] 109
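# The same filters can be written with base R's subset(), which avoids
# repeating the data frame name (a sketch, not part of the original notes):
nrow(subset(train.data, Survived == 1 & Sex == "female"))   # 233
nrow(subset(train.data, Survived == 1 & Sex == "male"))     # 109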
# which: fraction of Age values that are NA
> length(which(is.na(train.data$Age))) / length(train.data$Age)
[1] 0.1986532
# a data frame is a list with some restrictions, so we can use sapply over it
> sapply(train.data,FUN=function(df){length(which(is.na(df))) / length(df)})
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket
0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.198653199 0.000000000 0.000000000 0.000000000
Fare Cabin Embarked
0.000000000 0.771043771 0.002244669
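# A shorter equivalent (sketch): is.na() on a data frame yields a logical
# matrix, so colMeans gives the NA fraction per column directly
colMeans(is.na(train.data))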
# imputing values
# Since Embarked has only a few NAs, they can be imputed
> table(train.data$Embarked,useNA="always")
C Q S <NA>
168 77 644 2
> train.data$Embarked[which(is.na(train.data$Embarked))] <- "S"
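# The same imputation written generically, so it works for any factor column
# (a sketch; impute_mode is a hypothetical helper, not from the original notes):
impute_mode <- function(x) {
  x[is.na(x)] <- names(which.max(table(x)))   # most frequent level
  x
}
train.data$Embarked <- impute_mode(train.data$Embarked)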
---
# Generate plots for survival according to sex and fare (which seems intuitive)
> df <- train.data[train.data$Sex == "female",c("Fare","Survived")]
> farelevels <- cut(df$Fare, c(0,10,15,30,50,100,200,550), labels = letters[1:7])
> barplot(table(df$Survived,farelevels),beside=TRUE,legend=c("Perished","Survived"),col=c("red","green"))
> df.males <- train.data[train.data$Sex != "female",c("Fare","Survived")]
> farelevels.m <- cut(df.males$Fare, c(0,10,15,30,50,100,200,550), labels = letters[1:7])
> barplot(table(df.males$Survived,farelevels.m),beside=TRUE,legend=c("Perished","Survived"),col=c("red","green"))
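# A mosaic plot shows the same survival-by-fare relationship without manual
# bar placement (a sketch reusing the farelevels built above):
mosaicplot(table(farelevels, df$Survived), color = c("red", "green"),
           xlab = "fare band", ylab = "survived")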
# Generating descriptive statistics
# a histogram splits one variable into frequency bins
# a barplot can show frequencies, or the cross-tabulation of two variables
> barplot(table(train.data$Survived),names.arg=c("Perished","Survived"))
> plot(iris$Sepal.Length,iris$Sepal.Width,col= iris$Species)
> table(iris$Sepal.Length,iris$Sepal.Width)
# multiple plots on the same page
> library(car)   # the SLID data set ships with the car package
> data(SLID)
> par(mfrow=c(2,2))
> plot(SLID$wages ~ SLID$language)
> plot(SLID$wages ~ SLID$age)
> plot(SLID$wages ~ SLID$education)
> plot(SLID$wages ~ SLID$sex)
---
# survival rate by sex
> xtable <- table(train.data$Survived,train.data$Sex)
> barplot(xtable,legend=c("Perished","Survived"),col=c("red","green"))
# make a histogram and overlay a second one on top of the first
> hist(train.data$Age[which(train.data$Survived == 0)],col="blue")
> hist(train.data$Age[which(train.data$Survived == 1)],col="red",add = TRUE)
# Using R to split the data into training and test sets
percentage <- 0.7
set.seed(123)
training_index <- sample(seq_len(nrow(train.data)),floor(percentage * nrow(train.data)))
training_data <- train.data[training_index,]
test_data <- train.data[-training_index,]
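# caret can do the same split stratified on the outcome, which keeps the
# survival rate similar in both sets (sketch; assumes caret is installed):
library(caret)
set.seed(123)
idx <- createDataPartition(train.data$Survived, p = 0.7, list = FALSE)
training_data <- train.data[idx, ]
test_data <- train.data[-idx, ]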
# train a decision tree (ctree comes from the party package)
> library(party)
# a factor response makes ctree fit a classification tree
> training_data$Survived <- as.factor(training_data$Survived)
> train.ctree <- ctree(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked,data=training_data)
> plot(train.ctree)
# from the decision tree we can see which combinations of attributes are useful for classification
> ctree.predict <- predict(train.ctree,test_data)
# the caret package provides confusionMatrix to assess prediction accuracy
> library(caret)
> confusionMatrix(ctree.predict, as.factor(test_data$Survived))
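# Overall accuracy can also be read straight off the contingency table (sketch):
tb <- table(ctree.predict, test_data$Survived)
sum(diag(tb)) / sum(tb)   # correct predictions / all predictions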
##################
# Predictive Models
##################
Before applying any algorithm, look at the nature of the data (see the sketch after this list):
- variable types: factors, numbers, etc.
- data cleansing: NAs, bad values, etc.
-- read the data-cleaning-in-R book
- which variables are dependent and which are predictors
After creating the model, check its accuracy and verify that the model is reasonable.
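# A quick first look along those lines (sketch):
str(train.data)                                   # variable types
summary(train.data)                               # ranges, obvious bad values
sapply(train.data, function(col) sum(is.na(col))) # NA counts per column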
# linear regression example
# use linear regression for predicting numeric outcomes
> library(car)   # the Quartet (Anscombe) data set ships with the car package
> data(Quartet)
> str(Quartet)
'data.frame': 11 obs. of 6 variables:
$ x : int 10 8 13 9 11 14 6 4 12 7 ...
$ y1: num 8.04 6.95 7.58 8.81 8.33 ...
$ y2: num 9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ...
$ y3: num 7.46 6.77 12.74 7.11 7.81 ...
$ x4: int 8 8 8 8 8 8 8 19 8 8 ...
$ y4: num 6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ...
> plot(Quartet$x,Quartet$y1)
> lmfit <- lm(y1 ~ x, Quartet)
> abline(lmfit,col = "red")
> summary(lmfit)
# confint gives confidence intervals for the fitted coefficients
> confint(lmfit, level = 0.95)
2.5 % 97.5 %
(Intercept) 0.4557369 5.5444449
x 0.2333701 0.7668117
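# The same intervals can be reproduced from the coefficient table, which is
# what confint does under the hood (sketch): estimate +/- t-quantile * std. error
est <- coef(summary(lmfit))              # Estimate and Std. Error columns
tq  <- qt(0.975, df.residual(lmfit))     # t quantile for a 95% interval
cbind(lower = est[, 1] - tq * est[, 2],
      upper = est[, 1] + tq * est[, 2])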
# logistic regression
## best for binary (yes/no) outcomes
model <- glm(Survived ~ Age + Sex , data = training_data, family = "binomial")
pred <- predict(model,test_data,type = "response")
Class <- ifelse(pred > 0.5,1,0)
# regression diagnostics (residual plots for the linear model above)
> par(mfrow=c(2,2))
> plot(lmfit)
# build a contingency table and turn it into a confusion matrix to get the accuracy
ctb <- table(test_data$Survived,Class)
confusionMatrix(ctb)
# Naive Bayes classification (naiveBayes is in the e1071 package)
> library(e1071)
> model <- naiveBayes(Survived ~ Age + Sex, training_data)
> pred <- predict(model, test_data)   # the test set is the newdata argument (second positional)
> confusionMatrix(table(test_data$Survived,pred))
# SVM classification (svm is also in the e1071 package)
> model = svm(Survived ~ Age + Sex, training_data)
# replace NA ages with the mean value; the same imputation should be done on the training set too
> test_data$Age[which(is.na(test_data$Age))] = 28
# with a factor response svm fits a classifier, so predict returns class labels directly
> pred = predict(model, newdata = test_data)
> confusionMatrix(table(pred, test_data$Survived))
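# The note above says the imputation should be done in training also; with
# that in place, e1071's tune.svm can grid-search the cost parameter
# (a sketch; the candidate values are arbitrary):
training_data$Age[is.na(training_data$Age)] <- 28
tuned <- tune.svm(Survived ~ Age + Sex, data = training_data, cost = c(0.1, 1, 10))
summary(tuned)             # cross-validated error for each cost value
model <- tuned$best.model  # refit with the best cost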