Last active
December 11, 2015 02:21
-
-
Save eellpp/73f73166e0c3e2a38400 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Blank cells in the CSV must be read as missing values, so list "" alongside
# "NA" in na.strings.
train.data <- read.csv("train.csv", na.strings = c("NA", ""))
# Filtering: taking data from a data frame based on a condition.
# Values of the Survived column for the rows where the passenger survived:
train.data[which(train.data$Survived == 1), "Survived"]
# Number of survivors older than 50. which() drops the NA entries that missing
# Age values put into the condition, so only TRUE rows are counted.
length(train.data[which(train.data$Survived == 1 & train.data$Age > 50), "Survived"])
# Alternative without which(): a bare logical index keeps one NA element for
# every NA in the condition and would inflate length(), so count the TRUE
# values with sum() instead.
sum(train.data$Survived == 1 & train.data$Age > 50, na.rm = TRUE)
> length(train.data[which(train.data$Survived == 1 & train.data$Sex == "female"),"Survived"]) | |
[1] 233 | |
> length(train.data[which(train.data$Survived == 1 & train.data$Sex == "male"),"Survived"]) | |
[1] 109 | |
# which | |
> length(which(is.na(train.data$Age))) / length(train.data$Age) | |
[1] 0.1986532 | |
# dataframe is list with some restriction so we can use sapply over dataframe | |
> sapply(train.data,FUN=function(df){length(which(is.na(df))) / length(df)}) | |
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket | |
0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.198653199 0.000000000 0.000000000 0.000000000 | |
Fare Cabin Embarked | |
0.000000000 0.771043771 0.002244669 | |
# imputing values | |
# Embarked has only 2 missing values, so they can be imputed with the most frequent value (S).
> table(train.data$Embarked,useNA="always") | |
C Q S <NA> | |
168 77 644 2 | |
# Assign the plain string "S": it is valid whether Embarked is a factor (S is an
# existing level) or a character column. Assigning as.factor("S") into a
# character column would store the factor's integer code instead of the label.
> train.data$Embarked[is.na(train.data$Embarked)] <- "S"
--- | |
# Generate plots for survival according to sex and fare (which seems intuitive)
# Fare bands shared by both plots; labelled a..g from cheapest to most expensive.
> fare.breaks <- c(0,10,15,30,50,100,200,550)
> df <- train.data[train.data$Sex == "female",c("Fare","Survived")]
# include.lowest = TRUE keeps a fare of exactly 0 in the first band instead of turning it into NA
> farelevels <- cut(df$Fare,fare.breaks,labels = letters[1:7],include.lowest = TRUE)
# table rows are Survived = 0 then 1, so the legend reads Perished, Survived
> barplot(table(df$Survived,farelevels),beside=TRUE,legend.text = c("Perished","Survived"),col=c("red","green"))
> df.males <- train.data[train.data$Sex != "female",c("Fare","Survived")]
> farelevels.m <- cut(df.males$Fare,fare.breaks,labels = letters[1:7],include.lowest = TRUE)
> barplot(table(df.males$Survived,farelevels.m),beside=TRUE,legend.text = c("Perished","Survived"),col=c("red","green"))
#Generating Descriptive Statistics with | |
# histogram split the data by freq into bins | |
# barplot can show the freq of variation of two variables | |
> barplot(table(train.data$Survived),names.arg=c("Perished","Survived")) | |
> plot(iris$Sepal.Length,iris$Sepal.Width,col= iris$Species) | |
> table(iris$Sepal.Length,iris$Sepal.Width) | |
# multiple cross values on same page | |
> par(mfrow=c(2,2)) | |
> plot(SLID$wages ~ SLID$language) | |
> plot(SLID$wages ~ SLID$age) | |
> plot(SLID$wages ~ SLID$education) | |
> plot(SLID$wages ~ SLID$sex) | |
--- | |
# survival rate by sex | |
> xtable <- table(train.data$Survived,train.data$Sex) | |
> barplot(xtable,legend=c("Perished","Survived"),col=c("red","green")) | |
# Draw the age histogram for one group, then overlay the second group on the same axes.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.
> hist(train.data$Age[which(train.data$Survived == "0")],col="blue",add = FALSE)
> hist(train.data$Age[which(train.data$Survived == "1")],col="red",add = TRUE)
# Split the data into a training set and a test set.
set.seed(123)      # fix the RNG state so the same rows are drawn on every run
percentage <- 0.7  # fraction of rows that go into the training set
# Draw the training row numbers at random, without replacement.
training_index <- sample(
  seq_len(nrow(train.data)),
  size = floor(percentage * nrow(train.data))
)
training_data <- train.data[training_index, ]
# Every row that was not drawn forms the test set.
test_data <- train.data[-training_index, ]
# train using decision tree | |
> train.ctree <- ctree(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked,data=training_data) | |
> plot(train.ctree) | |
# from decision tree we can see which combination of attributes are useful for classification | |
> ctree.predict <- predict(train.ctree,test_data) | |
# install caret package | |
# use the confusionmatrix to see the prediction accuracy | |
> confusionMatrix(ctree.predict,test_data$Survived) | |
################## | |
# Predictive Models | |
################## | |
Before applying the algorithms, look at the nature of the data | |
- factors, numbers etc | |
- data cleansing, NA's etc, bad data etc | |
-- read the data cleaning in R book | |
- dependent variables etc | |
after creating the model, check the accuracy etc | |
and check if the model is correct | |
# linear regression example | |
# use linear regression in numeric data prediction etc | |
> data(Quartet) | |
> str(Quartet) | |
'data.frame': 11 obs. of 6 variables: | |
$ x : int 10 8 13 9 11 14 6 4 12 7 ... | |
$ y1: num 8.04 6.95 7.58 8.81 8.33 ... | |
$ y2: num 9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ... | |
$ y3: num 7.46 6.77 12.74 7.11 7.81 ... | |
$ x4: int 8 8 8 8 8 8 8 19 8 8 ... | |
$ y4: num 6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ... | |
> plot(Quartet$x,Quartet$y1)
> lmfit <- lm(y1 ~ x, Quartet)
> abline(lmfit,col = "red")
# summarise the fitted model object (lmfit), not the lm function itself
> summary(lmfit)
#TODO: understand confint | |
> confint(lmfit, level = 0.95) | |
2.5 % 97.5 % | |
(Intercept) 0.4557369 5.5444449 | |
x 0.2333701 0.7668117 | |
# Logistic regression
## Best suited to Yes/No (binary) outcomes
model <- glm(
  Survived ~ Age + Sex,
  family = "binomial",
  data = training_data
)
# type = "response" asks for predictions on the response scale
pred <- predict(model, newdata = test_data, type = "response")
# label an observation 1 when its prediction exceeds 0.5, otherwise 0
Class <- ifelse(pred > 0.5, 1, 0)
#Regression Diagnostic | |
> par(mfrow=c(2,2)) | |
> plot(lmfit) | |
# Create the confusion matrix to get the accuracy.
# confusionMatrix() reads a table as predictions in rows and reference (true)
# values in columns, so the predicted classes go first.
ctb <- table(Class, test_data$Survived)
confusionMatrix(ctb)
# Naive Bayes Classification
> model <- naiveBayes(Survived ~ Age + Sex, training_data)
# the observations to score are passed through `newdata`; `data` is not a predict() argument
> pred <- predict(model,newdata = test_data)
# predictions first, true values second, as confusionMatrix() expects for a table
> confusionMatrix(table(pred,test_data$Survived))
# SVM Classification
> model <- svm(Survived ~ Age + Sex,training_data)
# Replace missing Age values in the held-out set (test_data, as used by the other models) with a fixed fill value.
# NOTE(review): 28 is hard-coded; TODO derive it from training_data$Age and apply the same fill to the training set.
> test_data$Age[is.na(test_data$Age)] <- 28
> pred <- predict(model,newdata = test_data)
> pred_class <- ifelse(pred == 0 , 0 ,1)
> confusionMatrix(table(pred_class,test_data$Survived))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment