# HarshSingh16/Surviving Titanic.R

## Surviving Titanic.R
###### Load the Titanic training data
# NOTE(review): `train1` and `test11` are not created anywhere in the visible
# script, so they presumably have to exist in the workspace already -- confirm.
TitanicTrain <- train1

###### Count the missing values in each column of the training data
vapply(TitanicTrain, function(column) sum(is.na(column)), integer(1))

###### Load the Titanic test data
TitanicTest <- test11

###### Count the missing values in each column of the test data
vapply(TitanicTest, function(column) sum(is.na(column)), integer(1))

###### Give the test data an all-NA Survived column
# With this column added, both data sets can be stacked with rbind().
TitanicTest[["Survived"]] <- NA

###### Stack training and test rows into one data set
# From here on TitanicTrain holds the rows of both sets.
TitanicTrain <- rbind(
  TitanicTrain,
  TitanicTest
)

###### Encode Sex as a factor
TitanicTrain[["Sex"]] <- as.factor(TitanicTrain[["Sex"]])

###### Extract each passenger's title from the Name column
# The pattern deletes any run of text ending in ", " and any "." together with
# everything after it, so only the text between them remains.
TitanicTrain[["Title"]] <- gsub(
  pattern = '(.*, )|(\\..*)',
  replacement = '',
  x = TitanicTrain[["Name"]]
)

###### Impute missing Age values with the median age
# The median is computed over the non-missing ages and written directly into
# the numeric column; Age is never converted to character, so it stays numeric
# for everything that follows.
ROWS <- which(is.na(TitanicTrain$Age))
MedianAge <- median(TitanicTrain$Age, na.rm = TRUE)
TitanicTrain$Age[ROWS] <- MedianAge


###### Impute missing Embarked values with "S"
# Empty strings are counted as missing alongside NA, so a blank port is filled
# in as well instead of surviving as its own category.
# NOTE(review): "S" is hard-coded; presumably the most common port -- confirm.
ROWS2 <- which(is.na(TitanicTrain$Embarked) | TitanicTrain$Embarked == "")
TitanicTrain$Embarked[ROWS2] <- "S"

###### Impute missing Fare values with a fixed fare
# NOTE(review): 14.45 is hard-coded; presumably close to the median fare --
# confirm against the data.
Rows3 <- which(is.na(TitanicTrain$Fare))
TitanicTrain$Fare[Rows3] <- 14.45

###### Coerce the columns to their intended types
# Age is stored as a number.
TitanicTrain[["Age"]] <- as.numeric(TitanicTrain[["Age"]])
# All of the following columns are stored as factors.
TitanicTrain[["Survived"]] <- as.factor(TitanicTrain[["Survived"]])
TitanicTrain[["PassengerId"]] <- as.factor(TitanicTrain[["PassengerId"]])
TitanicTrain[["Pclass"]] <- as.factor(TitanicTrain[["Pclass"]])
TitanicTrain[["SibSp"]] <- as.factor(TitanicTrain[["SibSp"]])
TitanicTrain[["Parch"]] <- as.factor(TitanicTrain[["Parch"]])
TitanicTrain[["Embarked"]] <- as.factor(TitanicTrain[["Embarked"]])
TitanicTrain[["Title"]] <- as.factor(TitanicTrain[["Title"]])

###### Drop the columns that are not used for modelling
# Ticket and Cabin: per the original author, these have a large share of
# missing values and add nothing to the model.
TitanicTrain[["Ticket"]] <- NULL
TitanicTrain[["Cabin"]] <- NULL
# Name: no longer needed because Title has already been extracted from it.
TitanicTrain[["Name"]] <- NULL


###### Rebuild the prepared data as a plain data frame
TitanicTrain <- data.frame(TitanicTrain)

###### Split the combined data back into the labelled and unlabelled rows
# PassengerId may be a factor at this point, and as.numeric() on a factor
# returns the internal level codes rather than the labels. Going through
# as.character() recovers the real ids, so the 892 cut-off is compared against
# actual PassengerId values.
TitanicTrain$PassengerId <- as.numeric(as.character(TitanicTrain$PassengerId))
Train <- subset(TitanicTrain, PassengerId < 892)
Test <- subset(TitanicTrain, PassengerId >= 892)

###### Separate the response ("Survived") from the predictors
# Survived is a factor; its level codes start at 1, so subtracting 1 gives 0
# for the first level and 1 for the second.
# NOTE(review): assumes the levels are ordered "0", "1" -- confirm.
y <- as.numeric(Train$Survived) - 1
# Predictors are selected by name (every column except the id and the
# response), the same rule that defines xtest below, so both frames end up
# with the same columns regardless of column position.
predictor_names <- setdiff(names(Train), c("PassengerId", "Survived"))
x <- data.frame(Train[, predictor_names, drop = FALSE])
Test$Survived <- NULL
xtest <- data.frame(Test)
xtest$PassengerId <- NULL

###### Inspect the structure of the response and both predictor frames
str(y)
str(x)
str(xtest)


###### Attach the SuperLearner package
library(SuperLearner)

###### Fit the ensemble model
# The response y is modelled from the predictors x with a binomial family; the
# candidate learners are the six wrappers named in SL.library.
single.model2 <- SuperLearner(
  Y = y,
  X = x,
  family = binomial(),
  SL.library = list(
    "SL.ranger", "SL.ksvm", "SL.gbm",
    "SL.xgboost", "SL.glmnet", "SL.randomForest"
  )
)

###### Print the fitted object
# The original author uses this output to inspect the risk estimates.
print(single.model2)


###### Score the unlabelled rows with the fitted ensemble
predictions3 <- predict(single.model2, newdata = xtest)
###### Plot the frequency distribution of the predicted values
hist(predictions3$pred)

###### Turn the predicted values into 0/1 labels
# Values of at least 0.73 become 1; everything below becomes 0.
predictions4 <- ifelse(
  predictions3$pred >= 0.73,
  1,
  0
)


###### Write the labels to a CSV file
write.csv(predictions4, file = "Predictions.csv")
	###### Load the Titanic training data
	# NOTE(review): `train1` and `test11` are not created anywhere in the visible
	# script, so they presumably have to exist in the workspace already -- confirm.
	TitanicTrain <- train1

	###### Count the missing values in each column of the training data
	vapply(TitanicTrain, function(column) sum(is.na(column)), integer(1))

	###### Load the Titanic test data
	TitanicTest <- test11

	###### Count the missing values in each column of the test data
	vapply(TitanicTest, function(column) sum(is.na(column)), integer(1))

	###### Give the test data an all-NA Survived column
	# With this column added, both data sets can be stacked with rbind().
	TitanicTest[["Survived"]] <- NA

	###### Stack training and test rows into one data set
	# From here on TitanicTrain holds the rows of both sets.
	TitanicTrain <- rbind(
	  TitanicTrain,
	  TitanicTest
	)

	###### Encode Sex as a factor
	TitanicTrain$Sex <- as.factor(TitanicTrain$Sex)

	###### Extract each passenger's title from the Name column
	# The pattern deletes any run of text ending in ", " and any "." together with
	# everything after it, so only the text between them remains. The alternation
	# is written as a bare "|": "\|" is not a valid escape in an R string and
	# stops the script from parsing.
	TitanicTrain$Title <- gsub('(.*, )|(\\..*)', '', TitanicTrain$Name)

	###### Impute missing Age values with the median age
	# The median is computed over the non-missing ages and written directly into
	# the numeric column; Age is never converted to character, so it stays numeric
	# for everything that follows.
	ROWS <- which(is.na(TitanicTrain$Age))
	MedianAge <- median(TitanicTrain$Age, na.rm = TRUE)
	TitanicTrain$Age[ROWS] <- MedianAge


	###### Impute missing Embarked values with "S"
	# Empty strings are counted as missing alongside NA, so a blank port is filled
	# in as well instead of surviving as its own category.
	# NOTE(review): "S" is hard-coded; presumably the most common port -- confirm.
	ROWS2 <- which(is.na(TitanicTrain$Embarked) | TitanicTrain$Embarked == "")
	TitanicTrain$Embarked[ROWS2] <- "S"

	###### Impute missing Fare values with a fixed fare
	# NOTE(review): 14.45 is hard-coded; presumably close to the median fare --
	# confirm against the data.
	Rows3 <- which(is.na(TitanicTrain$Fare))
	TitanicTrain$Fare[Rows3] <- 14.45

	###### Coerce the columns to their intended types
	# Age is stored as a number.
	TitanicTrain[["Age"]] <- as.numeric(TitanicTrain[["Age"]])
	# All of the following columns are stored as factors.
	TitanicTrain[["Survived"]] <- as.factor(TitanicTrain[["Survived"]])
	TitanicTrain[["PassengerId"]] <- as.factor(TitanicTrain[["PassengerId"]])
	TitanicTrain[["Pclass"]] <- as.factor(TitanicTrain[["Pclass"]])
	TitanicTrain[["SibSp"]] <- as.factor(TitanicTrain[["SibSp"]])
	TitanicTrain[["Parch"]] <- as.factor(TitanicTrain[["Parch"]])
	TitanicTrain[["Embarked"]] <- as.factor(TitanicTrain[["Embarked"]])
	TitanicTrain[["Title"]] <- as.factor(TitanicTrain[["Title"]])

	###### Drop the columns that are not used for modelling
	# Ticket and Cabin: per the original author, these have a large share of
	# missing values and add nothing to the model.
	TitanicTrain[["Ticket"]] <- NULL
	TitanicTrain[["Cabin"]] <- NULL
	# Name: no longer needed because Title has already been extracted from it.
	TitanicTrain[["Name"]] <- NULL


	###### Rebuild the prepared data as a plain data frame
	TitanicTrain <- data.frame(TitanicTrain)

	###### Split the combined data back into the labelled and unlabelled rows
	# PassengerId may be a factor at this point, and as.numeric() on a factor
	# returns the internal level codes rather than the labels. Going through
	# as.character() recovers the real ids, so the 892 cut-off is compared against
	# actual PassengerId values.
	TitanicTrain$PassengerId <- as.numeric(as.character(TitanicTrain$PassengerId))
	Train <- subset(TitanicTrain, PassengerId < 892)
	Test <- subset(TitanicTrain, PassengerId >= 892)

	###### Separate the response ("Survived") from the predictors
	# Survived is a factor; its level codes start at 1, so subtracting 1 gives 0
	# for the first level and 1 for the second.
	# NOTE(review): assumes the levels are ordered "0", "1" -- confirm.
	y <- as.numeric(Train$Survived) - 1
	# Predictors are selected by name (every column except the id and the
	# response), the same rule that defines xtest below, so both frames end up
	# with the same columns regardless of column position.
	predictor_names <- setdiff(names(Train), c("PassengerId", "Survived"))
	x <- data.frame(Train[, predictor_names, drop = FALSE])
	Test$Survived <- NULL
	xtest <- data.frame(Test)
	xtest$PassengerId <- NULL

	###### Inspect the structure of the response and both predictor frames
	str(y)
	str(x)
	str(xtest)


	###### Attach the SuperLearner package
	library(SuperLearner)

	###### Fit the ensemble model
	# The response y is modelled from the predictors x with a binomial family; the
	# candidate learners are the six wrappers named in SL.library.
	single.model2 <- SuperLearner(
	  Y = y,
	  X = x,
	  family = binomial(),
	  SL.library = list(
	    "SL.ranger", "SL.ksvm", "SL.gbm",
	    "SL.xgboost", "SL.glmnet", "SL.randomForest"
	  )
	)

	###### Print the fitted object
	# The original author uses this output to inspect the risk estimates.
	print(single.model2)


	###### Score the unlabelled rows with the fitted ensemble
	predictions3 <- predict(single.model2, newdata = xtest)
	###### Plot the frequency distribution of the predicted values
	hist(predictions3$pred)

	###### Turn the predicted values into 0/1 labels
	# Values of at least 0.73 become 1; everything below becomes 0.
	predictions4 <- ifelse(
	  predictions3$pred >= 0.73,
	  1,
	  0
	)


	###### Write the labels to a CSV file
	write.csv(predictions4, file = "Predictions.csv")