@PirateGrunt
Forked from raleighlinda/titanic_lm.R
Created February 22, 2013 15:49
#File created 1/31/13
#contains R code to
#-read in Kaggle Competition Titanic Data csv file
#-create a simple logistic regression model
#-make predictions on training and test data
#-write out test predictions to csv file
#
#Replace <your path here> with the full path to your copies of the train and test csv files.
###################################################################################
#create a Kaggle account http://www.kaggle.com/account/register
#read and agree to the rules if you choose to continue
#enter the Kaggle Titanic Competition http://www.kaggle.com/c/titanic-gettingStarted
#download train.csv and test.csv
#download and install R from http://www.r-project.org/
#you will have to choose a 'mirror' (download site), usually a university or research site
#read the training data into a dataframe called train
train <- read.table("C:/Users/<your path here>/train.csv",
                    header = TRUE, sep = ",")
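#optional: take a quick look at the structure and first few rows of the training data
str(train)
head(train)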
#set pclass, the passenger class, to be an ordered categorical variable
train$pclass <- factor(train$pclass, levels = c(3, 2, 1), ordered = TRUE)
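#optional check: pclass should now be an ordered factor with levels 3 < 2 < 1
levels(train$pclass)
is.ordered(train$pclass)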
#create a truth vector of survival results from training
S <- train$survived == 1
#read the test data into a dataframe named test
test <- read.table("C:/Users/<your path here>/test.csv",
                   header = TRUE, sep = ",")
#pclass is categorical for test data also
test$pclass <- factor(test$pclass, levels = c(3, 2, 1), ordered = TRUE)
#create a super simple logistic regression model with the training data
#predicting survival based on passenger class and sex
logistic.model <- glm(survived ~ pclass + sex, family = binomial(), data=train)
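#optional: inspect the fitted coefficients and their standard errors
summary(logistic.model)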
#generate predictions for training data using the predict method of the logistic model
training_predictions <- predict(logistic.model, type = "response")
#compute the training error, using a probability cutoff of 0.5 to map predictions to {0,1}
training_error <- sum((training_predictions >= 0.5) != S)/nrow(train)
#view the training error and the corresponding training accuracy
training_error
1 - training_error
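#optional: cross-tabulate predicted vs. actual survival on the training data at the same 0.5 cutoff
table(predicted = training_predictions >= 0.5, actual = S)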
#generate predicted survival probabilities for the test data
test_predictions <- predict(logistic.model, test, type = "response")
#using a probability cutoff of 0.5 for outcome of survived, default missing to deceased
test_predictions[test_predictions >= 0.5] <- 1
test_predictions[test_predictions != 1] <- 0
test_predictions[is.na(test_predictions)] <- 0
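#optional sanity checks before writing the file: one prediction per test passenger, and the 0/1 split
length(test_predictions) == nrow(test)
table(test_predictions)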
#write out the test predictions to a comma separated value (csv) file
write.table(test_predictions, "C:/Users/<your path here>/predictions.csv", col.names = F, row.names = F, quote = FALSE)
#submit your predictions.csv file to Kaggle.com to view the resulting test data score