# Orhan Yalcin ogyalcin

## clean_traindata.py
# Build the feature matrix `X` and the target vector `y` from train.csv.
import pandas as pd

train = pd.read_csv("train.csv")  # load the training data from the working directory
# 'Cabin' has a lot of null values, so drop the column first; otherwise the
# dropna() below would discard every row with an empty cabin.
train = train.drop(columns=['Cabin'])
train = train.dropna()  # delete the remaining rows that contain empty values
y = train['Survived']  # target: the column representing survival
# Drop the target and the columns not used as features. `columns=` replaces the
# positional axis argument (`drop(labels, 1)`), which pandas 2.0 no longer
# accepts, and the result is assigned back instead of binding None to X.
train = train.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket'])
X = pd.get_dummies(train)  # convert non-numerical variables to dummy variables

## train_model.py
# Fit a decision tree on the prepared features `X` and labels `y`.
from sklearn import tree

# fit() returns the estimator itself, so the fitted model can be bound directly.
dtc = tree.DecisionTreeClassifier().fit(X, y)

## clean_testdata.py
# Prepare test.csv as model input, keeping the ids for the submission file.
test = pd.read_csv("test.csv")  # load the testing data
ids = test[['PassengerId']]  # one-column frame kept aside for the submission file
# Drop the columns not used as features. `columns=` replaces the positional
# axis argument (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
# Fill (instead of drop) empty cells so the row count stays exactly what the
# submission requires; every missing value is replaced by the constant 2.
test = test.fillna(2)
test = pd.get_dummies(test)  # convert non-numerical variables to dummy variables

## make_predictions.py
# Run the classifier `dtc` on the `test` feature frame (both are expected to be
# prepared by the earlier sections) and keep the predicted labels in
# `predictions` for the submission step.
predictions = dtc.predict(test)

## save_to_file.py
# Attach the predictions to a copy of the id frame, leaving `ids` untouched.
results = ids.copy()
results['Survived'] = predictions
# Write the final dataset to a csv file, without the index column.
results.to_csv("titanic-results.csv", index=False)

## combined_df.py
# Stack the train and test passengers into one frame for joint inspection.
import pandas as pd

test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
# 'Survived' is removed from the train part before the two frames are
# concatenated row-wise; the original row indexes are kept as they are.
combined = pd.concat([train.drop(columns='Survived'), test])

## null_values_heatmap.py
#For iPython
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.figure(figsize=(10,25))
sns.heatmap(combined.isnull(),cmap="viridis",yticklabels=False,cbar=False)

## combined_info.py
# Print pandas' summary of `combined` (columns, non-null counts, dtypes) to see
# which columns contain missing values.
combined.info()

## readcsv.py
# Reload both raw datasets from the working directory.
import numpy as np
import pandas as pd

train, test = map(pd.read_csv, ("train.csv", "test.csv"))

## prep_train.py
# Impute, recode and one-hot encode the training data held in `train`.

# Impute missing ages with the median age. The filled column is assigned back
# rather than calling fillna(inplace=True) on train['Age']: the in-place call
# acts on a selected column object and does not update `train` under pandas
# Copy-on-Write.
train['Age'] = train['Age'].fillna(train['Age'].median())
# Impute missing ports with the most frequent one (value_counts() sorts by
# descending frequency, so index[0] is the most common value).
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].value_counts().index[0])
d = {1: '1st', 2: '2nd', 3: '3rd'}  # passenger class 1,2,3 -> 1st,2nd,3rd
train['Pclass'] = train['Pclass'].map(d)  # map the column based on the dictionary
# Drop unnecessary columns. `columns=` replaces the positional axis argument
# (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
train = train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
categorical_vars = train[['Pclass', 'Sex', 'Embarked']]  # columns to one-hot encode
dummies = pd.get_dummies(categorical_vars, drop_first=True)
# Drop the original categorical variables to avoid duplicates, then append the
# new dummy variables.
train = train.drop(columns=['Pclass', 'Sex', 'Embarked'])
train = pd.concat([train, dummies], axis=1)
train.head()  # check the clean version of the train data
	import pandas as pd
	train = pd.read_csv("train.csv")  # load the training data from the working directory
	# 'Cabin' has a lot of null values, so drop the column first; otherwise the
	# dropna() below would discard every row with an empty cabin.
	train = train.drop(columns=['Cabin'])
	train = train.dropna()  # delete the remaining rows that contain empty values
	y = train['Survived']  # target: the column representing survival
	# Drop the target and the columns not used as features. `columns=` replaces
	# the positional axis argument (`drop(labels, 1)`), which pandas 2.0 no longer
	# accepts, and the result is assigned back instead of binding None to X.
	train = train.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket'])
	X = pd.get_dummies(train)  # convert non-numerical variables to dummy variables
	from sklearn import tree
	# Fit a decision tree on `X` and `y`; fit() returns the estimator itself.
	dtc = tree.DecisionTreeClassifier().fit(X, y)
	test = pd.read_csv("test.csv")  # load the testing data
	ids = test[['PassengerId']]  # one-column frame kept aside for the submission file
	# Drop the columns not used as features. `columns=` replaces the positional
	# axis argument (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
	test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
	# Fill (instead of drop) empty cells so the row count stays exactly what the
	# submission requires; every missing value is replaced by the constant 2.
	test = test.fillna(2)
	test = pd.get_dummies(test)  # convert non-numerical variables to dummy variables
	# Attach the predictions to a copy of the id frame, leaving `ids` untouched.
	results = ids.copy()
	results['Survived'] = predictions
	# Write the final dataset to a csv file, without the index column.
	results.to_csv("titanic-results.csv", index=False)
	import pandas as pd
	test = pd.read_csv("test.csv")
	train = pd.read_csv("train.csv")
	# 'Survived' is removed from the train part before the two frames are
	# concatenated row-wise; the original row indexes are kept as they are.
	combined = pd.concat([train.drop(columns='Survived'), test])
	#For iPython
	import matplotlib.pyplot as plt
	import seaborn as sns
	%matplotlib inline
	plt.figure(figsize=(10,25))
	sns.heatmap(combined.isnull(),cmap="viridis",yticklabels=False,cbar=False)
	import numpy as np
	import pandas as pd
	# Reload both raw datasets from the working directory.
	train, test = map(pd.read_csv, ("train.csv", "test.csv"))
	# Impute missing ages with the median age. The filled column is assigned back
	# rather than calling fillna(inplace=True) on train['Age']: the in-place call
	# acts on a selected column object and does not update `train` under pandas
	# Copy-on-Write.
	train['Age'] = train['Age'].fillna(train['Age'].median())
	# Impute missing ports with the most frequent one (value_counts() sorts by
	# descending frequency, so index[0] is the most common value).
	train['Embarked'] = train['Embarked'].fillna(train['Embarked'].value_counts().index[0])
	d = {1: '1st', 2: '2nd', 3: '3rd'}  # passenger class 1,2,3 -> 1st,2nd,3rd
	train['Pclass'] = train['Pclass'].map(d)  # map the column based on the dictionary
	# Drop unnecessary columns. `columns=` replaces the positional axis argument
	# (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
	train = train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
	categorical_vars = train[['Pclass', 'Sex', 'Embarked']]  # columns to one-hot encode
	dummies = pd.get_dummies(categorical_vars, drop_first=True)
	# Drop the original categorical variables to avoid duplicates, then append the
	# new dummy variables.
	train = train.drop(columns=['Pclass', 'Sex', 'Embarked'])
	train = pd.concat([train, dummies], axis=1)
	train.head()  # check the clean version of the train data