Last active
October 3, 2019 11:13
-
-
Save Remonhasan/34a81a6f44d278c3c0d0b4b3cfbe7dae to your computer and use it in GitHub Desktop.
Dataset: Titanic survival prediction with SVM (research practice)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Author : Remon Hasan , University of Asia Pacific | |
# Machine Learning algorithm practice for research | |
#implemantation for google colab | |
#import library | |
import numpy as np | |
import pandas as pd | |
#read file | |
train = pd.read_csv('train.csv') | |
test = pd.read_csv('test.csv') | |
train.head() | |
#print data | |
train.shape | |
test.shape | |
#missing train data | |
train.isnull().sum() | |
#missing test data | |
test.isnull().sum() | |
#Graphical libaray for visualization | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
import seaborn as sns | |
sns.set() | |
# Bar-chart helper.
def bar_chart(feature):
    """Plot a stacked bar chart of *feature* value counts, split by survival.

    Reads the module-level ``train`` DataFrame; displays via pandas plotting.
    """
    counts_by_outcome = pd.DataFrame([
        train[train['Survived'] == 1][feature].value_counts(),
        train[train['Survived'] == 0][feature].value_counts(),
    ])
    counts_by_outcome.index = ['Survived', 'Dead']
    counts_by_outcome.plot(kind='bar', stacked=True, figsize=(10, 5))
# Show survival split by gender.
bar_chart('Sex')

# Process train and test together so both receive identical feature engineering.
train_test_data = [train, test]

# Extract the honorific (Mr, Mrs, Miss, ...) from the Name column.
# Raw string fixes the invalid escape sequence '\.' in the original pattern.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Frequency of each extracted title in the training set.
train['Title'].value_counts()

# Encode titles numerically; all rare titles share the code 3.
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3,
                 "Col": 3, "Major": 3, "Mlle": 3, "Countess": 3, "Ms": 3,
                 "Lady": 3, "Jonkheer": 3, "Don": 3, "Mme": 3, "Capt": 3,
                 "Sir": 3}
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)

# Inspect the result.
train.head()
test.head()
# Drop the raw Name column — Title now carries its information.
for dataset in train_test_data:
    dataset.drop('Name', axis=1, inplace=True)

# Inspect.
train.head()
test.head()

# Encode Sex as an integer: male -> 0, female -> 1.
sex_mapping = {"male": 0, "female": 1}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

# Survival split by the (now numeric) gender feature.
bar_chart('Sex')
# Fill missing Age with the median age of rows sharing the same Title
# (Mr, Mrs, Miss, others) — a better estimate than one global median.
train["Age"].fillna(train.groupby("Title")["Age"].transform("median"), inplace=True)
test["Age"].fillna(test.groupby("Title")["Age"].transform("median"), inplace=True)

# Inspect.
train.head()

# Age distribution by survival outcome.
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, train['Age'].max()))
facet.add_legend()
plt.show()

# Bin Age into five ordinal categories.
# FIX: the original lines ended with stray commas, so e.g. `= 0,` assigned the
# 1-tuple (0,) instead of the scalar 0; the commas are removed here.
for dataset in train_test_data:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 26), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 26) & (dataset['Age'] <= 36), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 62), 'Age'] = 3
    dataset.loc[dataset['Age'] > 62, 'Age'] = 4

# Inspect.
train.head()
# Embarkation port distribution per passenger class — motivates filling the
# few missing Embarked values with 'S' (presumably the dominant port; the
# chart below shows the split).
Pclass1 = train[train['Pclass'] == 1]['Embarked'].value_counts()
Pclass2 = train[train['Pclass'] == 2]['Embarked'].value_counts()
Pclass3 = train[train['Pclass'] == 3]['Embarked'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class', '2nd class', '3rd class']
df.plot(kind='bar', stacked=True, figsize=(10, 5))

# Fill missing Embarked values with 'S'.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
train.head()

# Encode each embarkation city as an integer.
embarked_mapping = {"S": 0, "C": 1, "Q": 2}
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)
# Fill missing Fare with the median fare of the same passenger class.
train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"), inplace=True)
test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"), inplace=True)
train.head(5)

# Bin ticket price into four ordinal buckets.
# FIX: stray trailing commas in the original turned these scalar assignments
# into 1-tuple assignments ((0,) instead of 0); the commas are removed here.
for dataset in train_test_data:
    dataset.loc[dataset['Fare'] <= 17, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 100, 'Fare'] = 3
train.head()
train.Cabin.value_counts()

# Keep only the deck letter (first character) of each Cabin code.
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].str[:1]

# Deck distribution per passenger class.
Pclass1 = train[train['Pclass'] == 1]['Cabin'].value_counts()
Pclass2 = train[train['Pclass'] == 2]['Cabin'].value_counts()
Pclass3 = train[train['Pclass'] == 3]['Cabin'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class', '2nd class', '3rd class']
df.plot(kind='bar', stacked=True, figsize=(10, 5))

# Map deck letters to evenly spaced numeric codes.
cabin_mapping = {"A": 0, "B": 0.4, "C": 0.8, "D": 1.2, "E": 1.6, "F": 2,
                 "G": 2.4, "T": 2.8}
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)

# Fill missing Cabin codes with the median cabin code of the same passenger
# class. (The original comment here wrongly said "Fare" — copy-paste error.)
train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"), inplace=True)
test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"), inplace=True)
# FamilySize = siblings/spouses + parents/children + the passenger themself.
for dataset in train_test_data:
    dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"] + 1

# Family-size distribution by survival outcome.
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'FamilySize', shade=True)
facet.set(xlim=(0, train['FamilySize'].max()))
facet.add_legend()
plt.xlim(0)

# Rescale family size onto a small, evenly spaced numeric range.
family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4,
                  8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
for dataset in train_test_data:
    dataset['FamilySize'] = dataset['FamilySize'].map(family_mapping)
train.head()
# Drop features that are no longer needed for modelling.
features_drop = ['Ticket', 'SibSp', 'Parch']
train = train.drop(features_drop, axis=1)
test = test.drop(features_drop, axis=1)
# PassengerId carries no signal for training (kept in `test` for submission).
train = train.drop(['PassengerId'], axis=1)

# Separate predictors from the target label.
train_data = train.drop('Survived', axis=1)
target = train['Survived']
train_data.shape, target.shape

# Modelling imports.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC

# 10-fold shuffled cross-validation with a fixed seed for reproducibility.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
train_data.head(50)
test.head()
# Baseline: decision tree, scored by 10-fold cross-validated accuracy.
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1,
                        scoring=scoring)
print(score)
# Mean decision-tree accuracy as a percentage.
round(np.mean(score) * 100, 2)

# Final model: SVM (SVC with gamma='auto'), fitted on the full training data.
clf = SVC(gamma="auto")
clf.fit(train_data, target)

# Predict survival on the test set; PassengerId is set aside for the submission.
test_data = test.drop("PassengerId", axis=1).copy()
prediction = clf.predict(test_data)

# Survival results are written to submission.csv, then read back for a
# sanity check.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": prediction,
})
submission.to_csv('submission.csv', index=False)
submission = pd.read_csv('submission.csv')
submission.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.