# Gist by @jesuscmadrigal, created April 11, 2024
# -*- coding: utf-8 -*-
"""Copy of Classification_task.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1ZxjAzE5xRPqsynDGTDEUeAIXgKnIiy9d
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')
ruta1="/content/drive/MyDrive/ML/titanic/train.csv"
ruta2="/content/drive/MyDrive/ML/titanic/test.csv"
train_df=pd.read_csv(ruta1)
test_df=pd.read_csv(ruta2)
"""PassengerId - Passenger identifier (integer); it has no bearing on the Survived column.
Survived - Target variable: 0 = deceased, 1 = survived.
Pclass - Passenger class: 1 = first, 2 = second, 3 = third.
Name - Name of the passenger.
Sex - Sex of the passenger.
Age - Age of the passenger.
SibSp - Number of siblings/spouses aboard.
Parch - Number of parents/children aboard.
Ticket - Ticket number.
Fare - Fare paid by the passenger.
Cabin - Cabin number.
Embarked - Port of embarkation: C = Cherbourg, Q = Queenstown, S = Southampton.
a. Distributions:
i. Check if the data is balanced in the classes in which the data should be
separated.
ii. Analyze the distribution of categorical data and its relationship with the
"survived" class. Start identifying relevant features for accurate classification.
iii. Understand the distribution of numerical data and determine if it is necessary to
apply normalization or standardization processes.
b. Missing data:
i. Identify and visualize missing data.
ii. Decide which characteristics require imputations and which do not. Justify the
decision and eliminate unselected columns.
iii. Apply imputation techniques for missing data. Select the best technique and
justify the choice.
c. Correlation analysis:
i. Perform a correlation analysis to decide which characteristics should be kept and
which should be discarded.
d. Data transformation:
i. Convert categorical data to numerical data. Explore different methods and select
the most suitable one. Justify the choice.
"""
train_df.head()
print("shape_train_df",train_df.shape)
print("shape_test_df",test_df.shape)
print("columns_train:",train_df.columns)
print("columns_test:",test_df.columns)
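Before dropping columns, task a.i asks whether the classes are balanced. A minimal sketch of that check is below; the helper and the toy series are illustrative (in the notebook you would pass `train_df["Survived"]`):

```python
import pandas as pd

def class_balance(target: pd.Series) -> pd.Series:
    """Return the fraction of samples per class, sorted by label."""
    return target.value_counts(normalize=True).sort_index()

# Toy 0/1 labels standing in for train_df["Survived"]
toy = pd.Series([0, 0, 0, 1, 1])
print(class_balance(toy))  # 0 -> 0.6, 1 -> 0.4
```

On the real Titanic training set the classes are moderately imbalanced, which is worth knowing before choosing metrics and split strategy.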
#Drop columns
train_df=train_df.drop(columns=["PassengerId","Ticket","Cabin","Name"])
test_df=test_df.drop(columns=["PassengerId","Ticket","Cabin","Name"])
print(train_df.head(15))
print(test_df.head(15))
#Missing data on test dataset
test_df.isna().sum()
#Filling missing data of column "Fare" on test_dataset
test_df['Fare']=test_df['Fare'].fillna(test_df['Fare'].mean())
test_df.isna().sum()
#Missing data on train dataset
train_df.isnull().sum(axis=0)
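For task b.i, a compact per-column summary makes the missing-data picture easier to read than raw counts. The helper below is an assumption, not part of the original notebook; in Colab you could also visualize the same information with `sns.heatmap(train_df.isna())`:

```python
import pandas as pd

def missing_report(df: pd.DataFrame) -> pd.DataFrame:
    """Count and percentage of missing values per column, worst first."""
    counts = df.isna().sum()
    report = pd.DataFrame({
        "n_missing": counts,
        "pct_missing": (counts / len(df) * 100).round(1),
    })
    return report.sort_values("n_missing", ascending=False)

# Toy frame mimicking the Titanic columns with gaps
toy = pd.DataFrame({"Age": [22.0, None, 35.0], "Embarked": ["S", "C", None]})
print(missing_report(toy))
```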
#Mean Age imputation
plt.figure(figsize=(16,5))
sns.boxplot(x='Pclass', y='Age', data=train_df, palette='winter')
#plt.axvline(train_df['Age'].mean(), c='y')
#Group by class by age
Age_Pclass=train_df.groupby('Pclass').Age.mean()
print("Mean_Age_by_Pclass: \n",Age_Pclass)
#Function that assigns the mean age according to PClass
def class_age(row):
    """Impute a missing Age with the rounded mean age of the row's Pclass."""
    Age = row.Age
    Pclass = row.Pclass
    if pd.isnull(Age):
        if Pclass == 1:
            return 38
        elif Pclass == 2:
            return 29
        else:
            return 25
    return Age
#Mean imputation
train_df['Age']=train_df.apply(class_age,axis="columns")
test_df['Age']=test_df.apply(class_age,axis="columns")
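An alternative to the hard-coded means in `class_age` is computing the per-`Pclass` statistic directly with `groupby().transform`, which stays correct if the data changes. A sketch under that assumption (the toy frame is invented; the column names match the Titanic data):

```python
import pandas as pd

def impute_age_by_class(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing Age values with the median Age of the row's Pclass."""
    out = df.copy()
    out["Age"] = out["Age"].fillna(
        out.groupby("Pclass")["Age"].transform("median")
    )
    return out

toy = pd.DataFrame({"Pclass": [1, 1, 3, 3], "Age": [40.0, None, 20.0, None]})
print(impute_age_by_class(toy)["Age"].tolist())  # [40.0, 40.0, 20.0, 20.0]
```

The median is also less sensitive to outliers than the mean, which helps justify the imputation choice asked for in task b.iii.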
#Convert age into int
train_df['Age']=train_df['Age'].astype('int64')
test_df['Age']=test_df['Age'].astype('int64')
print(train_df['Age'])
print(test_df['Age'])
#test_df.info()
#Drop rows in which "Embarked" column is null
train_df=train_df.dropna(subset=['Embarked'])
print("After cleaning train dataset: \n ",train_df.isna().sum())
print("After cleaning test dataset: \n",test_df.isna().sum())
#How many passengers survived, by sex? (.sum() counts the 1s in Survived)
train_df.groupby('Sex').Survived.sum()
#Linear correlation (Pearson)
corr_pearson=train_df.corr(method='pearson', numeric_only=True)
sns.heatmap(corr_pearson, cmap='Greens', annot=True)
plt.show()
#Monotonic (rank) correlation (Spearman)
corr_spearman=train_df.corr(method='spearman', numeric_only=True)
sns.heatmap(corr_spearman, cmap='BuPu', annot=True)
plt.show()
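Pearson/Spearman heatmaps only cover numeric columns, so task a.ii (categorical features vs. the target) needs a separate look. One simple option, sketched here on invented toy data, is a row-normalized crosstab of survival rate per category; the same call applies to `train_df`:

```python
import pandas as pd

toy = pd.DataFrame({
    "Sex": ["male", "male", "female", "female"],
    "Survived": [0, 1, 1, 1],
})
# Each row sums to 1: per-sex survival proportions
rates = pd.crosstab(toy["Sex"], toy["Survived"], normalize="index")
print(rates)  # in this toy frame, females survive at 1.0, males at 0.5
```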
#Selecting categorical and numerical columns from train dataset
df_numerical_columns=train_df.select_dtypes(exclude=["object"]).columns
print(df_numerical_columns)
df_categorical_columns=train_df.select_dtypes(["object"]).columns
print(df_categorical_columns)
for col in df_numerical_columns:
    sns.set(style='white')
    sns.histplot(data=train_df[col])
    plt.show()
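If the histograms show very different scales (e.g. Fare vs. Age), task a.iii suggests standardizing. A hedged sketch with `StandardScaler` on a toy array (the notebook would use its own numeric columns instead); note the scaler must be fit on training data only and reused on test data:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

X_num = np.array([[10.0, 1.0], [20.0, 3.0], [30.0, 5.0]])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_num)  # fit on train, transform later sets with scaler.transform
print(X_scaled.mean(axis=0))  # ~0 per column
print(X_scaled.std(axis=0))   # ~1 per column
```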
#One hot encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct=ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), df_categorical_columns)],
    remainder='passthrough',
)
X=train_df.drop(["Survived"],axis=1)
X=ct.fit_transform(X)
X.shape
X_test=ct.transform(test_df)
X_test.shape
"""## **Assign the target variable**"""
y=train_df["Survived"]
y.shape
X.shape
"""### **Splitting training data into train and validation**
"""
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)
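Since the Survived classes are not perfectly balanced, passing `stratify=y` keeps the class proportions identical in both folds. A sketch on invented labels (with the real data, add `stratify=y` to the call above):

```python
import numpy as np
from sklearn.model_selection import train_test_split

X_toy = np.arange(20).reshape(-1, 1)
y_toy = np.array([0] * 15 + [1] * 5)  # 75/25 imbalance
X_tr, X_va, y_tr, y_va = train_test_split(
    X_toy, y_toy, train_size=0.8, random_state=42, stratify=y_toy
)
print(y_tr.mean(), y_va.mean())  # both folds keep the 0.25 positive rate
```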