Created
April 11, 2024 23:21
-
-
Save jesuscmadrigal/5dfbebaec7a09ee76150a6835b7a90be to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Copia de Classification_task.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ZxjAzE5xRPqsynDGTDEUeAIXgKnIiy9d
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# Mount Google Drive so the CSV files referenced below can be read.
drive.mount('/content/drive')

# Paths of the Titanic train/test CSV files on the mounted drive.
ruta1 = "/content/drive/MyDrive/ML/titanic/train.csv"
ruta2 = "/content/drive/MyDrive/ML/titanic/test.csv"

train_df = pd.read_csv(ruta1)
test_df = pd.read_csv(ruta2)

# Data dictionary
# ---------------
# PassengerId - Id of the passenger. It is an integer and does not affect the
#               Survived column.
# Survived    - Target variable: 0 - Deceased, 1 - Survived.
# Pclass      - Class of the passenger: 1 - First class, 2 - Second class,
#               3 - Third class.
# Name        - Name of the passenger
# Sex         - Sex of the passenger
# Age         - Age of the passenger
# SibSp       - Number of siblings / spouses
# Parch       - Number of parents / children
# Ticket      - Ticket number
# Fare        - Fare of the passenger
# Cabin       - Cabin number
# Embarked    - Embarkation port: C - Cherbourg, Q - Queenstown, S - Southampton
#
# Tasks
# -----
# a. Distributions:
#    i.   Check whether the data is balanced across the classes into which it
#         should be separated.
#    ii.  Analyze the distribution of categorical data and its relationship
#         with the "survived" class. Start identifying relevant features for
#         accurate classification.
#    iii. Understand the distribution of numerical data and determine whether
#         it is necessary to apply normalization or standardization.
# b. Missing data:
#    i.   Identify and visualize missing data.
#    ii.  Decide which characteristics require imputation and which do not.
#         Justify the decision and eliminate unselected columns.
#    iii. Apply imputation techniques for missing data. Select the best
#         technique and justify the choice.
# c. Correlation analysis:
#    i.   Perform a correlation analysis to decide which characteristics
#         should be kept and which should be discarded.
# d. Data transformation:
#    i.   Convert categorical data to numerical data. Explore different
#         methods and select the most suitable one. Justify the choice.
# First look at the raw data. Outside a notebook a bare `train_df.head()`
# displays nothing, so it is printed explicitly.
print(train_df.head())
print("shape_train_df", train_df.shape)
print("shape_test_df", test_df.shape)
print("columnas_train:", train_df.columns)
print("columnas_test:", test_df.columns)

# Drop columns: the same identifier/free-text columns are removed from both
# datasets so that train and test keep an identical feature layout.
columns_to_drop = ["PassengerId", "Ticket", "Cabin", "Name"]
train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)
print(train_df.head(15))
print(test_df.head(15))
# Missing data on the test dataset (count of nulls per column).
print(test_df.isna().sum())

# Fill the missing values of column "Fare" in the test dataset with the mean
# of that same column.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())
print(test_df.isna().sum())

# Missing data on the train dataset (count of nulls per column).
print(train_df.isnull().sum(axis=0))
# Mean Age imputation: first inspect how Age is distributed within each
# passenger class.
plt.figure(figsize=(16, 5))
sns.boxplot(x='Pclass', y='Age', data=train_df, palette='winter')
# Render and close this figure so later plots do not draw onto it.
plt.show()

# Mean age per passenger class.
Age_Pclass = train_df.groupby('Pclass').Age.mean()
print("Mean_Age_by_Pclass: \n", Age_Pclass)
# Hard-coded replacement ages per passenger class. Per the surrounding
# comments these stand for the mean age of each Pclass; any class other than
# 1 or 2 falls back to the default.
_IMPUTED_AGE_BY_PCLASS = {1: 38, 2: 29}
_DEFAULT_IMPUTED_AGE = 25


def class_age(row):
    """Return the age of ``row``, imputing it from ``Pclass`` when missing.

    Intended for ``DataFrame.apply(class_age, axis="columns")``.

    Args:
        row: Object exposing ``Age`` and ``Pclass`` attributes (for example
            a DataFrame row passed in by ``apply``).

    Returns:
        ``row.Age`` unchanged when it is not null. Otherwise 38 for class 1,
        29 for class 2 and 25 for every other class value.
    """
    if not pd.isnull(row.Age):
        return row.Age
    return _IMPUTED_AGE_BY_PCLASS.get(row.Pclass, _DEFAULT_IMPUTED_AGE)
# Mean imputation: replace missing ages row by row using class_age, with the
# same rule applied to both datasets.
train_df['Age'] = train_df.apply(class_age, axis="columns")
test_df['Age'] = test_df.apply(class_age, axis="columns")

# Convert age into int. The cast truncates fractional ages.
train_df['Age'] = train_df['Age'].astype('int64')
test_df['Age'] = test_df['Age'].astype('int64')
print(train_df['Age'])
print(test_df['Age'])

# Drop every training row that still contains a null in any column.
# NOTE(review): the original comment says this targets rows whose "Embarked"
# is null; how='any' is broader than that - confirm no other column can
# still hold nulls at this point.
train_df = train_df.dropna(axis=0, how='any')
print("After cleaning train dataset: \n ", train_df.isna().sum())
print("After cleaning test dataset: \n", test_df.isna().sum())
# How many passengers survived? Per sex: "count" is the number of passengers
# in the group, "sum" adds the 0/1 Survived flag and is therefore the number
# of survivors.
print(train_df.groupby('Sex').Survived.agg(['count', 'sum']))

# Correlation coefficients are only defined for numeric columns; the
# remaining text columns would make DataFrame.corr fail on recent pandas.
numeric_df = train_df.select_dtypes(include='number')

# Linear correlation (Pearson).
corr_pearson = numeric_df.corr(method='pearson')
plt.figure()
sns.heatmap(corr_pearson, cmap='Greens', annot=True)
plt.show()

# Rank-based correlation (Spearman), which also captures monotonic
# non-linear relationships.
corr_spearman = numeric_df.corr(method='spearman')
plt.figure()
sns.heatmap(corr_spearman, cmap='BuPu', annot=True)
plt.show()
# Selecting categorical and numerical columns from the train dataset:
# object-dtype columns are treated as categorical, everything else as
# numerical.
df_numerical_columns = train_df.select_dtypes(exclude=["object"]).columns
print(df_numerical_columns)
df_categorical_columns = train_df.select_dtypes(["object"]).columns
print(df_categorical_columns)

# One histogram per numerical column. The plot style is loop-invariant, so
# it is set once up front.
sns.set(style='white')
for col in df_numerical_columns:
    sns.histplot(data=train_df[col])
    plt.show()
# One hot encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Encode only the categorical columns; every other column is passed through
# unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), df_categorical_columns)],
    remainder='passthrough',
)

# The transformer is fitted on the training features only and then reused,
# already fitted, on the test set.
X = train_df.drop(["Survived"], axis=1)
X = ct.fit_transform(X)
print(X.shape)
X_test = ct.transform(test_df)
print(X_test.shape)

# ## Assign the target variable
y = train_df["Survived"]
print(y.shape)
print(X.shape)

# ### Splitting training data into train and validation
from sklearn.model_selection import train_test_split

# 80% of the rows go to training, the rest to validation; the seed is fixed
# so the split is reproducible.
x_train, X_val, y_train, yval = train_test_split(
    X, y, train_size=0.8, random_state=42
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment