Last active
December 10, 2023 10:17
-
-
Save Proteusiq/9fb0f07a2887e124d99a219047f76e88 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import ( | |
accuracy_score, | |
precision_score, | |
recall_score, | |
confusion_matrix, | |
) | |
from sklearn.compose import ColumnTransformer, make_column_selector as selector | |
from sklearn.pipeline import Pipeline | |
from sklearn.impute import SimpleImputer | |
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer | |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from skorch import NeuralNetClassifier | |
# Our Custom Neural Net Classifier | |
class NN(nn.Module):
    """Feed-forward classifier with two hidden layers and a softmax output.

    The network is ``Linear -> ReLU -> Dropout -> Linear -> ReLU -> Linear
    -> Softmax``. It returns class probabilities rather than raw logits,
    which is what skorch's ``NeuralNetClassifier`` expects with its default
    criterion.

    Args:
        input_dim: Number of input features per sample.
        first_layer_ns: Number of units in the first hidden layer.
        second_layer_ns: Number of units in the second hidden layer.
        output_dim: Number of target classes.
        dropout_rate: Probability of zeroing an activation after the first
            hidden layer during training.
    """

    def __init__(
        self,
        input_dim=7,
        first_layer_ns=25,
        second_layer_ns=25,
        output_dim=3,
        dropout_rate=0.5,
    ):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.first_layer = nn.Linear(input_dim, first_layer_ns)
        self.dropout = nn.Dropout(dropout_rate)
        self.second_layer = nn.Linear(first_layer_ns, second_layer_ns)
        self.output = nn.Linear(second_layer_ns, output_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, X, **kwargs):
        """Return per-class probabilities for a batch of samples.

        Args:
            X: Float tensor whose last dimension has ``input_dim`` entries.
            **kwargs: Accepted and ignored so extra fit/predict arguments
                forwarded by the caller do not raise.

        Returns:
            Tensor of probabilities whose last dimension has ``output_dim``
            entries summing to 1.
        """
        hidden = self.dropout(F.relu(self.first_layer(X)))
        hidden = F.relu(self.second_layer(hidden))
        return self.softmax(self.output(hidden))
# Load the penguins dataset
PENGUINS_DATA = (
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)
df = pd.read_csv(PENGUINS_DATA)

FEATURES = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
    "sex",
    "island",
]
TARGET = "species"

# Pre-processing: discard rows in which every feature is missing, then
# convert all object-dtype (text) columns to pandas categoricals.
df = df.dropna(axis="index", subset=FEATURES, how="all")
df = df.astype(
    {column: "category" for column in df.select_dtypes(include=["object"]).columns}
)

# Split data; stratify on the species so every split keeps the class balance.
X = df[FEATURES]
y = df[TARGET].astype("category")
y_ = y.cat.codes  # species names to category code
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_,
    train_size=0.7,
    random_state=42,
    stratify=y,
)
# Initialize our skorch classifier wrapping the NN module
net = NeuralNetClassifier(
    NN,
    max_epochs=200,
    lr=1e-1,
    iterator_train__shuffle=True,
)

# Numeric transformation: fill gaps with the median, then scale to [0, 1]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", MinMaxScaler())]
)

# Categorical transformation: fill gaps with the mode, then one-hot encode
# (dropping the first level of each feature to avoid a redundant column)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(drop="first")),
    ]
)

# Combine transformations: category-dtype columns go to the categorical
# branch, everything else to the numeric branch.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            numeric_transformer,
            selector(dtype_exclude="category"),
        ),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ],
    # The one-hot branch yields a sparse matrix; force a dense result so the
    # float32 cast and the torch module downstream always get an ndarray,
    # regardless of how sparse the combined output happens to be.
    sparse_threshold=0,
)

# A pipeline with transformation and model
pipe = Pipeline(
    [
        ("preprocess", preprocessor),
        # torch Linear layers use float32 weights, so cast the features
        ("transform", FunctionTransformer(np.float32)),
        ("net", net),
    ]
)
# Preprocess data and train our model.
# Targets are handed over as an int64 NumPy array: the class codes are a
# pandas Series of small integers, and building a tensor straight from a
# Series relies on sequence indexing, which is label-based on a Series and
# can fail once the index has been shuffled by the train/test split.
pipe.fit(X_train, y_train.to_numpy(dtype=np.int64))
yhat = pipe.predict(X_test)

y_true = y_test.to_numpy()
n_correct = (y_true == yhat).sum()
n_total = y_true.shape[0]

print("Model Evaluation:")
print(
    f"{n_correct} out of {n_total} correct predictions"
    f" | Test Accuracy = {n_correct / n_total:.2%}\n"
)
print(
    f"Accuracy: {accuracy_score(y_true, yhat):.2%}"
    f" | Recall: {recall_score(y_true, yhat, average='macro',):.2%} "
    f"| Precision {precision_score(y_true, yhat, average='macro'):.2%}"
)

print("\nConfusion Matrix")
species_names = y.cat.categories
print(
    pd.DataFrame(
        # One label per category code so rows/columns line up with the
        # species names even if a class is absent from the test split.
        confusion_matrix(y_true, yhat, labels=list(range(len(species_names)))),
        columns=species_names,
        index=species_names,
    )
)

print("\nCorrect | Actual Species | Predicted Species")
# Map the integer codes back to species names for a row-by-row comparison
for actual_species, predicted_species in zip(
    species_names[y_true], species_names[yhat]
):
    print(
        f"{'✔️' if actual_species == predicted_species else '❌':<10} {actual_species:>5} {predicted_species:>16}"
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment