import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Get Data
PENGUINS_DATA = (
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)
df = pd.read_csv(PENGUINS_DATA)

# Inspect a few rows and the missing-value counts
print(df.sample(4))
print(df.isna().sum())
FEATURES = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex', 'island']
TARGET = 'species'

# Drop only the rows where every feature is missing; rows with a few
# missing values are kept and imputed in the pipeline below
df.dropna(axis='index', subset=FEATURES, how='all', inplace=True)

# Cast object columns to category so the column selectors below can find them
df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).astype("category")
print('Dataset Sample (10 rows)')
print(df.sample(10))
print('\n')
print('Distribution of Species')
print(df['species'].value_counts(normalize=True))
print('\n')
# Split Data
X, y = df[FEATURES], df[TARGET]
y = y.astype('category')
y_ = y.cat.codes  # integer class codes (0, 1, 2) for CrossEntropyLoss
X_train, X_test, y_train, y_test = train_test_split(
    X, y_, train_size=0.7, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape)
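# Stratification keeps the species proportions roughly equal in both splits;
# a quick check on the training labels (codes map to names via y.cat.categories)
print(y_train.value_counts(normalize=True).round(2))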
# Impute and scale numeric columns; impute and one-hot encode categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),  # object columns were cast to category above
    ('cat', categorical_transformer, selector(dtype_include="category"))
])
# Fit the preprocessor on the training split only, then apply it to both splits
X_train_ = preprocessor.fit_transform(X_train)
X_test_ = preprocessor.transform(X_test)
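# Sanity check (a sketch: the expected count of 7 assumes the seaborn penguins CSV,
# where sex has 2 levels and island has 3, so drop='first' yields 4 + 1 + 2 columns)
assert X_train_.shape[1] == X_test_.shape[1], "train/test feature mismatch"
print(f"Preprocessed feature count: {X_train_.shape[1]}")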
# Turn to Tensors
X_train_ = torch.tensor(X_train_, dtype=torch.float32)
X_test_ = torch.tensor(X_test_, dtype=torch.float32)
y_train_ = torch.tensor(y_train.values, dtype=torch.long)
y_test_ = torch.tensor(y_test.values, dtype=torch.long)
# Dataset Loaders (Input Pipeline)
batch_size = 10
train_dataset = torch.utils.data.TensorDataset(X_train_, y_train_)
test_dataset = torch.utils.data.TensorDataset(X_test_, y_test_)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)
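# Optional peek at one batch to confirm shapes; with batch_size=10 and the
# 7 preprocessed features assumed above, expect roughly [10, 7] and [10]
Xi_demo, yi_demo = next(iter(train_loader))
print(Xi_demo.shape, yi_demo.shape)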
# Define a Neural Network Architecture
class Net(nn.Module):
    def __init__(self, input_shape, first_layer_ns, second_layer_ns, output_shape):
        super(Net, self).__init__()
        self.first_layer = nn.Linear(input_shape, first_layer_ns)
        self.second_layer = nn.Linear(first_layer_ns, second_layer_ns)
        self.output = nn.Linear(second_layer_ns, output_shape)

    def forward(self, X):
        f1 = self.first_layer(X)
        p1 = F.relu(f1)
        f2 = self.second_layer(p1)
        p2 = F.relu(f2)
        logit = self.output(p2)  # raw logits: CrossEntropyLoss applies log-softmax itself
        return logit
input_shape = X_train_.shape[1]
output_shape = y_.nunique()  # number of species classes
print(input_shape, output_shape)
# Seed for reproducibility
torch.manual_seed(42)

# Initialize a model, define optimizer and criterion/loss function:
# a 3-class classification problem, with both hidden layers having 25 neurons
model = Net(input_shape, 25, 25, output_shape)
parameters = model.parameters()
optimizer = optim.SGD(parameters, lr=1e-2, momentum=.8)  # alternative: optim.Adam(parameters, lr=9e-4, weight_decay=1e-1)
criterion = nn.CrossEntropyLoss()
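# Smoke test (optional): an untrained forward pass should return logits of
# shape [n_rows, n_classes]; the values are arbitrary before training
with torch.no_grad():
    demo_logits = model(X_train_[:2])
print(demo_logits.shape)  # torch.Size([2, 3]) for the 3 penguin species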
EPOCHS = 45
for epoch in range(EPOCHS):
    # Reset the metrics each epoch so the printed accuracies reflect the
    # current epoch rather than a running average over all epochs
    accuracy = []
    test_accuracy = []
    model.train()
    for (Xi, yi) in train_loader:
        optimizer.zero_grad()
        logit = model(Xi)
        loss = criterion(logit, yi)
        loss.backward()
        optimizer.step()
        number_samples = Xi.size(0)
        yhat = logit.argmax(dim=1)
        accuracy.append((yhat == yi).sum().item() / number_samples)
    model.eval()
    with torch.no_grad():
        for (Xj, yj) in test_loader:
            logit = model(Xj)
            test_loss = criterion(logit, yj)
            number_samples = Xj.size(0)
            yhat = logit.argmax(dim=1)
            test_accuracy.append((yhat == yj).sum().item() / number_samples)
    # Note: the printed losses come from the last batch of each loop
    print(
        f"[+] Epoch: {epoch:>3} Train loss: {loss.item():^8.4f} Train Accuracy: {sum(accuracy)/len(accuracy):^8.4f}"
        f" Test loss: {test_loss.item():^4.4f} Test Accuracy: {sum(test_accuracy)/len(test_accuracy):>4.4f}"
    )
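# Optional: persist the trained weights (the filename here is illustrative)
torch.save(model.state_dict(), "penguins_net.pt")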
# Final evaluation on the whole test set
model.eval()
with torch.no_grad():
    logit = model(X_test_)
yhat = logit.argmax(dim=1)
print("Model Evaluation:")
print(f"{(y_test.values == yhat.numpy()).sum()} out of {y_test.shape[0]} correct predictions"
      f" | Test Accuracy = {(y_test.values == yhat.numpy()).sum() / y_test.shape[0]:.2%}\n")
print(f"Accuracy: {accuracy_score(y_test.values, yhat.numpy()):.2%}"
      f" | Recall: {recall_score(y_test.values, yhat.numpy(), average='macro'):.2%} "
      f"| Precision: {precision_score(y_test.values, yhat.numpy(), average='macro'):.2%}")
print("\nConfusion Matrix")
print(pd.DataFrame(confusion_matrix(y_test.values, yhat.numpy(), labels=[0, 1, 2]),
                   columns=y.cat.categories, index=y.cat.categories))

print("\nCorrect | Actual Species | Predicted Species")
for actual_species, predicted_species in zip(y.cat.categories[y_test.values], y.cat.categories[yhat.numpy()]):
    print(f"{'✔️' if actual_species == predicted_species else '❌':<10} {actual_species:>5} {predicted_species:>16}")