import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Get Data
PENGUINS_DATA = (
"https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)
df = pd.read_csv(PENGUINS_DATA)
print(df.sample(4))
print(df.isna().sum())
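# A handful of rows have missing measurements and/or sex; the imputers
# in the preprocessing pipelines below fill these in.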
FEATURES = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g','sex', 'island']
TARGET = 'species'
df.dropna(axis='index', subset=FEATURES, how='all', inplace=True)
# Cast object columns to category so make_column_selector can split them from numerics
df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).astype("category")
print('Dataset Sample (10 rows)')
print(df.sample(10))
print('\n')
print('Distribution of Species')
print(df['species'].value_counts(normalize=True))
print('\n')
# Split Data
X, y = df[FEATURES], df[TARGET]
y = y.astype('category')
y_ = y.cat.codes
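# pandas orders string categories alphabetically, so cat.codes maps
# Adelie -> 0, Chinstrap -> 1, Gentoo -> 2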
# stratify=y keeps the species proportions the same in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_, train_size=0.7, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),
    ('cat', categorical_transformer, selector(dtype_include="category")),  # object columns were cast to category above
])
X_train_ = preprocessor.fit_transform(X_train)
X_test_ = preprocessor.transform(X_test)
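# The preprocessor is fit on the training split only, so imputation
# statistics and scaling ranges never leak from the test set.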
# Convert to tensors
X_train_ = torch.tensor(X_train_, dtype=torch.float32)
X_test_ = torch.tensor(X_test_, dtype=torch.float32)
y_train_ = torch.tensor(y_train.values, dtype=torch.long)
y_test_ = torch.tensor(y_test.values, dtype=torch.long)
# Dataset loaders (input pipeline)
batch_size = 10
train_dataset = torch.utils.data.TensorDataset(X_train_, y_train_)
test_dataset = torch.utils.data.TensorDataset(X_test_, y_test_)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)  # shuffle training batches each epoch
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)  # keep test order fixed
# Define a neural network architecture: two hidden ReLU layers, raw logits out
class Net(nn.Module):
    def __init__(self, input_shape, first_layer_ns, second_layer_ns, output_shape):
        super(Net, self).__init__()
        self.first_layer = nn.Linear(input_shape, first_layer_ns)
        self.second_layer = nn.Linear(first_layer_ns, second_layer_ns)
        self.output = nn.Linear(second_layer_ns, output_shape)

    def forward(self, X):
        f1 = self.first_layer(X)
        p1 = F.relu(f1)
        f2 = self.second_layer(p1)
        p2 = F.relu(f2)
        logit = self.output(p2)
        return logit
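# Quick illustrative shape check (with drop='first', sex should contribute 1
# one-hot column and island 2, giving 7 input features and 3 output classes):
#   Net(7, 25, 25, 3)(torch.randn(10, 7)).shape  # -> torch.Size([10, 3])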
input_shape = X_train_.shape[1]
output_shape = y_.nunique()
print(input_shape, output_shape)
# Seed for reproducibility
torch.manual_seed(42)
# Initiate a model, then define the optimizer and criterion/loss function.
# Multiclass classification: 3 species, with 25 neurons in each hidden layer.
model = Net(input_shape, 25, 25, output_shape)
parameters = model.parameters()
optimizer = optim.SGD(parameters, lr=1e-2, momentum=.8)  # alternative: optim.Adam(parameters, lr=9e-4, weight_decay=1e-1)
criterion = nn.CrossEntropyLoss()
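# nn.CrossEntropyLoss expects raw logits and integer class targets; it applies
# log-softmax internally, which is why forward() returns logits with no softmax.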
accuracy = []
test_accuracy = []
EPOCHS = 45
for epoch in range(EPOCHS):
    for (Xi, yi) in train_loader:
        optimizer.zero_grad()
        logit = model(Xi)
        loss = criterion(logit, yi)
        loss.backward()
        optimizer.step()
        number_samples = Xi.size(0)
        yhat = logit.argmax(dim=1)
        accuracy.append((yhat == yi).sum().item() / number_samples)
    with torch.no_grad():  # no gradients needed for evaluation
        for (Xj, yj) in test_loader:
            logit = model(Xj)
            test_loss = criterion(logit, yj)
            number_samples = Xj.size(0)
            yhat = logit.argmax(dim=1)
            test_accuracy.append((yhat == yj).sum().item() / number_samples)
    # the accuracy lists accumulate across epochs, so these are running averages
    print(
        f"[+] Epoch: {epoch:>3} Train loss: {loss.item():^8.4f} Train Accuracy: {sum(accuracy)/len(accuracy):^8.4f}"
        f" Test loss: {test_loss.item():^4.4f} Test Accuracy: {sum(test_accuracy)/len(test_accuracy):>4.4f}"
    )
# Final evaluation on the full test set (no_grad: inference only)
with torch.no_grad():
    logit = model(X_test_)
yhat = logit.argmax(dim=1)
print("Model Evaluation:")
print(f"{(y_test.values == yhat.numpy()).sum()} out of {y_test.shape[0]} correct predictions"
f" | Test Accuracy = {(y_test.values == yhat.numpy()).sum() /y_test.shape[0]:.2%}\n")
print(f"Accuracy: {accuracy_score(y_test.values, yhat.numpy()):.2%}"
f" | Recall: {recall_score(y_test.values, yhat.numpy(), average='macro',):.2%} "
f"| Precision {precision_score(y_test.values, yhat.numpy(), average='macro'):.2%}")
print("\nConfusion Matrix")
print(pd.DataFrame(confusion_matrix(y_test.values, yhat.numpy(), labels=[0, 1, 2]), columns=y.cat.categories, index=y.cat.categories))
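# Rows are the actual species, columns the predicted species.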
print("\nCorrect | Actual Species | Predicted Species")
for actual_species, predicted_species in zip(y.cat.categories[y_test.values], y.cat.categories[yhat.numpy()]):
    print(f"{'✔️' if actual_species == predicted_species else '❌':<10} {actual_species:>5} {predicted_species:>16}")