import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Get Data
PENGUINS_DATA = (
"https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)
df = pd.read_csv(PENGUINS_DATA)
print(df.sample(4))
print(df.isna().sum())
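# A handful of rows have missing measurements and/or sex; the imputers
# in the preprocessing pipelines below fill these in.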
FEATURES = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g','sex', 'island']
TARGET = 'species'
df.dropna(axis='index', subset=FEATURES, how='all', inplace=True)
# Cast object columns to category so make_column_selector can split them from numerics
df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).astype("category")
print('Dataset Sample (10 rows)')
print(df.sample(10))
print('\n')
print('Distribution of Species')
print(df['species'].value_counts(normalize=True))
print('\n')
# Split Data
X, y = df[FEATURES], df[TARGET]
y = y.astype('category')
y_ = y.cat.codes
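# pandas orders string categories alphabetically, so cat.codes maps
# Adelie -> 0, Chinstrap -> 1, Gentoo -> 2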
# stratify=y keeps the species proportions the same in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_, train_size=0.7, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),
    ('cat', categorical_transformer, selector(dtype_include="category")),  # object columns were cast to category above
])
X_train_ = preprocessor.fit_transform(X_train)
X_test_ = preprocessor.transform(X_test)
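# The preprocessor is fit on the training split only, so imputation
# statistics and scaling ranges never leak from the test set.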
# Convert to tensors
X_train_ = torch.tensor(X_train_, dtype=torch.float32)
X_test_ = torch.tensor(X_test_, dtype=torch.float32)
y_train_ = torch.tensor(y_train.values, dtype=torch.long)
y_test_ = torch.tensor(y_test.values, dtype=torch.long)
# Dataset loaders (input pipeline)
batch_size = 10
train_dataset = torch.utils.data.TensorDataset(X_train_, y_train_)
test_dataset = torch.utils.data.TensorDataset(X_test_, y_test_)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)  # shuffle training batches each epoch
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)  # keep test order fixed
# Define a neural network architecture: two hidden ReLU layers, raw logits out
class Net(nn.Module):
    def __init__(self, input_shape, first_layer_ns, second_layer_ns, output_shape):
        super(Net, self).__init__()
        self.first_layer = nn.Linear(input_shape, first_layer_ns)
        self.second_layer = nn.Linear(first_layer_ns, second_layer_ns)
        self.output = nn.Linear(second_layer_ns, output_shape)

    def forward(self, X):
        f1 = self.first_layer(X)
        p1 = F.relu(f1)
        f2 = self.second_layer(p1)
        p2 = F.relu(f2)
        logit = self.output(p2)
        return logit
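# Quick illustrative shape check (with drop='first', sex should contribute 1
# one-hot column and island 2, giving 7 input features and 3 output classes):
#   Net(7, 25, 25, 3)(torch.randn(10, 7)).shape  # -> torch.Size([10, 3])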
input_shape = X_train_.shape[1]
output_shape = y_.nunique()
print(input_shape, output_shape)
# Seed for reproducibility
torch.manual_seed(42)
# Initiate a model, then define the optimizer and criterion/loss function.
# Multiclass classification: 3 species, with 25 neurons in each hidden layer.
model = Net(input_shape, 25, 25, output_shape)
parameters = model.parameters()
optimizer = optim.SGD(parameters, lr=1e-2, momentum=.8)  # alternative: optim.Adam(parameters, lr=9e-4, weight_decay=1e-1)
criterion = nn.CrossEntropyLoss()
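# nn.CrossEntropyLoss expects raw logits and integer class targets; it applies
# log-softmax internally, which is why forward() returns logits with no softmax.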
accuracy = []
test_accuracy = []
EPOCHS = 45
for epoch in range(EPOCHS):
    for (Xi, yi) in train_loader:
        optimizer.zero_grad()
        logit = model(Xi)
        loss = criterion(logit, yi)
        loss.backward()
        optimizer.step()
        number_samples = Xi.size(0)
        yhat = logit.argmax(dim=1)
        accuracy.append((yhat == yi).sum().item() / number_samples)
    with torch.no_grad():  # no gradients needed for evaluation
        for (Xj, yj) in test_loader:
            logit = model(Xj)
            test_loss = criterion(logit, yj)
            number_samples = Xj.size(0)
            yhat = logit.argmax(dim=1)
            test_accuracy.append((yhat == yj).sum().item() / number_samples)
    # the accuracy lists accumulate across epochs, so these are running averages
    print(
        f"[+] Epoch: {epoch:>3} Train loss: {loss.item():^8.4f} Train Accuracy: {sum(accuracy)/len(accuracy):^8.4f}"
        f" Test loss: {test_loss.item():^4.4f} Test Accuracy: {sum(test_accuracy)/len(test_accuracy):>4.4f}"
    )
# Final evaluation on the full test set (no_grad: inference only)
with torch.no_grad():
    logit = model(X_test_)
yhat = logit.argmax(dim=1)
print("Model Evaluation:")
print(f"{(y_test.values == yhat.numpy()).sum()} out of {y_test.shape[0]} correct predictions"
f" | Test Accuracy = {(y_test.values == yhat.numpy()).sum() /y_test.shape[0]:.2%}\n")
print(f"Accuracy: {accuracy_score(y_test.values, yhat.numpy()):.2%}"
f" | Recall: {recall_score(y_test.values, yhat.numpy(), average='macro',):.2%} "
f"| Precision {precision_score(y_test.values, yhat.numpy(), average='macro'):.2%}")
print("\nConfusion Matrix")
print(pd.DataFrame(confusion_matrix(y_test.values, yhat.numpy(), labels=[0, 1, 2]), columns=y.cat.categories, index=y.cat.categories))
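# Rows are the actual species, columns the predicted species.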
print("\nCorrect | Actual Species | Predicted Species")
for actual_species, predicted_species in zip(y.cat.categories[y_test.values], y.cat.categories[yhat.numpy()]):
    print(f"{'✔️' if actual_species == predicted_species else '❌':<10} {actual_species:>5} {predicted_species:>16}")