Manual Logistic Regression with SGD, Momentum, Adam, Nadam
import time
import numpy as np
from tqdm.auto import trange
import matplotlib.pyplot as plt
from optimizers import SGD, Momentum, Adam, Nadam

# Generate some data
positives = np.random.normal(loc=-0.3, size=1000)
negatives = np.random.normal(loc=0.0, size=1000)

# hyper params
compute_loss = True
n_of_samples = len(positives) + len(negatives)
n_of_epochs = 1_000
seed = 31337

for opt in [SGD, Momentum, Adam, Nadam]:
    start_time = time.time()
    # init values
    np.random.seed(seed)
    m, b = np.random.normal(), np.random.normal()
    # one optimizer instance per parameter, using the class selected above
    optm = opt(learning_rate=0.001)
    optb = opt(learning_rate=0.001)
    losses = np.zeros(n_of_epochs)
    # training loop
    for epoch in trange(n_of_epochs, leave=False):
        # gradients
        gradient_m, gradient_b = 0, 0
        # compute the gradient for the positives (label 1): dL/dz = p - 1
        for x in positives:
            z = m * x + b
            p = 1.0 / (1.0 + np.exp(-z))
            gradient_b += (p - 1)
            gradient_m += (p - 1) * x
        # compute the gradient for the negatives (label 0): dL/dz = p
        for x in negatives:
            z = m * x + b
            p = 1.0 / (1.0 + np.exp(-z))
            gradient_b += p
            gradient_m += p * x
        # update the weights with the average gradient
        m -= optm.gradient_update(gradient_m / n_of_samples)
        b -= optb.gradient_update(gradient_b / n_of_samples)
        if compute_loss:
            # binary cross entropy: -log(p) for positives, -log(1 - p) for negatives
            for positive in positives:
                losses[epoch] += np.log(1 + np.exp(-m*positive - b))
            for negative in negatives:
                losses[epoch] -= np.log(1 - 1/(1 + np.exp(-m*negative - b)))
    print(f"{opt.__name__:<10}:{time.time() - start_time:.3f}s\tm:{m}\tb:{b}\tloss:{losses[-1]}")
    if compute_loss:
        plt.plot(losses, label=opt.__name__)

plt.legend()
plt.ylabel("Binary Cross Entropy")
plt.xlabel("Epoch")
plt.show()
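The per-sample Python loops above keep the gradient math explicit: for the logistic loss, dL/dz = p - y, so each positive contributes (p - 1) and each negative contributes p. As a rough sketch (not part of the original gist; the helper name epoch_step is made up here), the same per-epoch update and loss can be computed with vectorised NumPy instead of the inner loops:

def epoch_step(m, b, positives, negatives, optm, optb):
    # sigmoid of the scores for both classes
    p_pos = 1.0 / (1.0 + np.exp(-(m * positives + b)))
    p_neg = 1.0 / (1.0 + np.exp(-(m * negatives + b)))
    n = len(positives) + len(negatives)
    # same gradients as the loops above, computed with array sums
    gradient_b = np.sum(p_pos - 1) + np.sum(p_neg)
    gradient_m = np.sum((p_pos - 1) * positives) + np.sum(p_neg * negatives)
    m -= optm.gradient_update(gradient_m / n)
    b -= optb.gradient_update(gradient_b / n)
    # binary cross entropy over the whole dataset
    loss = -np.sum(np.log(p_pos)) - np.sum(np.log(1 - p_neg))
    return m, b, loss

The optimizer instances are stateful, so optm and optb would still be created once per run and reused across epochs, exactly as in the training loop above.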
optimizers.py
import numpy as np

class Optimizer:
    def gradient_update(self, gradient: float) -> float:
        raise NotImplementedError("The child should implement this")

class SGD(Optimizer):
    """Standard stochastic gradient descent"""
    def __init__(self, learning_rate: float = 0.001):
        self.learning_rate = learning_rate

    def gradient_update(self, gradient: float) -> float:
        return self.learning_rate * gradient

class Momentum(Optimizer):
    """Classical momentum (Polyak, 1964)"""
    def __init__(self, decay_factor: float = 0.9, learning_rate: float = 0.001):
        self.decay_factor, self.learning_rate = decay_factor, learning_rate
        self.momentum = 0

    def gradient_update(self, gradient: float) -> float:
        self.momentum = self.decay_factor * self.momentum + \
            self.learning_rate * gradient
        return self.momentum

class Adam(Optimizer):
    """Adam (Kingma & Ba, 2015), with both bias corrections folded into the step size"""
    def __init__(self, beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8, learning_rate: float = 0.001):
        self.beta1, self.beta2, self.epsilon, self.learning_rate = beta1, beta2, epsilon, learning_rate
        self.first_moment = 0
        self.second_moment = 0
        self.t = 0

    def gradient_update(self, gradient: float) -> float:
        self.t += 1
        # exponential moving averages of the gradient and of its square
        self.first_moment = self.beta1 * self.first_moment + \
            (1 - self.beta1) * gradient
        self.second_moment = self.beta2 * self.second_moment + \
            (1 - self.beta2) * gradient**2
        # bias-corrected step size
        alpha_t = self.learning_rate * (1 - self.beta2**self.t)**0.5 / (1 - self.beta1**self.t)
        return alpha_t * self.first_moment / (self.epsilon + self.second_moment**0.5)

class Nadam(Optimizer):
    """Adam with Nesterov momentum (Dozat, 2016)"""
    def __init__(self, beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8, learning_rate: float = 0.001):
        self.beta1, self.beta2, self.epsilon, self.learning_rate = beta1, beta2, epsilon, learning_rate
        self.first_moment = 0
        self.second_moment = 0
        self.t = 0

    def gradient_update(self, gradient: float) -> float:
        self.t += 1
        self.first_moment = self.beta1 * self.first_moment + \
            (1 - self.beta1) * gradient
        self.second_moment = self.beta2 * self.second_moment + \
            (1 - self.beta2) * gradient**2
        # Nesterov-style look-ahead: the momentum term is bias-corrected one step into the future
        estimated_first_moment = self.beta1 * self.first_moment / (1 - self.beta1**(self.t+1)) + \
            (1 - self.beta1) * gradient / (1 - self.beta1**self.t)
        estimated_second_moment = self.beta2 * self.second_moment / (1 - self.beta2**self.t)
        return self.learning_rate * estimated_first_moment / (self.epsilon + estimated_second_moment**0.5)
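A minimal standalone usage sketch of this interface, not part of the gist: each optimizer keeps its own running state for a single scalar parameter, and gradient_update returns the step to subtract from it. The toy objective (x - 3)**2, the starting point, and the hyper-parameters below are arbitrary choices for illustration:

from optimizers import Adam

x = 0.0
opt = Adam(learning_rate=0.1)
for _ in range(200):
    gradient = 2 * (x - 3)              # derivative of (x - 3)**2
    x -= opt.gradient_update(gradient)
print(x)  # should end up close to 3

Keeping one instance per parameter, as the training script does with optm and optb, is what lets this scalar-only gradient_update signature serve a model with several parameters.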