@borgwang
Last active June 14, 2020 18:43
backpropagation with numpy
import numpy as np
from sklearn.datasets import load_iris


def softmax(inputs):
    # row-wise softmax: turn logits into probabilities that sum to 1
    return np.exp(inputs) / np.sum(np.exp(inputs), 1)[:, None]


def construct_net(in_dim, out_dim, hidden_dim=20):
    # Xavier/Glorot uniform initialization for both layers
    bound1 = np.sqrt(6.0 / (in_dim + hidden_dim))
    W1 = np.random.uniform(-bound1, bound1, size=[in_dim, hidden_dim])
    b1 = np.zeros(hidden_dim)
    bound2 = np.sqrt(6.0 / (hidden_dim + out_dim))
    W2 = np.random.uniform(-bound2, bound2, size=[hidden_dim, out_dim])
    b2 = np.zeros(out_dim)
    return [W1, b1, W2, b2]


def propagate(batch_X, batch_y, params):
    # one-hot encode the labels (3 classes for iris)
    labels = np.zeros((len(batch_X), 3))
    for i in range(len(batch_y)):
        labels[i][batch_y[i]] = 1
    # forward: linear -> ReLU -> linear -> softmax
    W1, b1, W2, b2 = params
    h1 = np.dot(batch_X, W1) + b1
    a1 = np.copy(h1)
    a1[a1 < 0.0] = 0.0  # ReLU
    h2 = np.dot(a1, W2) + b2
    p = softmax(h2)
    # NLL loss
    loss = np.mean(-np.log(np.sum(p * labels, 1)))
    # backward
    dl_dh2 = p - labels  # gradient of NLL w.r.t. logits, [batch, 3]
    dl_dW2 = np.dot(a1.T, dl_dh2)
    dl_db2 = np.sum(dl_dh2, 0)
    dl_da1 = np.dot(dl_dh2, W2.T)
    da1_dh1 = (h1 > 0).astype(float)  # ReLU derivative
    dl_dh1 = dl_da1 * da1_dh1
    dl_dW1 = np.dot(batch_X.T, dl_dh1)
    dl_db1 = np.sum(dl_dh1, 0)
    return p, loss, [dl_dW1, dl_db1, dl_dW2, dl_db2]


def main():
    # prepare dataset
    iris = load_iris()
    dataset = iris.data
    dataset -= np.mean(dataset)
    dataset /= np.std(dataset)
    data_size = len(dataset)
    test_size = int(0.2 * data_size)
    # hold out a test set, sampled without replacement
    test_idxs = np.random.choice(data_size, test_size, replace=False)
    train_idxs = np.array([i for i in range(data_size) if i not in test_idxs])
    train_X = dataset[train_idxs]
    train_y = iris.target[train_idxs]
    test_X = dataset[test_idxs]
    test_y = iris.target[test_idxs]
    params = construct_net(4, 3)

    # train with mini-batch SGD
    batch_size = 16
    learning_rate = 0.003
    running_loss = 0
    for step in range(1000):
        batch_idx = np.random.randint(0, len(train_X), size=batch_size)
        batch_X = train_X[batch_idx]
        batch_y = train_y[batch_idx]
        _, loss, grads = propagate(batch_X, batch_y, params)
        # exponential moving average of the training loss
        if running_loss:
            running_loss = 0.9 * running_loss + 0.1 * loss
        else:
            running_loss = loss
        # update params
        for i in range(len(params)):
            params[i] -= learning_rate * grads[i]
        if step % 50 == 0:
            print(step, running_loss)

    # evaluate
    predict, eval_loss, _ = propagate(test_X, test_y, params)
    predict = np.argmax(predict, 1)
    count = 0.0
    for i in range(test_size):
        if predict[i] == test_y[i]:
            count += 1.0
    print(eval_loss)
    print(count / test_size)


if __name__ == '__main__':
    main()
@zchrissirhcz

zchrissirhcz commented Jun 8, 2020

Hi borgwang,

I read your blog post A Step-by-Step Neural Net Example and compared it against the code posted here. I think both the description of the NLL loss in the article and the implementation here are problematic.

The article states: "where p_i is the output logits of the network for class i (i.e., the predicted probability)"

Here y_i is the ground truth and p_i should be the predicted probability; y and p are two separate distributions over the random variable x. p_i is not a logit: its value lies in [0, 1], whereas a logit by definition ranges over (-inf, inf).
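(For illustration, a minimal standalone numpy sketch with made-up logit values, showing that softmax maps unbounded logits to probabilities in [0, 1]:)

import numpy as np

h = np.array([[2.0, -1.0, 0.5]])                 # logits: unbounded real values (made up)
p = np.exp(h) / np.sum(np.exp(h), 1)[:, None]    # softmax, same formula as in the gist
print(p)          # approx. [[0.786 0.039 0.175]] -- each entry in [0, 1]
print(p.sum(1))   # [1.] -- each row sums to 1, i.e. a probability distribution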

The NLL computation in the code:

loss = np.mean(-np.log(np.sum(p * labels, 1)))

should be changed to

loss = np.mean(-np.sum(labels * np.log(p), 1))

In addition, the NLL formula should also take the batch size into account.

@borgwang
Author

@zchrissirhcz Thanks for pointing this out.

  • h is the logits and p is the predicted probability; this has been fixed.
  • Since labels is one-hot, the two implementations are equivalent (see the quick check after this list).
  • Strictly speaking, the loss function is defined on a single data point; mini-batch training just happens to be the norm in deep learning, so the batch size is sometimes written into the formula as well.
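A quick numerical check of the equivalence point (the probability values below are made up purely for illustration): because labels has exactly one 1 per row, np.sum(p * labels, 1) picks out the predicted probability of the true class, which is also the only term that survives in np.sum(labels * np.log(p), 1):

import numpy as np

p = np.array([[0.7, 0.2, 0.1],
              [0.1, 0.3, 0.6]])    # predicted probabilities (made up)
labels = np.array([[1., 0., 0.],
                   [0., 0., 1.]])  # one-hot ground truth
loss_a = np.mean(-np.log(np.sum(p * labels, 1)))   # implementation in the gist
loss_b = np.mean(-np.sum(labels * np.log(p), 1))   # implementation proposed above
print(loss_a, loss_b, np.isclose(loss_a, loss_b))  # same value, True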
