@thecharlieblake
Created June 7, 2024 10:37
from torch import manual_seed, nn, optim, randn

manual_seed(1234)

### Uncomment one of these lines -> in both cases y2 comes out the same!
lr = 1; mult = 1e-3; init_std = 1 / mult
# lr = 1e-3; mult = 1; init_std = 1

l = nn.Linear(1024, 2048, bias=False)
nn.init.normal_(l.weight, std=init_std)
model = lambda x: l(x) * mult  # fixed output multiplier applied after the linear layer
opt = optim.Adam(l.parameters(), lr=lr, eps=0)  # eps=0, so the first Adam step is exactly lr * sign(grad)

x = randn(512, 1024).requires_grad_()
y1 = model(x).mean()
print(y1)  # identical in both cases: init_std = 1/mult, so mult * weight ~ N(0, 1) either way
y1.backward(); opt.step()
y2 = model(x).mean()
print(y2)  # comes out the same, regardless of LR
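The claim above can be checked directly by running both configurations and comparing the post-step outputs. The sketch below wraps the gist's snippet in a helper (the function name `run` is my own, not from the gist); because Adam with `eps=0` takes a first step of exactly `lr * sign(grad)`, and `sign(grad)` is unchanged by the positive rescaling, the two parametrisations should agree up to floating-point rounding.

```python
from torch import manual_seed, nn, optim, randn

def run(lr, mult, init_std):
    # Re-seed so both configurations see identical weight draws and data.
    manual_seed(1234)
    l = nn.Linear(1024, 2048, bias=False)
    nn.init.normal_(l.weight, std=init_std)
    model = lambda x: l(x) * mult
    opt = optim.Adam(l.parameters(), lr=lr, eps=0)
    x = randn(512, 1024)
    model(x).mean().backward()
    opt.step()
    return model(x).mean().item()

a = run(lr=1, mult=1e-3, init_std=1e3)  # large init, small output multiplier
b = run(lr=1e-3, mult=1, init_std=1)    # standard parametrisation
print(abs(a - b))  # near zero: the two agree to within float rounding
```

Note that `requires_grad_()` on `x` is dropped here; it is not needed for the weight update.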