@thomasaarholt
Last active June 7, 2021 09:10
Function minimizer using PyTorch and L-BFGS
import torch

# Let's minimize the function f(x, y) = (x - 50)**2 + (y - 100)**2
# We can tell from looking at the equation that the minimum should be at (50, 100).
def func(params):
    x, y = params
    return (x - 50)**2 + (y - 100)**2
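# Optional sanity check: the function should be exactly zero at the known minimum
# and positive everywhere else. Plain Python floats work here just as well as tensors.
assert func((50.0, 100.0)) == 0.0
assert func((0.0, 0.0)) == 50**2 + 100**2  # 12500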
# Optionally, view what the function looks like (requires numpy and matplotlib):
# import numpy as np
# import matplotlib.pyplot as plt
# N = 100
# xi = np.linspace(-N, N, 1000)
# yi = np.linspace(-N, N, 1000)
# params = np.meshgrid(xi, yi)
# fig, ax = plt.subplots()
# ax.imshow(func(params), extent=(-N, N, N, -N))
# This is our initial guess for (x, y). The values in this tensor will change as the optimizer proceeds.
# requires_grad=True tells PyTorch to track gradients for this tensor, which the optimizer needs.
val = torch.tensor((1000., 800.), requires_grad=True)
# To perform it on the GPU, we would instead use:
# val = torch.tensor((1000., 800.), requires_grad=True, device='cuda')
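# A device-agnostic variant (just a sketch): pick the GPU if one is available, otherwise fall back to the CPU.
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# val = torch.tensor((1000., 800.), requires_grad=True, device=device)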
# We will use the L-BFGS optimizer.
optimizer = torch.optim.LBFGS([val])
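# The defaults are fine for this problem, but LBFGS also accepts tuning parameters.
# For reference, a more explicit construction might look like the (illustrative, untuned) line below:
# optimizer = torch.optim.LBFGS([val], lr=1.0, max_iter=20, history_size=100, line_search_fn='strong_wolfe')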
guesses = [] # to record the parameter values the optimizer tries
losses = []  # to monitor the loss - since our target minimum is 0, the loss is simply the function value f(val)
# L-BFGS requires a closure: a function that re-evaluates the objective and returns the loss
def closure():
    optimizer.zero_grad()  # zero the gradients left over from the previous evaluation - this is important
    output = func(val)     # evaluate the function at the current guess
    loss = output          # the output *is* the loss, since we are minimizing towards f(x, y) = 0
    loss.backward()        # compute the gradients of the loss with respect to val
    guesses.append(val.detach().clone())  # val changes in place, so clone it (detached from the graph) to record its current value
    losses.append(loss.detach().clone())  # same for the loss
    return loss
# "step" is a misnomer here. In contrast to other optimizers (e.g. Adam), the LBFGS does the entire optimization, not just one step.
# We hence to not need to do "for i in range(1000): optimizer.step()"
optimizer.step(closure)
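# For contrast, a sketch of how the same minimization would look with a first-order optimizer
# such as Adam, where each .step() performs only a single update. The names val2/adam and the
# learning rate of 1.0 are illustrative only (the default lr=0.001 would crawl from this far away):
# val2 = torch.tensor((1000., 800.), requires_grad=True)
# adam = torch.optim.Adam([val2], lr=1.0)
# for i in range(5000):
#     adam.zero_grad()
#     loss = func(val2)
#     loss.backward()
#     adam.step()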
print(f"Minimum: {val[0]}, {val[1]}")
print(f"Number of steps: {len(guesses)}")
print(f"Guesses: {guesses}")
print(f"Losses: {losses}")