Source code for DLL.DeepLearning.Optimisers._LBFGS

import torch
from collections import deque

from ._BaseOptimiser import BaseOptimiser


[docs] class LBFGS(BaseOptimiser): """ Limited-memory Broyden-Fletcher-Goldfarb-Shanno optimiser. A second order method and approximates the hessian matrix using changes in position and gradient. Hence, requires more memory than first order methods. Args: loss (Callable[[], float]): The target function. For a deep learning model, one could use eg. lambda: model.loss.loss(model.predict(x_train), y_train). learning_rate (float, optional): The learning rate of the optimiser. Must be positive. Defaults to 0.001. history_size (int, optional): The number of old changes in position and gradient stored. Must be a non-negative integer. Defaults to 10. maxiterls (int, optional): The maximum number of iterations in the line search. Must be a non-negative integer. Defaults to 20. """ def __init__(self, loss, learning_rate=0.001, history_size=10, maxiterls=20): if not isinstance(learning_rate, int | float) or learning_rate <= 0: raise ValueError("learning_rate must be a positive real number.") if not isinstance(history_size, int) or history_size < 0: raise ValueError("history_size must be a non-negative integer.") if not isinstance(maxiterls, int) or maxiterls < 0: raise ValueError("maxiterls must be a non-negative integer.") self.loss = loss self.learning_rate = learning_rate self.history_size = history_size self.maxiterls = maxiterls
[docs] def initialise_parameters(self, model_parameters): """ Initialises the optimiser with the parameters that need to be optimised. Args: model_parameters (list[torch.Tensor]): The parameters that will be optimised. Must be a list or a tuple of torch tensors. """ if not isinstance(model_parameters, list | tuple): raise TypeError("model_parameters must be a list or a tuple of torch tensors.") self.model_parameters = model_parameters self.s_history = [deque([], maxlen=self.history_size) for _ in model_parameters] self.y_history = [deque([], maxlen=self.history_size) for _ in model_parameters] self.prevs = [param.clone() for param in model_parameters] self.prev_Bs = [torch.ones_like(param.flatten()) for param in model_parameters]
[docs] def update_parameters(self): """ Takes a step towards the optimum for each parameter. """ for i, param in enumerate(self.model_parameters): if param.grad is None: continue current = param.clone() current.grad = param.grad.clone() s = current - self.prevs[i] y = current.grad - self.prevs[i].grad if self.prevs[i].grad is not None else torch.zeros_like(current.grad) if s.flatten() @ y.flatten() > 1e-8: self.s_history[i].append(s) # automatically pops the oldest values as maxlen is spesified for the deque self.y_history[i].append(y) # automatically pops the oldest values as maxlen is spesified for the deque Bs = self.learning_rate * self._recursion_two_loops(i) learning_rate = self._line_search(param, Bs) param -= learning_rate * Bs.view_as(param) self.prevs[i] = current self.prev_Bs[i] = Bs
def _recursion_two_loops(self, i): q = self.model_parameters[i].grad.clone().flatten() alphas = torch.zeros(self.history_size) rhos = torch.tensor([1.0 / (self.y_history[i][j].flatten() @ self.s_history[i][j].flatten()) for j in range(self.history_size) if len(self.s_history[i]) > j]) for j, (s, y) in enumerate(zip(reversed(self.s_history[i]), reversed(self.y_history[i]))): j = len(self.s_history[i]) - j - 1 s, y = s.flatten(), y.flatten() alphas[j] = rhos[j] * (s @ q) q -= alphas[j] * y if len(self.s_history[i]) > 0: last_s, last_y = self.s_history[i][-1].flatten(), self.y_history[i][-1].flatten() Hk_0 = (last_s @ last_y) / last_y @ last_y else: Hk_0 = 1 r = Hk_0 * q for j, (s, y) in enumerate(zip(self.s_history[i], self.y_history[i])): s, y = s.flatten(), y.flatten() beta = rhos[j] * (y @ r) r += (alphas[j] - beta) * s return r def _line_search(self, param, direction): step = 1.0 initial_func_value = self.loss() grad_dot_direction = torch.sum(param.grad.flatten() * direction) orig_param = param.clone() c = 1e-4 iter = 0 while iter < self.maxiterls: new_param_value = orig_param - step * direction.view_as(orig_param) param.data = new_param_value new_func_value = self.loss() # Does not include the other wolfe condition as it requires us to calculate the gradient of the parameter # at the new point, which is only possible to calculate using finite differences with current architecture. if new_func_value <= initial_func_value + c * step * grad_dot_direction: break else: step *= 0.5 iter += 1 param.data = orig_param.data return step