Source code for betty.hypergradient.neumann

import warnings

import torch

from betty.utils import neg_with_none


def neumann(vector, curr, prev, sync):
    """
    Approximate the matrix-vector multiplication with the best-response Jacobian by the
    Neumann series, as proposed in `Optimizing Millions of Hyperparameters by Implicit
    Differentiation <https://arxiv.org/abs/1911.02590>`_, based on the implicit function
    theorem (IFT). Users may specify the learning rate (``neumann_alpha``) and the number
    of unrolling steps (``neumann_iterations``) in ``Config``.

    :param vector:
        Vector with which the matrix-vector multiplication with the best-response
        Jacobian (matrix) is performed.
    :type vector: Sequence of Tensor
    :param curr: A current level problem
    :type curr: Problem
    :param prev: A directly lower-level problem to the current problem
    :type prev: Problem
    :param sync:
        If ``True``, accumulate the (negated) result into the gradients of ``prev``'s
        parameters via ``torch.autograd.backward`` and return ``None``; otherwise return
        the implicit gradient.
    :type sync: bool
    :return: (Intermediate) gradient
    :rtype: Sequence of Tensor
    """
    # ! Maybe replace with child.loss by adding self.loss attribute to save computation
    assert len(curr.paths) == 0, "neumann method is not supported for higher-order MLO!"
    config = curr.config
    in_loss = curr.training_step_exec(curr.cur_batch)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Gradient of the inner loss w.r.t. the inner parameters; create_graph=True so
        # that Hessian-vector products can be taken against it below.
        in_grad = torch.autograd.grad(
            in_loss, curr.trainable_parameters(), create_graph=True
        )
    # Neumann-series approximation of the inverse-Hessian-vector product.
    v2 = approx_inverse_hvp(
        vector,
        in_grad,
        curr.trainable_parameters(),
        iterations=config.neumann_iterations,
        alpha=config.neumann_alpha,
    )
    if sync:
        v2 = [neg_with_none(x) for x in v2]
        # Accumulate the mixed second-derivative term directly into prev's .grad buffers.
        torch.autograd.backward(
            in_grad, inputs=prev.trainable_parameters(), grad_tensors=v2
        )
        implicit_grad = None
    else:
        implicit_grad = torch.autograd.grad(
            in_grad, prev.trainable_parameters(), grad_outputs=v2
        )
        implicit_grad = [neg_with_none(ig) for ig in implicit_grad]

    return implicit_grad
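
The docstring above notes that ``neumann_alpha`` and ``neumann_iterations`` are read from the
problem's ``Config``. A minimal, hedged usage sketch (not part of this module) is shown below;
the ``type`` field used to select the Neumann hypergradient method is an assumption about the
``Config`` API.

# Hedged usage sketch: selecting the Neumann-series hypergradient for an upper-level
# problem. Field names follow the docstring above; ``type="neumann"`` is an assumed way
# to pick this module's method.
from betty.configs import Config

upper_config = Config(
    type="neumann",         # hypergradient method (assumed field name)
    neumann_iterations=5,   # unrolling steps of the Neumann series
    neumann_alpha=0.1,      # step size; must be small enough for the series to converge
)
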
def approx_inverse_hvp(v, f, params, iterations=3, alpha=1.0):
    # Approximate H^{-1} v via the truncated Neumann series
    # alpha * sum_{i=0}^{iterations} (I - alpha * H)^i v, where H is the Hessian of the
    # inner loss (f is the inner gradient, so autograd.grad(f, params, ...) gives H v).
    p = v
    for _ in range(iterations):
        hvp = torch.autograd.grad(f, params, grad_outputs=v, retain_graph=True)
        v = [v_i - alpha * hvp_i for v_i, hvp_i in zip(v, hvp)]
        p = [v_i + p_i for v_i, p_i in zip(v, p)]
    return [alpha * p_i for p_i in p]
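
As a sanity check, the sketch below (illustrative, not part of the Betty source) applies
``approx_inverse_hvp`` to a small quadratic loss whose Hessian ``A`` is known, so the exact
inverse-Hessian-vector product ``A^{-1} v`` can be compared against the Neumann approximation.
It assumes ``approx_inverse_hvp`` as defined above; the matrix, step size, and iteration count
are arbitrary choices that satisfy the series' convergence condition ``alpha < 2 / lambda_max(A)``.

# Sanity-check sketch: for the quadratic loss f(w) = 0.5 * w^T A w, the Hessian is A,
# so approx_inverse_hvp should recover A^{-1} v given enough iterations.
import torch

A = torch.tensor([[2.0, 0.0], [0.0, 4.0]])  # SPD Hessian with eigenvalues 2 and 4
w = torch.randn(2, requires_grad=True)
loss = 0.5 * w @ A @ w
(grad,) = torch.autograd.grad(loss, w, create_graph=True)  # grad = A w

v = [torch.tensor([1.0, 1.0])]
approx = approx_inverse_hvp(v, [grad], [w], iterations=100, alpha=0.2)
exact = torch.linalg.solve(A, v[0])  # A^{-1} v = [0.5, 0.25]
print(approx[0], exact)              # the two should nearly match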