
Source code for betty.problems.problem

# Copyright Sang Keun Choe
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import sys
import abc

import torch
import torch.distributed as dist

from betty.patch.data_loader import get_distributed_data_loader
from betty.patch.optimizer import patch_optimizer
from betty.patch.scheduler import patch_scheduler
from betty.configs import Config
from betty.hypergradient import get_grads
from betty.utils import convert_tensor, log_from_loss_dict


class Problem:
    """
    This is the base class for an optimization problem in multilevel optimization.
    Specifically, each problem is defined by the parameter (or module), the sets of the upper
    and lower constraining problems, the dataset, the loss function, the optimizer, and other
    optimization configurations (e.g. best-response Jacobian calculation algorithm, number of
    unrolling steps, etc.).
    """

    def __init__(
        self,
        name,
        config=None,
        module=None,
        optimizer=None,
        scheduler=None,
        train_data_loader=None,
        extra_config=None,
    ):
        # basic configurations
        self._name = name
        self._config = config if config is not None else Config()
        self.cfg = extra_config

        # device
        self.device = None

        # distributed
        self._strategy = None
        self.accelerator = None
        self._distributed = False
        self._backend = None
        self._world_size = None
        self._rank = None
        self._local_rank = None

        # computation graph dependency
        self._parents = []
        self._children = []
        self._paths = []

        # data loader
        self.train_data_loader = train_data_loader
        self.train_data_iterator = None
        self.cur_batch = None
        self.epoch_counter = None

        # module
        self.module = module

        # optimizer & lr scheduler
        self.optimizer = optimizer
        self.scheduler = scheduler

        # environment
        self.env = None

        # fp16 scaler
        self._fp16 = config.fp16
        self.scaler = None
        if self._fp16:
            self.initial_dynamic_scale = config.initial_dynamic_scale
            self.scale_factor = config.scale_factor

        # gradient accumulation
        self.gas = config.gradient_accumulation

        # gradient clipping
        self.gradient_clipping = config.gradient_clipping

        # warmup
        self.warmup_steps = config.warmup_steps

        # logger
        self.logger = None
        self.log_step = config.log_step
        self.log_local_step = config.log_local_step

        # step counter
        self._count = 0
        self._global_step = 0

        # misc
        self._leaf = False
        self._first_order = False
        self._retain_graph = config.retain_graph
        self._allow_unused = config.allow_unused
        self._unroll_steps = config.unroll_steps
        self._roll_back = False
        self._inner_loop_start = True
        self._training = True

        self.ready = None

    def initialize(self):
        """
        ``initialize`` patches/sets up module, optimizer, data loader, etc. after compiling a
        user-provided configuration (e.g., fp16 training, iterative differentiation)
        """
        # initialize update ready to False
        self.ready = [False for _ in range(len(self._children))]

        # compile parents configurations
        first_order = []
        for problem in self._parents:
            parent_config = problem.config
            first_order.append(parent_config.first_order)
        self._first_order = all(first_order)

        # set inner_loop_start to True
        self._inner_loop_start = True

        # set up data loader
        if self.is_implemented("configure_train_data_loader"):
            if self.train_data_loader is None:
                self.train_data_loader = self.configure_train_data_loader()
        if self.train_data_loader is not None:
            if not isinstance(self.train_data_loader, tuple):
                self.train_data_loader = (self.train_data_loader,)
        else:
            assert self.is_implemented("get_batch")

        # set up module
        if self.is_implemented("configure_module"):
            if self.module is None:
                self.module = self.configure_module()
        assert self.module is not None, "Module must be specified!"

        # set up optimizer
        if self.is_implemented("configure_optimizer"):
            if self.optimizer is None:
                self.optimizer = self.configure_optimizer()

        # set up lr scheduler
        if self.is_implemented("configure_scheduler"):
            if self.scheduler is None:
                self.scheduler = self.configure_scheduler()

        # set up fp16 training
        if self._is_default_fp16():
            assert torch.cuda.is_available()
            scaler_cls = torch.cuda.amp.GradScaler
            if self._strategy == "fsdp":
                from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

                scaler_cls = ShardedGradScaler
            self.scaler = scaler_cls(
                init_scale=self.initial_dynamic_scale, growth_factor=self.scale_factor
            )

        # patch module, optimizer, data loader, and scheduler
        self.patch_everything()

        # make train_data_loader an iterator
        if self.train_data_loader is not None:
            self.train_data_iterator = []
            self.epoch_counter = []
            for train_data_loader in self.train_data_loader:
                self.train_data_iterator.append(iter(train_data_loader))
                self.epoch_counter.append(0)

        # Logging INFO
        path_str = [[node.name for node in path] for path in self._paths]
        children_str = [node.name for node in self._children]
        parents_str = [node.name for node in self._parents]
        if self.is_rank_zero():
            self.logger.info("*** Problem Information ***")
            self.logger.info(f"Name: {self._name}")
            self.logger.info(f"Uppers: {parents_str}")
            self.logger.info(f"Lowers: {children_str}")
            self.logger.info(f"Paths: {path_str}\n")

    def patch_everything(self):
        """
        We patch module, optimizer, data loader, and lr scheduler for device placement,
        distributed training, zero optimizer, fsdp, etc.
        """
        self.patch_module()
        self.patch_optimizer()
        if self.scheduler is not None:
            self.patch_scheduler()
        if self.train_data_loader is not None:
            self.train_data_loader = [
                self.patch_data_loader(data_loader)
                for data_loader in self.train_data_loader
            ]

    def patch_module(self):
        """
        Patch module given the systems configuration (e.g., DDP, FSDP)
        """
        self.module.to(self.device)
        if self._strategy in ["distributed", "zero"]:
            self.synchronize_params(self.parameters())
            self.module = torch.nn.parallel.DistributedDataParallel(
                module=self.module,
                gradient_as_bucket_view=True,
            )
        elif self._strategy == "fsdp":
            if self.is_rank_zero():
                self.logger.warning("FSDP requires PyTorch version >= 1.12")
            from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

            self.synchronize_params(self.parameters())
            self.module = FSDP(self.module, device_id=self.device)
        elif self._strategy == "accelerate":
            self.module = self.accelerator.prepare(self.module)

    def patch_optimizer(self):
        """
        Patch optimizer given the systems configuration (e.g., DDP, FSDP)
        """
        params = self.trainable_parameters()
        if self.is_implemented("param_groups") and self._strategy != "fsdp":
            params = self.param_groups()
        is_zero = True if self._strategy == "zero" else False
        if self._strategy == "accelerate":
            self.optimizer = self.accelerator.prepare(self.optimizer)
        else:
            self.optimizer = patch_optimizer(self.optimizer, params, is_zero)

    def patch_scheduler(self):
        """
        Patch scheduler given the systems configuration (e.g., DDP, FSDP)
        """
        self.scheduler = patch_scheduler(self.scheduler, self.optimizer)
        if self._strategy == "accelerate":
            self.scheduler = self.accelerator.prepare(self.scheduler)

    def patch_data_loader(self, loader):
        """
        Patch data loader given the systems configuration (e.g., DDP, FSDP)
        """
        if self._strategy in ["distributed", "zero", "fsdp"]:
            patched_loader = get_distributed_data_loader(
                loader, world_size=self._world_size, rank=self._rank
            )
        elif self._strategy == "accelerate":
            patched_loader = self.accelerator.prepare(loader)
        else:
            patched_loader = loader

        return patched_loader

    def set_module(self, module):
        """
        Set a new module for the current Problem class.
        """
        self.module = module
        self.patch_module()

    def set_optimizer(self, optimizer):
        """
        Set a new optimizer for the current Problem class.
        """
        self.optimizer = optimizer
        self.patch_optimizer()

    def set_scheduler(self, scheduler):
        """
        Set a new scheduler for the current Problem class.
        """
        self.scheduler = scheduler
        self.patch_scheduler()

    def set_train_data_loader(self, loader, idx=0):
        """
        Set a new data loader for the current Problem class.
        """
        self.train_data_loader[idx] = self.patch_data_loader(loader)

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        """
        Users define the forward (or call) behavior of the problem here.
        """
        return self.module(*args, **kwargs)

    @abc.abstractmethod
    def training_step(self, batch):
        """
        Users define the loss function of the problem here.
        """
        raise NotImplementedError

    def training_step_exec(self, batch):
        if self._is_default_fp16():
            with torch.cuda.amp.autocast():
                return self.training_step(batch)
        else:
            return self.training_step(batch)

    def one_step_descent(self, batch=None):
        # load data
        if batch is None:
            self.cur_batch = self.get_batch()
            batch = self.cur_batch

        # calculate loss
        loss, loss_dict = self.get_loss(batch)

        # calculate gradient (a.k.a. backward)
        self.backward(
            loss=loss,
            params=self.trainable_parameters(),
            paths=self._paths,
            create_graph=not self._first_order,
            retain_graph=self._retain_graph,
            allow_unused=self._allow_unused,
        )
        if self.is_implemented("grad_callback"):
            self.grad_callback()

        # calculate parameter update
        if self._count % self.gas == 0:
            self.optimizer_step()

            # param callback (e.g., parameter clipping)
            if self.is_implemented("param_callback"):
                self.param_callback()

            if self._strategy != "default" and self._count % (self.gas * 20) == 0:
                self.synchronize_params(self.trainable_parameters())

            # zero-out grad
            self.zero_grad()

        return loss_dict

    def step_normal(self, global_step=None):
        if self.check_ready():
            # loop start
            if self._inner_loop_start:
                if self.is_implemented("on_inner_loop_start"):
                    self.on_inner_loop_start()
                self._inner_loop_start = False

                # copy current parameters, buffers, optimizer states
                if self._roll_back:
                    self.cache_states()

            # increase count (local step)
            if self._training:
                self._count += 1

            # one-step gradient descent
            loss_dict = self.one_step_descent()

            # lr scheduler step
            if self.scheduler is not None and not self._roll_back:
                self.scheduler.step()

            # logging
            if (
                self.log_step > 0
                and self._count % self.log_step == 0
                and self.is_rank_zero()
            ):
                self.log(loss_dict, global_step)

            # call parent step_normal after unrolling
            if (
                self._training
                and self._count % (self._unroll_steps * self.gas) == 0
                and self._count > self.warmup_steps
            ):
                for problem in self._parents:
                    idx = problem.children.index(self)
                    problem.ready[idx] = True
                    problem.step_normal(global_step=global_step)

                self._inner_loop_start = True

            self.ready = [False for _ in range(len(self._children))]

    def step_after_roll_back(self):
        if self.check_ready() and self._training:
            if self._roll_back:
                # recover from cached states
                self.recover_states()

                # one-step gradient descent
                _ = self.one_step_descent(batch=self.cur_batch)

                # lr scheduler
                if self.scheduler is not None:
                    self.scheduler.step()

            # call parent step_after_roll_back
            for problem in self._parents:
                idx = problem.children.index(self)
                problem.ready[idx] = True
                problem.step_after_roll_back()

            self.ready = [False for _ in range(len(self._children))]

    def step(self, global_step=None):
        """
        ``step`` method abstracts a one-step gradient descent update with four sub-steps:
        1) data loading, 2) cost calculation, 3) gradient calculation, and 4) parameter
        update. It also calls upper-level problems' step methods after unrolling gradient
        steps based on the hierarchical dependency graph.

        :param global_step: global step of the whole multilevel optimization. Defaults to None.
        :type global_step: int, optional
        """
        self._global_step = global_step
        self.step_normal(global_step=global_step)
        if (
            self._count % (self._unroll_steps * self.gas) == 0
            and self._count > self.warmup_steps
        ):
            self.step_after_roll_back()

    def get_batch(self):
        """
        Load a training batch from the user-provided data loader

        :return: New training batch
        :rtype: Any
        """
        batch = tuple(
            self.get_batch_single_loader(i) for i in range(len(self.train_data_loader))
        )

        return batch[0] if len(batch) == 1 else batch

    def get_batch_single_loader(self, idx):
        """
        Load a training batch from one of the user-provided data loader(s)

        :return: New training batch
        :rtype: Any
        """
        data_iterator = self.train_data_iterator[idx]
        try:
            batch = next(data_iterator)
        except StopIteration:
            if idx == 0:
                self.epoch_callback_exec()
            self.epoch_counter[idx] += 1
            train_data_loader = self.train_data_loader[idx]
            if self._strategy in ["distributed", "zero", "fsdp"]:
                train_data_loader.set_epoch(self.epoch_counter[idx])
            self.train_data_iterator[idx] = iter(train_data_loader)
            batch = next(self.train_data_iterator[idx])

        if not isinstance(batch, dict):
            batch = tuple(
                convert_tensor(value, self.device, self._is_default_fp16())
                for value in batch
            )
        else:
            for key, value in batch.items():
                batch[key] = convert_tensor(value, self.device, self._is_default_fp16())

        return batch

    def get_loss(self, batch):
        """
        Calculate loss and log metrics for the current batch based on the user-defined loss
        function.

        :return: loss and log metrics (e.g. classification accuracy)
        :rtype: dict
        """
        maybe_loss_dict = self.training_step_exec(batch)
        is_dict = isinstance(maybe_loss_dict, dict)
        loss = maybe_loss_dict["loss"] if is_dict else maybe_loss_dict
        loss_no_scale = loss.item()
        if self._is_default_fp16():
            loss = self.scaler.scale(loss)
        loss = loss / self.gas

        # construct loss dict
        loss_dict = {"loss": loss_no_scale}
        if is_dict:
            for key, value in maybe_loss_dict.items():
                if key != "loss":
                    loss_dict[key] = value

        return loss, loss_dict

    def backward(
        self,
        loss,
        params,
        paths,
        create_graph=False,
        retain_graph=True,
        allow_unused=True,
    ):
        """
        Calculate the gradient of ``loss`` with respect to ``params`` based on a user-defined
        ``config``.

        :param loss: Output of the differentiated function.
        :type loss: Tensor
        :param params: Inputs with respect to which the gradient will be returned.
        :type params: Sequence of Tensor
        :param paths: Paths along which the gradient will be calculated.
        :type paths: List of list of Problem
        :param create_graph:
            If ``True``, the graph of the derivative will be constructed, allowing
            higher-order derivative products to be computed. Default: ``False``.
        :type create_graph: bool, optional
        :param retain_graph:
            If ``False``, the graph used to compute the grad will be freed. Note that in
            nearly all cases setting this option to ``True`` is not needed and often can be
            worked around in a much more efficient way. Default: ``True``.
        :type retain_graph: bool, optional
        :param allow_unused:
            If ``False``, specifying inputs that were not used when computing outputs (and
            therefore whose grad is always zero) is an error. Default: ``True``.
        :type allow_unused: bool, optional
        """
        # direct grad
        if len(paths) > 0 or not self.gradient_accumulation_boundary():
            grads = torch.autograd.grad(
                loss,
                params,
                create_graph=create_graph,
                retain_graph=retain_graph,
                allow_unused=allow_unused,
            )
            self.set_grads(params, grads)
        else:
            torch.autograd.backward(
                loss,
                inputs=params,
                create_graph=create_graph,
                retain_graph=retain_graph,
            )

        # indirect grad: best-response Jacobian
        if self._config.first_order:
            for idx, path in enumerate(paths):
                retain_graph_implicit = False if idx == len(paths) - 1 else True
                do_sync = bool(
                    idx == len(paths) - 1 and self.gradient_accumulation_boundary()
                )
                grads = get_grads(loss, path, retain_graph_implicit, do_sync)
                if not do_sync:
                    self.set_grads(params, grads)

    def set_grads(self, params, grads):
        """
        Set gradients for trainable parameters. ``params.grad = grads``

        :param params: Trainable parameters
        :type params: Sequence of Tensor
        :param grads: Calculated gradient
        :type grads: Sequence of Tensor
        """
        for param, grad in zip(params, grads):
            if grad is not None:
                if hasattr(param, "grad") and param.grad is not None:
                    param.grad = param.grad + grad
                else:
                    param.grad = grad

    def synchronize_params(self, params):
        """
        Synchronize parameters across distributed data-parallel processes
        """
        if self._world_size > 1 and self._strategy not in ["fsdp", "accelerate"]:
            for param in params:
                dist.broadcast(param.data, 0)

    @abc.abstractmethod
    def optimizer_step(self, *args, **kwargs):
        """
        Update weights as in PyTorch's native ``optim.step()``
        """
        raise NotImplementedError

    def zero_grad(self):
        """
        Set gradients of the trainable parameters for the current problem to 0. Similar to
        PyTorch's ``optim.zero_grad()`` or ``module.zero_grad()``.
        """
        for param in list(self.trainable_parameters()):
            if hasattr(param, "grad"):
                del param.grad

    def clip_grad(self):
        """
        Perform gradient clipping based on the norm provided by Config
        """
        if self._strategy != "fsdp":
            torch.nn.utils.clip_grad_norm_(
                parameters=self.trainable_parameters(), max_norm=self.gradient_clipping
            )
        else:
            self.module.clip_grad_norm_(max_norm=self.gradient_clipping)

    def state_dict(self):
        """
        Return all states involved in ``Problem`` as a Python dictionary. By default, it
        includes ``self.module.state_dict`` and ``self.optimizer.state_dict``. Depending on
        users' configurations, it may include ``self.scheduler.state_dict`` (lr scheduler)
        and ``self.scaler.state_dict`` (fp16 training).
        """
        state_dict = {}
        state_dict["module"] = self.module.state_dict()
        state_dict["optimizer"] = self.optimizer.state_dict()
        if self.scheduler is not None:
            state_dict["scheduler"] = self.scheduler.state_dict()
        if self._is_default_fp16():
            state_dict["scaler"] = self.scaler.state_dict()

        return state_dict

    def load_state_dict(self, state_dict):
        """
        Load the state for the ``Problem``

        Args:
            state_dict (dict): Python dictionary of Problem states.
        """
        self.module.load_state_dict(state_dict["module"])
        self.optimizer.load_state_dict(state_dict["optimizer"])
        if self.scheduler is not None and "scheduler" in state_dict:
            self.scheduler.load_state_dict(state_dict["scheduler"])
        if self._is_default_fp16() and "scaler" in state_dict:
            self.scaler.load_state_dict(state_dict["scaler"])

    def configure_distributed_training(self, dictionary):
        """
        Set the configuration for distributed training.

        :param dictionary: Python dictionary of distributed training provided by Engine.
        :type dictionary: dict
        """
        self._strategy = dictionary["strategy"]
        self._backend = dictionary["backend"]
        self._world_size = dictionary["world_size"]
        self._rank = dictionary["rank"]
        self._local_rank = dictionary["local_rank"]

    def configure_roll_back(self, roll_back):
        """
        Set the roll-back (warm-start) option from Engine

        :param roll_back: roll-back (warm-start) on/off
        :type roll_back: bool
        """
        if len(self._parents) > 0:
            self._roll_back = roll_back

    def configure_device(self, device):
        """
        Set the device for the current problem.
        """
        self.device = device

    def get_opt_param_group_for_param(self, param):
        """
        Get the optimizer param_group for a specific parameter

        :param param: Parameter for which the optimizer param_group is inquired
        :type param: torch.nn.Parameter
        :return: param_group for the given parameter
        :rtype: dict
        """
        param_groups = self.optimizer.param_groups
        for group in param_groups:
            for p in group["params"]:
                if param is p:
                    return group

    def get_opt_state_for_param(self, param):
        """
        Get the optimizer state for a specific parameter

        :param param: Parameter for which the optimizer state is inquired
        :type param: torch.nn.Parameter
        :return: optimizer state for the given parameter
        :rtype: dict
        """
        state = self.optimizer.state
        return state[param]

    @abc.abstractmethod
    def cache_states(self):
        """
        Cache params, buffers, optimizer states when ``config.roll_back`` is set to ``True``
        in ``step``.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def recover_states(self):
        """
        Recover params, buffers, optimizer states when ``config.roll_back`` is set to ``True``
        in ``step``.
        """
        raise NotImplementedError

    def epoch_callback_exec(self):
        if self.is_implemented("epoch_callback"):
            self.epoch_callback()

    def gradient_accumulation_boundary(self):
        """
        Check whether the current step is on the gradient accumulation boundary
        """
        return bool(self._count % self.gas == 0)

    def _is_default_fp16(self):
        """
        Check whether to use PyTorch's native fp16 (mixed-precision) feature
        """
        if not self._fp16 or self._strategy in ["accelerate"]:
            return False
        return True

    def is_implemented(self, fn_name):
        """
        Check if the ``fn_name`` method is implemented in the class

        :rtype: bool
        """
        return callable(getattr(self, fn_name, None))

    def check_ready(self):
        """
        Check if the unrolling processes of lower-level problems in the hierarchical
        dependency graph are all ready/done. The ``step`` function is only executed when this
        method returns ``True``.

        :rtype: bool
        """
        return all(self.ready)

    def log(self, stats, global_step):
        """
        Log (training) stats to ``self.logger``

        :param stats: log metrics such as loss and classification accuracy.
        :type stats: Any
        :param global_step: global/local step associated with the ``stats``.
        :type global_step: int
        """
        loss_log = log_from_loss_dict(stats)
        if global_step is None:
            self.logger.info(
                f'[Problem "{self._name}"] [Local Step {self._count}] {loss_log}'
            )
        else:
            self.logger.info(
                f'[Problem "{self._name}"] [Global Step {global_step}] '
                f"[Local Step {self._count}] {loss_log}"
            )
        cur_step = global_step
        if global_step is None or self.log_local_step:
            cur_step = self._count
        self.logger.log(stats, tag=self._name, step=cur_step)

    def add_child(self, problem):
        """
        Add ``problem`` to the lower-level problem list.

        :param problem: lower-level problem in the dependency graph
        :type problem: Problem
        """
        assert problem not in self._children
        self._children.append(problem)

    def add_parent(self, problem):
        """
        Add ``problem`` to the upper-level problem list.

        :param problem: upper-level problem in the dependency graph
        :type problem: Problem
        """
        assert problem not in self._parents
        self._parents.append(problem)

    def add_paths(self, paths):
        """
        Add new hypergradient backpropagation paths.
        """
        self._paths.extend(paths)

    def add_logger(self, logger):
        """
        Add a logger to the current problem.

        :param logger: logger defined by users in ``Engine``.
        """
        if self.logger is None:
            self.logger = logger

    def add_env(self, env):
        """
        Add an environment to the current problem.

        :param env: Environment.
        """
        if self.env is None:
            self.env = env

    @abc.abstractmethod
    def parameters(self):
        """
        Return all parameters for the current problem.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def trainable_parameters(self):
        """
        Define all *trainable* parameters for the current problem.
        """
        raise NotImplementedError

    def clear_dependencies(self):
        """
        Clear the dependencies of the current problem.
        """
        self._children = []
        self._parents = []
        self._paths = []

    def train(self):
        """
        Set the current problem to training mode.
        """
        self._training = True

    def eval(self):
        """
        Set the current problem to evaluation mode.
        """
        self._training = False

    def is_rank_zero(self):
        """
        Check whether the current process is rank 0.
        """
        return self._rank == 0

    @property
    def name(self):
        """
        Return the user-defined name of the problem.
        """
        return self._name

    @property
    def config(self):
        """
        Return the configuration for the current problem.
        """
        return self._config

    @property
    def children(self):
        """
        Return lower-level problems for the current problem.
        """
        return self._children

    @property
    def parents(self):
        """
        Return upper-level problems for the current problem.
        """
        return self._parents

    @property
    def paths(self):
        """
        Return hypergradient calculation paths for the current problem.
        """
        return self._paths

    @property
    def leaf(self):
        """
        Return whether the current problem is a leaf or not.

        :return: leaf
        :rtype: bool
        """
        return self._leaf

    @property
    def count(self):
        """
        Return the local step for the current problem.

        :return: local step
        :rtype: int
        """
        return self._count

    @leaf.setter
    def leaf(self, leaf):
        """
        Set the current problem as a leaf problem.
        """
        self._leaf = leaf
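
The listing above leaves the data-loading, module, optimizer, and loss hooks to the user. Below is a minimal, illustrative sketch of how a user-defined problem might fill in those hooks (``configure_module``, ``configure_optimizer``, ``configure_train_data_loader``, ``training_step``). It assumes a concrete subclass such as ``ImplicitProblem`` that supplies ``optimizer_step``, ``parameters``/``trainable_parameters``, and the state caching methods; the ``Classifier`` name and toy data are hypothetical and used purely for illustration.

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from betty.configs import Config
# Assumption: a concrete subclass of Problem (here ImplicitProblem) provides
# optimizer_step, parameters/trainable_parameters, and cache/recover_states.
from betty.problems import ImplicitProblem


class Classifier(ImplicitProblem):
    # Called by initialize() when no module was passed to the constructor.
    def configure_module(self):
        return torch.nn.Linear(10, 2)

    # Called by initialize() when no optimizer was passed to the constructor.
    def configure_optimizer(self):
        return torch.optim.SGD(self.module.parameters(), lr=0.1)

    # Called by initialize() when no train_data_loader was passed to the constructor.
    def configure_train_data_loader(self):
        x, y = torch.randn(64, 10), torch.randint(0, 2, (64,))
        return DataLoader(TensorDataset(x, y), batch_size=8, shuffle=True)

    # User-defined loss; get_loss() accepts either a bare tensor or a dict with a "loss" key.
    def training_step(self, batch):
        x, y = batch
        logits = self.forward(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(dim=-1) == y).float().mean().item()
        return {"loss": loss, "acc": acc}


# In practice the problem is registered with an Engine, which calls initialize() and step().
classifier = Classifier(name="classifier", config=Config())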