
Note

You are reading the documentation for MMSelfSup 0.x, which will be deprecated by the end of 2022. We recommend you upgrade to the MMSelfSup 1.0.0rc versions to enjoy the new features and better performance brought by OpenMMLab 2.0. Check out the changelog, code, and documentation of MMSelfSup 1.0.0rc for more details.

Source code for mmselfsup.core.hooks.optimizer_hook

# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.runner import (HOOKS, Fp16OptimizerHook, OptimizerHook,
                         allreduce_grads)
from mmcv.utils import TORCH_VERSION, _BatchNorm, digit_version


@HOOKS.register_module()
class DistOptimizerHook(OptimizerHook):
    """Optimizer hook for distributed training.

    This hook can accumulate gradients every n intervals and freeze some
    layers for some iters at the beginning.

    Args:
        update_interval (int, optional): The update interval of the weights,
            set > 1 to accumulate the grad. Defaults to 1.
        grad_clip (dict, optional): Dict to config the value of grad clip.
            E.g., grad_clip = dict(max_norm=10). Defaults to None.
        coalesce (bool, optional): Whether allreduce parameters as a whole.
            Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
        frozen_layers_cfg (dict, optional): Dict to config frozen layers.
            The key-value pair is layer name and its frozen iters. If frozen,
            the layer gradient would be set to None. Defaults to dict().
    """

    def __init__(self,
                 update_interval=1,
                 grad_clip=None,
                 coalesce=True,
                 bucket_size_mb=-1,
                 frozen_layers_cfg=dict()):
        self.grad_clip = grad_clip
        self.coalesce = coalesce
        self.bucket_size_mb = bucket_size_mb
        self.update_interval = update_interval
        self.frozen_layers_cfg = frozen_layers_cfg
        self.initialized = False

    def has_batch_norm(self, module):
        if isinstance(module, _BatchNorm):
            return True
        for m in module.children():
            if self.has_batch_norm(m):
                return True
        return False

    def _init(self, runner):
        if runner.iter % self.update_interval != 0:
            runner.logger.warning(
                'Resume iter number is not divisible by update_interval in '
                'GradientCumulativeOptimizerHook, which means the gradient of '
                'some iters is lost and the result may be influenced slightly.'
            )

        if self.has_batch_norm(runner.model) and self.update_interval > 1:
            runner.logger.warning(
                'GradientCumulativeOptimizerHook may slightly decrease '
                'performance if the model has BatchNorm layers.')

        residual_iters = runner.max_iters

        self.divisible_iters = (
            residual_iters // self.update_interval * self.update_interval)
        self.remainder_iters = residual_iters - self.divisible_iters

        self.initialized = True

    def before_run(self, runner):
        runner.optimizer.zero_grad()

    def after_train_iter(self, runner):
        # In some cases, MMCV's GradientCumulativeOptimizerHook will
        # cause the loss_factor to be zero and we fix this bug in our
        # implementation.

        if not self.initialized:
            self._init(runner)

        if runner.iter < self.divisible_iters:
            loss_factor = self.update_interval
        else:
            loss_factor = self.remainder_iters

        runner.outputs['loss'] /= loss_factor
        runner.outputs['loss'].backward()

        if (self.every_n_iters(runner, self.update_interval)
                or self.is_last_iter(runner)):

            # cancel gradient of certain layers for n iters
            # according to the frozen_layers_cfg dict
            for layer, iters in self.frozen_layers_cfg.items():
                if runner.iter < iters:
                    for name, p in runner.model.module.named_parameters():
                        if layer in name:
                            p.grad = None

            if self.grad_clip is not None:
                grad_norm = self.clip_grads(runner.model.parameters())
                if grad_norm is not None:
                    # Add grad norm to the logger
                    runner.log_buffer.update({'grad_norm': float(grad_norm)},
                                             runner.outputs['num_samples'])
            runner.optimizer.step()
            runner.optimizer.zero_grad()
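For reference, this hook is normally enabled through the optimizer_config field of a training config. The sketch below is illustrative only: the interval, clip norm, and the 'backbone' layer-name prefix are assumed values, not taken from any shipped config. Note also how update_interval interacts with runner.max_iters in _init: with, say, max_iters=1002 and update_interval=4, the first 1000 iters divide the loss by 4 and the trailing 2 iters divide it by 2, so the trailing partial accumulation window still averages correctly.

# A minimal sketch, assuming an MMCV-style config file. All values here
# (the interval, max_norm, the 'backbone' prefix and its 1000 frozen iters)
# are illustrative assumptions.
optimizer_config = dict(
    type='DistOptimizerHook',
    update_interval=4,  # accumulate gradients over 4 iterations per step
    grad_clip=dict(max_norm=10),  # clip grad norm before each optimizer step
    frozen_layers_cfg=dict(backbone=1000),  # set grads of params whose name
                                            # contains 'backbone' to None for
                                            # the first 1000 iterations
)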
if (TORCH_VERSION != 'parrots'
        and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):

    @HOOKS.register_module()
    class GradAccumFp16OptimizerHook(Fp16OptimizerHook):
        """Fp16 optimizer hook (using PyTorch's implementation).

        This hook can accumulate gradients every n intervals and freeze some
        layers for some iters at the beginning.

        If you are using PyTorch >= 1.6, torch.cuda.amp is used as the
        backend to take care of the optimization procedure.

        Args:
            update_interval (int, optional): The update interval of the
                weights, set > 1 to accumulate the grad. Defaults to 1.
            frozen_layers_cfg (dict, optional): Dict to config frozen layers.
                The key-value pair is layer name and its frozen iters. If
                frozen, the layer gradient would be set to None.
                Defaults to dict().
        """

        def __init__(self,
                     update_interval=1,
                     frozen_layers_cfg=dict(),
                     **kwargs):
            super(GradAccumFp16OptimizerHook, self).__init__(**kwargs)
            self.update_interval = update_interval
            self.frozen_layers_cfg = frozen_layers_cfg

        def after_train_iter(self, runner):
            runner.outputs['loss'] /= self.update_interval
            self.loss_scaler.scale(runner.outputs['loss']).backward()

            if self.every_n_iters(runner, self.update_interval):

                # cancel gradient of certain layers for n iters
                # according to the frozen_layers_cfg dict
                for layer, iters in self.frozen_layers_cfg.items():
                    if runner.iter < iters:
                        for name, p in runner.model.module.named_parameters():
                            if layer in name:
                                p.grad = None

                # unscale the fp16 grads held by the optimizer in place
                self.loss_scaler.unscale_(runner.optimizer)

                if self.grad_clip is not None:
                    grad_norm = self.clip_grads(runner.model.parameters())
                    if grad_norm is not None:
                        # Add grad norm to the logger
                        runner.log_buffer.update(
                            {'grad_norm': float(grad_norm)},
                            runner.outputs['num_samples'])

                # step the optimizer and update the loss scaler
                self.loss_scaler.step(runner.optimizer)
                self.loss_scaler.update(self._scale_update_param)

                # save state_dict of loss_scaler
                runner.meta.setdefault(
                    'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()

                # clear grads
                runner.model.zero_grad()
                runner.optimizer.zero_grad()

else:
    @HOOKS.register_module()
    class GradAccumFp16OptimizerHook(Fp16OptimizerHook):
        """Fp16 optimizer hook (using mmcv's implementation).

        This hook can accumulate gradients every n intervals and freeze some
        layers for some iters at the beginning.

        Args:
            update_interval (int, optional): The update interval of the
                weights, set > 1 to accumulate the grad. Defaults to 1.
            frozen_layers_cfg (dict, optional): Dict to config frozen layers.
                The key-value pair is layer name and its frozen iters. If
                frozen, the layer gradient would be set to None.
                Defaults to dict().
        """

        def __init__(self,
                     update_interval=1,
                     frozen_layers_cfg=dict(),
                     **kwargs):
            super(GradAccumFp16OptimizerHook, self).__init__(**kwargs)
            self.update_interval = update_interval
            self.frozen_layers_cfg = frozen_layers_cfg

        def after_train_iter(self, runner):
            runner.outputs['loss'] /= self.update_interval

            # scale the loss value
            scaled_loss = runner.outputs['loss'] * self.loss_scaler.loss_scale
            scaled_loss.backward()

            if self.every_n_iters(runner, self.update_interval):

                # cancel gradient of certain layers for n iters
                # according to the frozen_layers_cfg dict
                for layer, iters in self.frozen_layers_cfg.items():
                    if runner.iter < iters:
                        for name, p in runner.model.module.named_parameters():
                            if layer in name:
                                p.grad = None

                # copy fp16 grads in the model to fp32 params in the optimizer
                fp32_weights = []
                for param_group in runner.optimizer.param_groups:
                    fp32_weights += param_group['params']
                self.copy_grads_to_fp32(runner.model, fp32_weights)

                # allreduce grads
                if self.distributed:
                    allreduce_grads(fp32_weights, self.coalesce,
                                    self.bucket_size_mb)

                has_overflow = self.loss_scaler.has_overflow(fp32_weights)
                # if it has overflowed, skip this iteration
                if not has_overflow:
                    # scale the gradients back
                    for param in fp32_weights:
                        if param.grad is not None:
                            param.grad.div_(self.loss_scaler.loss_scale)
                    if self.grad_clip is not None:
                        grad_norm = self.clip_grads(fp32_weights)
                        if grad_norm is not None:
                            # Add grad norm to the logger
                            runner.log_buffer.update(
                                {'grad_norm': float(grad_norm)},
                                runner.outputs['num_samples'])

                    # update fp32 params
                    runner.optimizer.step()

                    # copy fp32 params to the fp16 model
                    self.copy_params_to_fp16(runner.model, fp32_weights)
                else:
                    runner.logger.warning(
                        'Check overflow, downscale loss scale '
                        f'to {self.loss_scaler.cur_scale}')

                self.loss_scaler.update_scale(has_overflow)

                # save state_dict of loss_scaler
                runner.meta.setdefault(
                    'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()

                # clear grads
                runner.model.zero_grad()
                runner.optimizer.zero_grad()
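As a usage sketch, note that either branch above registers GradAccumFp16OptimizerHook under the same name, so a config does not need to know which PyTorch version is installed. The values below are illustrative assumptions; loss_scale and grad_clip are forwarded to Fp16OptimizerHook through **kwargs:

# A minimal sketch, assuming an MMCV-style config. 'dynamic' loss scaling,
# update_interval=2 and max_norm=5 are illustrative choices, not defaults.
optimizer_config = dict(
    type='GradAccumFp16OptimizerHook',
    update_interval=2,           # accumulate fp16 gradients over 2 iterations
    loss_scale='dynamic',        # forwarded to Fp16OptimizerHook via **kwargs
    grad_clip=dict(max_norm=5),  # also forwarded via **kwargs
)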