Note
You are reading the documentation for MMSelfSup 0.x, which will be deprecated at the end of 2022. We recommend upgrading to the MMSelfSup 1.0.0rc versions to benefit from the new features and better performance brought by OpenMMLab 2.0. Check out the changelog, code, and documentation of MMSelfSup 1.0.0rc for more details.
Source code for mmselfsup.core.hooks.optimizer_hook
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.runner import (HOOKS, Fp16OptimizerHook, OptimizerHook,
allreduce_grads)
from mmcv.utils import TORCH_VERSION, _BatchNorm, digit_version
@HOOKS.register_module()
class DistOptimizerHook(OptimizerHook):
    """Optimizer hook for distributed training.

    This hook can accumulate gradients every n intervals and freeze some
    layers for some iters at the beginning.

    Args:
        update_interval (int, optional): The update interval of the weights,
            set > 1 to accumulate the grad. Defaults to 1.
        grad_clip (dict, optional): Dict to config the value of grad clip.
            E.g., grad_clip = dict(max_norm=10). Defaults to None.
        coalesce (bool, optional): Whether allreduce parameters as a whole.
            Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
        frozen_layers_cfg (dict, optional): Dict to config frozen layers.
            The key-value pair is layer name and its frozen iters. If frozen,
            the layer gradient would be set to None. Defaults to None, which
            is treated as an empty dict (no layer is frozen).

    Raises:
        ValueError: If ``update_interval`` is smaller than 1.
    """

    def __init__(self,
                 update_interval=1,
                 grad_clip=None,
                 coalesce=True,
                 bucket_size_mb=-1,
                 frozen_layers_cfg=None):
        # Fail early: 0 would raise a modulo-by-zero later in ``_init`` and a
        # negative value would flip the sign of the loss in
        # ``after_train_iter``.
        if update_interval < 1:
            raise ValueError(
                f'update_interval must be >= 1, but got {update_interval}')
        super().__init__(grad_clip=grad_clip)
        # Kept explicit so the attributes read below do not depend on what
        # the parent constructor chooses to store.
        self.grad_clip = grad_clip
        self.coalesce = coalesce
        self.bucket_size_mb = bucket_size_mb
        self.update_interval = update_interval
        # A fresh dict per instance instead of a shared mutable default.
        self.frozen_layers_cfg = (
            {} if frozen_layers_cfg is None else frozen_layers_cfg)
        # ``_init`` needs the runner, so it is deferred to the first
        # training iteration.
        self.initialized = False

    def has_batch_norm(self, module):
        """Return True if ``module`` or any of its descendants is BatchNorm.

        Args:
            module: Object exposing ``children()``; checked recursively
                against ``_BatchNorm``.

        Returns:
            bool: Whether a ``_BatchNorm`` instance was found.
        """
        if isinstance(module, _BatchNorm):
            return True
        return any(self.has_batch_norm(child) for child in module.children())

    def _init(self, runner):
        """Compute the accumulation bookkeeping from the runner state.

        Sets ``divisible_iters`` (the largest multiple of
        ``update_interval`` not exceeding ``runner.max_iters``) and
        ``remainder_iters`` (the length of the trailing, incomplete
        accumulation window), then marks the hook as initialized.
        """
        hook_name = type(self).__name__

        if runner.iter % self.update_interval != 0:
            runner.logger.warning(
                'Resume iter number is not divisible by update_interval in '
                f'{hook_name}, which means the gradient of '
                'some iters is lost and the result may be influenced slightly.'
            )

        if self.has_batch_norm(runner.model) and self.update_interval > 1:
            runner.logger.warning(
                f'{hook_name} may slightly decrease '
                'performance if the model has BatchNorm layers.')

        # The split is based on the total ``max_iters`` (not on the
        # iterations left after a resume).
        self.remainder_iters = runner.max_iters % self.update_interval
        self.divisible_iters = runner.max_iters - self.remainder_iters

        self.initialized = True

    def before_run(self, runner):
        """Clear the optimizer gradients before training starts."""
        runner.optimizer.zero_grad()

    def _cancel_frozen_grads(self, runner):
        """Set to None the gradients of layers that are still frozen.

        A layer key of ``frozen_layers_cfg`` is frozen while ``runner.iter``
        is smaller than its configured iteration count. Every parameter
        whose name contains a frozen key gets ``grad = None``.
        """
        frozen_keys = [
            layer for layer, iters in self.frozen_layers_cfg.items()
            if runner.iter < iters
        ]
        # Only touch ``runner.model.module`` when something is frozen.
        if not frozen_keys:
            return
        for name, param in runner.model.module.named_parameters():
            if any(layer in name for layer in frozen_keys):
                param.grad = None

    def after_train_iter(self, runner):
        """Backpropagate the scaled loss and step on window boundaries.

        The loss in ``runner.outputs['loss']`` is divided in place by the
        length of the current accumulation window and backpropagated. When
        ``every_n_iters`` or ``is_last_iter`` (inherited helpers) report a
        boundary, frozen-layer gradients are cancelled, gradients are
        optionally clipped, and the optimizer is stepped and zeroed.
        """
        # In some cases, MMCV's GradientCumulativeOptimizerHook will
        # cause the loss_factor to be zero and we fix this bug in our
        # implementation.
        if not self.initialized:
            self._init(runner)

        # Iterations in the trailing, incomplete window are averaged over
        # the window's real length instead of ``update_interval``.
        if runner.iter < self.divisible_iters:
            loss_factor = self.update_interval
        else:
            loss_factor = self.remainder_iters

        runner.outputs['loss'] /= loss_factor
        runner.outputs['loss'].backward()

        if (self.every_n_iters(runner, self.update_interval)
                or self.is_last_iter(runner)):

            # cancel gradient of certain layer for n iters
            # according to frozen_layers_cfg dict
            self._cancel_frozen_grads(runner)

            if self.grad_clip is not None:
                grad_norm = self.clip_grads(runner.model.parameters())
                if grad_norm is not None:
                    # Add grad norm to the logger
                    runner.log_buffer.update({'grad_norm': float(grad_norm)},
                                             runner.outputs['num_samples'])

            runner.optimizer.step()
            runner.optimizer.zero_grad()
if (TORCH_VERSION != 'parrots'
        and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):

    @HOOKS.register_module()
    class GradAccumFp16OptimizerHook(Fp16OptimizerHook):
        """Fp16 optimizer hook (using PyTorch's implementation).

        This hook can accumulate gradients every n intervals and freeze some
        layers for some iters at the beginning.
        If you are using PyTorch >= 1.6, torch.cuda.amp is used as the
        backend, to take care of the optimization procedure.

        Args:
            update_interval (int, optional): The update interval of the
                weights, set > 1 to accumulate the grad. Defaults to 1.
            frozen_layers_cfg (dict, optional): Dict to config frozen layers.
                The key-value pair is layer name and its frozen iters. If
                frozen, the layer gradient would be set to None.
                Defaults to None, which is treated as an empty dict.
            **kwargs: Forwarded unchanged to ``Fp16OptimizerHook``.

        Raises:
            ValueError: If ``update_interval`` is smaller than 1.
        """

        def __init__(self,
                     update_interval=1,
                     frozen_layers_cfg=None,
                     **kwargs):
            # Dividing the loss by 0 or a negative number would silently
            # corrupt training, so reject it up front.
            if update_interval < 1:
                raise ValueError(
                    'update_interval must be >= 1, '
                    f'but got {update_interval}')
            super().__init__(**kwargs)
            self.update_interval = update_interval
            # A fresh dict per instance instead of a shared mutable default.
            self.frozen_layers_cfg = (
                {} if frozen_layers_cfg is None else frozen_layers_cfg)

        def _cancel_frozen_grads(self, runner):
            """Set to None the gradients of layers that are still frozen.

            A layer key is frozen while ``runner.iter`` is smaller than its
            configured iteration count; every parameter whose name contains
            a frozen key gets ``grad = None``.
            """
            frozen_keys = [
                layer for layer, iters in self.frozen_layers_cfg.items()
                if runner.iter < iters
            ]
            # Only touch ``runner.model.module`` when something is frozen.
            if not frozen_keys:
                return
            for name, param in runner.model.module.named_parameters():
                if any(layer in name for layer in frozen_keys):
                    param.grad = None

        def after_train_iter(self, runner):
            """Backpropagate the scaled loss and step on window boundaries.

            The loss is divided in place by ``update_interval``, scaled by
            ``self.loss_scaler`` and backpropagated. When ``every_n_iters``
            (inherited helper) reports a boundary, the optimizer is stepped
            through the loss scaler and all gradients are cleared.

            NOTE(review): unlike ``DistOptimizerHook`` in this file there is
            no ``is_last_iter`` flush here, so gradients of a trailing
            incomplete window appear never to be applied - confirm this is
            intended.
            """
            runner.outputs['loss'] /= self.update_interval
            self.loss_scaler.scale(runner.outputs['loss']).backward()

            if self.every_n_iters(runner, self.update_interval):

                # cancel gradient of certain layer for n iters
                # according to frozen_layers_cfg dict
                self._cancel_frozen_grads(runner)

                # unscale the accumulated grads in place so that clipping
                # operates on their true magnitude
                self.loss_scaler.unscale_(runner.optimizer)
                if self.grad_clip is not None:
                    grad_norm = self.clip_grads(runner.model.parameters())
                    if grad_norm is not None:
                        # Add grad norm to the logger
                        runner.log_buffer.update(
                            {'grad_norm': float(grad_norm)},
                            runner.outputs['num_samples'])

                # step the optimizer through the scaler, then update scaler
                self.loss_scaler.step(runner.optimizer)
                self.loss_scaler.update(self._scale_update_param)

                # save state_dict of loss_scaler
                runner.meta.setdefault(
                    'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()

                # clear grads
                runner.model.zero_grad()
                runner.optimizer.zero_grad()

else:

    @HOOKS.register_module()
    class GradAccumFp16OptimizerHook(Fp16OptimizerHook):
        """Fp16 optimizer hook (using mmcv's implementation).

        This hook can accumulate gradients every n intervals and freeze some
        layers for some iters at the beginning.

        Args:
            update_interval (int, optional): The update interval of the
                weights, set > 1 to accumulate the grad. Defaults to 1.
            frozen_layers_cfg (dict, optional): Dict to config frozen layers.
                The key-value pair is layer name and its frozen iters. If
                frozen, the layer gradient would be set to None.
                Defaults to None, which is treated as an empty dict.
            **kwargs: Forwarded unchanged to ``Fp16OptimizerHook``.

        Raises:
            ValueError: If ``update_interval`` is smaller than 1.
        """

        def __init__(self,
                     update_interval=1,
                     frozen_layers_cfg=None,
                     **kwargs):
            # Dividing the loss by 0 or a negative number would silently
            # corrupt training, so reject it up front.
            if update_interval < 1:
                raise ValueError(
                    'update_interval must be >= 1, '
                    f'but got {update_interval}')
            super().__init__(**kwargs)
            self.update_interval = update_interval
            # A fresh dict per instance instead of a shared mutable default.
            self.frozen_layers_cfg = (
                {} if frozen_layers_cfg is None else frozen_layers_cfg)

        def _cancel_frozen_grads(self, runner):
            """Set to None the gradients of layers that are still frozen.

            A layer key is frozen while ``runner.iter`` is smaller than its
            configured iteration count; every parameter whose name contains
            a frozen key gets ``grad = None``.
            """
            frozen_keys = [
                layer for layer, iters in self.frozen_layers_cfg.items()
                if runner.iter < iters
            ]
            # Only touch ``runner.model.module`` when something is frozen.
            if not frozen_keys:
                return
            for name, param in runner.model.module.named_parameters():
                if any(layer in name for layer in frozen_keys):
                    param.grad = None

        def after_train_iter(self, runner):
            """Backpropagate the scaled loss and step on window boundaries.

            The loss is divided in place by ``update_interval``, multiplied
            by the current loss scale and backpropagated. When
            ``every_n_iters`` (inherited helper) reports a boundary, the
            gradients are copied to the optimizer's parameters, optionally
            all-reduced, checked for overflow, unscaled, optionally clipped
            and applied; the loss scale is then updated and all gradients
            are cleared.

            NOTE(review): unlike ``DistOptimizerHook`` in this file there is
            no ``is_last_iter`` flush here, so gradients of a trailing
            incomplete window appear never to be applied - confirm this is
            intended.
            """
            runner.outputs['loss'] /= self.update_interval

            # scale the loss value
            scaled_loss = runner.outputs['loss'] * self.loss_scaler.loss_scale
            scaled_loss.backward()

            if self.every_n_iters(runner, self.update_interval):

                # cancel gradient of certain layer for n iters
                # according to frozen_layers_cfg dict
                self._cancel_frozen_grads(runner)

                # copy fp16 grads in the model to fp32 params in the optimizer
                fp32_weights = []
                for param_group in runner.optimizer.param_groups:
                    fp32_weights += param_group['params']
                self.copy_grads_to_fp32(runner.model, fp32_weights)

                # allreduce grads
                if self.distributed:
                    allreduce_grads(fp32_weights, self.coalesce,
                                    self.bucket_size_mb)

                has_overflow = self.loss_scaler.has_overflow(fp32_weights)

                # if has overflow, skip this iteration
                if not has_overflow:
                    # scale the gradients back
                    for param in fp32_weights:
                        if param.grad is not None:
                            param.grad.div_(self.loss_scaler.loss_scale)
                    if self.grad_clip is not None:
                        grad_norm = self.clip_grads(fp32_weights)
                        if grad_norm is not None:
                            # Add grad norm to the logger
                            runner.log_buffer.update(
                                {'grad_norm': float(grad_norm)},
                                runner.outputs['num_samples'])
                    # update fp32 params
                    runner.optimizer.step()
                    # copy fp32 params to the fp16 model
                    self.copy_params_to_fp16(runner.model, fp32_weights)

                # Update the scale before logging so that the warning
                # reports the new, already downscaled value rather than the
                # one that overflowed.
                self.loss_scaler.update_scale(has_overflow)
                if has_overflow:
                    runner.logger.warning(
                        'Check overflow, downscale loss scale '
                        f'to {self.loss_scaler.cur_scale}')

                # save state_dict of loss_scaler
                runner.meta.setdefault(
                    'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()

                # clear grads
                runner.model.zero_grad()
                runner.optimizer.zero_grad()