# Source code for nni.compression.pruning.movement_pruner

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from __future__ import annotations

from collections import defaultdict
import logging
from typing import Dict, List, overload

import torch
from torch.optim import Adam

from .scheduled_pruner import ScheduledPruner
from .tools import is_active_target, generate_sparsity
from ..base.compressor import Compressor
from ..base.target_space import TargetType
from ..base.wrapper import ModuleWrapper
from ..utils import Evaluator, _EVALUATOR_DOCSTRING

# Name template of the per-target movement-score parameter that the pruner registers
# on a module wrapper; it is formatted with the target name before use.
MOVEMENT_SCORE_PNAME = '{}_mvp_score'
# Logger used for the debug / warning messages emitted by this module.
_logger = logging.getLogger(__name__)



# [docs]
class MovementPruner(ScheduledPruner):
    __doc__ = r"""
    Movement pruner is an implementation of movement pruning.
    This is a "fine-pruning" algorithm, which means the masks may change during each fine-tuning step.
    Each weight element will be scored by the opposite of the sum of the product of weight and its gradient during each step.
    This means the weight elements moving towards zero will accumulate negative scores,
    the weight elements moving away from zero will accumulate positive scores.
    The weight elements with low scores will be masked during inference.

    The following figure from the paper shows the weight pruning by movement pruning.

    .. image:: ../../../img/movement_pruning.png
        :target: ../../../img/movement_pruning.png
        :alt:

    For more details, please refer to `Movement Pruning: Adaptive Sparsity by Fine-Tuning <https://arxiv.org/abs/2005.07683>`__.

    Parameters
    ----------
    model
        Model to be pruned.
    config_list
        A list of dict, each dict configure which module need to be pruned, and how to prune.
        Please refer :doc:`Compression Config Specification </compression/config_list>` for more information.
    evaluator
        {evaluator_docstring}
    warmup_step
        The total `optimizer.step()` number before start pruning for warm up.
        Make sure ``warmup_step`` is smaller than ``cooldown_begin_step``.
    cooldown_begin_step
        The number of steps at which sparsity stops growing, note that the sparsity stop growing doesn't mean masks not changed.
        The sparse ratio or sparse threshold after each `optimizer.step()` is::

            final_sparse * (1 - (1 - (current_step - warm_up_step) / (cool_down_beginning_step - warm_up_step)) ** 3)
    regular_scale
        A scale factor used to control the movement score regular loss.
        This factor only works on pruning target controlled by ``sparse_threshold``,
        the pruning target controlled by ``sparse_ratio`` will not be regularized.

    Examples
    --------
        Please refer to
        :githublink:`examples/tutorials/new_pruning_bert_glue.py <examples/tutorials/new_pruning_bert_glue.py>`.
    """.format(evaluator_docstring=_EVALUATOR_DOCSTRING)

    # Typing-only signature: construction without pre-existing wrappers.
    @overload
    def __init__(self, model: torch.nn.Module, config_list: List[Dict], evaluator: Evaluator, warmup_step: int,
                 cooldown_begin_step: int, regular_scale: float = 1.):
        ...

    # Typing-only signature: construction that also accepts ``existed_wrappers``.
    @overload
    def __init__(self, model: torch.nn.Module, config_list: List[Dict], evaluator: Evaluator, warmup_step: int,
                 cooldown_begin_step: int, regular_scale: float = 1., existed_wrappers: Dict[str, ModuleWrapper] | None = None):
        ...

    def __init__(self, model: torch.nn.Module, config_list: List[Dict], evaluator: Evaluator, warmup_step: int,
                 cooldown_begin_step: int, regular_scale: float = 1., existed_wrappers: Dict[str, ModuleWrapper] | None = None):
        """
        Store the schedule settings and switch the target spaces to the movement apply methods.

        Raises
        ------
        AssertionError
            If ``0 <= warmup_step < cooldown_begin_step`` does not hold.
        """
        super().__init__(model, config_list, evaluator, existed_wrappers)
        self.evaluator: Evaluator
        assert 0 <= warmup_step < cooldown_begin_step

        # Schedule settings.
        self.warmup_step = warmup_step
        self.cooldown_begin_step = cooldown_begin_step
        self.regular_scale = regular_scale

        self._init_sparse_goals()
        self._set_apply_method()

        # The sparse goals are refreshed every ``interval_steps`` optimizer steps
        # between the end of warm up and the beginning of cool down.
        self.interval_steps = 1
        self.total_times = (cooldown_begin_step - warmup_step) // self.interval_steps
        self._remaining_times: int

        # Movement scores indexed as ``scores[module_name][target_name]``.
        self.scores: Dict[str, Dict[str, torch.Tensor]] = defaultdict(dict)

    @classmethod
    def from_compressor(cls, compressor: Compressor, new_config_list: List[Dict], warmup_step: int,
                        cooldown_begin_step: int, regular_scale: float = 1., evaluator: Evaluator | None = None):
        """
        Forward to the parent ``from_compressor``, passing the movement pruner settings as keyword arguments.
        """
        pruner_kwargs = {
            'warmup_step': warmup_step,
            'cooldown_begin_step': cooldown_begin_step,
            'regular_scale': regular_scale,
            'evaluator': evaluator,
        }
        return super().from_compressor(compressor, new_config_list, **pruner_kwargs)

    def _set_apply_method(self):
        """
        Replace the ``'mul'`` / ``'add'`` apply methods of every target space with their ``'movement_*'`` variants.
        """
        movement_variants = {'mul': 'movement_mul', 'add': 'movement_add'}
        for module_targets in self._target_spaces.values():
            for target_space in module_targets.values():
                replacement = movement_variants.get(target_space.apply_method)
                if replacement is not None:
                    target_space.apply_method = replacement

    def _register_movement_scores(self):
        """
        Register a zero-initialized score parameter on the wrapper of every active target
        and record it in ``self.scores``.

        Raises
        ------
        NotImplementedError
            If an active target is not of type ``TargetType.PARAMETER``.
        """
        for module_name, module_targets in self._target_spaces.items():
            for target_name, target_space in module_targets.items():
                if not is_active_target(target_space):
                    continue
                # TODO: add input / output
                if target_space.type is not TargetType.PARAMETER:
                    raise NotImplementedError()

                score_name = MOVEMENT_SCORE_PNAME.format(target_name)
                # TODO: here using a shrinked score to save memory, but need to test the speed.
                initial_score = torch.zeros_like(target_space.target)  # type: ignore
                scaler = target_space._scaler
                if scaler is not None:
                    initial_score = scaler.shrink(initial_score, keepdim=True)
                target_space._wrapper.register_parameter(score_name, torch.nn.Parameter(initial_score))
                # Keep the attribute as returned by the target space, not the freshly built Parameter object.
                self.scores[module_name][target_name] = target_space._get_wrapper_attr(score_name)

    def _register_scores_optimization(self, evaluator: Evaluator):
        """
        Create an ``Adam`` optimizer (learning rate ``1e-2``) over all registered scores and
        patch it into the evaluator as a before-step task. Does nothing when no score is registered.
        """
        all_scores = [score for module_scores in self.scores.values() for score in module_scores.values()]
        if not all_scores:
            return

        score_optimizer = Adam([{"params": all_scores}], lr=1e-2)

        def step_score_optimizer():
            score_optimizer.step()
            score_optimizer.zero_grad()

        evaluator.patch_optimizer_step(before_step_tasks=[step_score_optimizer], after_step_tasks=[])

    def _patch_loss(self, evaluator: Evaluator):
        """
        Patch the evaluator loss so that a regular term on the sigmoid of the scores is added.

        Only the targets whose ``sparse_threshold`` is set contribute; the averaged term is
        scaled by ``regular_scale`` and by a ratio computed from the remaining schedule times.
        """
        def add_score_regularization(original_loss, batch):
            reg_loss = 0.
            regularized_targets = 0
            for module_name, module_scores in self.scores.items():
                for target_name, score in module_scores.items():
                    if self._target_spaces[module_name][target_name].sparse_threshold is None:
                        continue
                    # Mean absolute value of sigmoid(score).
                    reg_loss += torch.norm(score.sigmoid(), p=1) / score.numel()  # type: ignore
                    regularized_targets += 1

            remaining_fraction = self._remaining_times / self.total_times
            ratio = max(0., min(1., 1 - remaining_fraction ** 3))
            if regularized_targets > 0:
                reg_loss = self.regular_scale * ratio * reg_loss / regularized_targets
            return original_loss + reg_loss

        evaluator.patch_loss(add_score_regularization)

    def _register_trigger(self, evaluator: Evaluator):
        """
        Reset the step counters and patch an after-step task into the evaluator that
        advances the sparse goals during the schedule and regenerates the masks after warm up.
        """
        self._current_step = 0
        self._iterial_step = 0
        self._remaining_times = self.total_times

        def after_optimizer_step():
            self._current_step += 1
            step = self._current_step
            if step <= self.warmup_step:
                # Still warming up: neither the goals nor the masks are touched.
                return

            if step <= self.cooldown_begin_step:
                self._iterial_step += 1
                if self._iterial_step == self.interval_steps:
                    self._remaining_times -= 1
                    self.update_sparse_goals(self.total_times - self._remaining_times)
                    debug_msg = f'{self.__class__.__name__} generate masks, remaining times {self._remaining_times}'
                    _logger.debug(debug_msg)
                    if self._remaining_times > 0:
                        self._iterial_step = 0

            # Masks are regenerated after every step past warm up, including after the cool down begins.
            self.update_masks(self.generate_masks())

        evaluator.patch_optimizer_step(before_step_tasks=[], after_step_tasks=[after_optimizer_step])

    def update_sparse_goals(self, current_times: int):
        """
        Update the sparse goals with the ratio ``1 - (1 - current_times / total_times) ** 3`` clamped to ``[0, 1]``.
        """
        progress = current_times / self.total_times
        ratio = max(0., min(1., 1 - (1 - progress) ** 3))
        self._update_sparse_goals_by_ratio(ratio)

    def _collect_data(self) -> Dict[str, Dict[str, torch.Tensor]]:
        """
        Return a detached copy of every movement score found on the wrappers,
        indexed by module name and target name. Targets without a score are skipped.
        """
        collected = defaultdict(dict)
        for module_name, module_targets in self._target_spaces.items():
            for target_name, target_space in module_targets.items():
                score_name = MOVEMENT_SCORE_PNAME.format(target_name)
                score: torch.Tensor = getattr(target_space._wrapper, score_name, None)  # type: ignore
                if score is None:
                    continue
                collected[module_name][target_name] = score.clone().detach()
        return collected

    def _calculate_metrics(self, data: Dict[str, Dict[str, torch.Tensor]]) -> Dict[str, Dict[str, torch.Tensor]]:
        """
        Turn the collected scores into metrics: the sigmoid of the score for targets with
        ``sparse_threshold`` set, the score itself for the other targets.
        """
        metrics = defaultdict(dict)
        for module_name, module_data in data.items():
            for target_name, score in module_data.items():
                uses_threshold = self._target_spaces[module_name][target_name].sparse_threshold is not None
                metrics[module_name][target_name] = score.sigmoid() if uses_threshold else score
        return metrics

    def _generate_sparsity(self, metrics: Dict[str, Dict[str, torch.Tensor]]) -> Dict[str, Dict[str, torch.Tensor]]:
        """
        Delegate to ``generate_sparsity`` with the given metrics and this pruner's target spaces.
        """
        return generate_sparsity(target_spaces=self._target_spaces, metrics=metrics)

    def _single_compress(self, max_steps: int | None, max_epochs: int | None):
        """
        Run the compression through ``_fusion_compress``.
        """
        self._fusion_compress(max_steps, max_epochs)

    def _fuse_preprocess(self, evaluator: Evaluator):
        """
        Reset the sparse goals to ratio ``0``, register the movement scores, then patch the evaluator
        (loss, score optimizer, step trigger — in this order).
        """
        self._update_sparse_goals_by_ratio(0.)
        self._register_movement_scores()
        self._patch_loss(evaluator)
        self._register_scores_optimization(evaluator)
        self._register_trigger(evaluator)

    def _fuse_postprocess(self, evaluator: Evaluator):
        """
        No post-processing is performed by this pruner.
        """

    def compress(self, max_steps: int | None, max_epochs: int | None):
        """
        Check the training duration against ``cooldown_begin_step`` and run the parent ``compress``.

        When ``max_steps`` is ``None`` the check cannot be made, so a warning is logged instead.

        Raises
        ------
        AssertionError
            If ``max_steps`` is given and smaller than ``cooldown_begin_step``.
        """
        if max_steps is None:
            warn_msg = \
                f'Using epochs number as training duration, please make sure the total training steps larger than `cooldown_begin_step`.'
            _logger.warning(warn_msg)
        else:
            assert max_steps >= self.cooldown_begin_step
        return super().compress(max_steps, max_epochs)