# Source code for nni.nas.hub.pytorch.proxylessnas

# Copyright (c) Microsoft Corporation.

import math
from typing import Optional, Callable, List, Tuple, Iterator, Union, cast, overload

import torch
import nni.nas.nn.pytorch as nn
from nni.nas import model_wrapper

from .utils.fixed import FixedFactory
from .utils.pretrained import load_pretrained_weight

@overload
def make_divisible(v: Union[int, float], divisor, min_val=None) -> int:
    ...

@overload
def make_divisible(v: Union[nn.ChoiceOf[int], nn.ChoiceOf[float]], divisor, min_val=None) -> nn.ChoiceOf[int]:
    ...

def make_divisible(v: Union[nn.ChoiceOf[int], nn.ChoiceOf[float], int, float], divisor, min_val=None) -> nn.MaybeChoice[int]:
"""
This function is taken from the original tf repo.
It ensures that all layers have a channel number that is divisible by 8
It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
"""
if min_val is None:
min_val = divisor
# This should work for both value choices and constants.
new_v = nn.ValueChoice.max(min_val, round(v + divisor // 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
return nn.ValueChoice.condition(new_v < 0.9 * v, new_v + divisor, new_v)
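
# Usage sketch (not part of the original module): with plain integers the helper follows
# the usual MobileNet rounding rule, assuming ValueChoice.max / ValueChoice.condition fall
# back to ordinary arithmetic on constants, as the comment above suggests, e.g.
#
#   make_divisible(37, 8)   # -> 40, rounded to the nearest multiple of 8
#   make_divisible(30, 8)   # -> 32, never rounded down by more than 10%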

def simplify_sequential(sequentials: List[nn.Module]) -> Iterator[nn.Module]:
    """
    Flatten the sequential blocks so that the hierarchy looks better.
    Eliminate identity modules automatically.
    """
    for module in sequentials:
        if isinstance(module, nn.Sequential):
            for submodule in module.children():
                # no recursive expansion
                if not isinstance(submodule, nn.Identity):
                    yield submodule
        else:
            if not isinstance(module, nn.Identity):
                yield module
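
# Illustration (not part of the original module): nested Sequentials are expanded one
# level and Identity placeholders are dropped, e.g.
#
#   blocks = [nn.Sequential(nn.ReLU6(), nn.Identity()), nn.Identity(), nn.ReLU6()]
#   list(simplify_sequential(blocks))   # -> [ReLU6(), ReLU6()]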

class ConvBNReLU(nn.Sequential):
    """
    The template for a conv-bn-relu block.
    """

    def __init__(
        self,
        in_channels: nn.MaybeChoice[int],
        out_channels: nn.MaybeChoice[int],
        kernel_size: nn.MaybeChoice[int] = 3,
        stride: int = 1,
        groups: nn.MaybeChoice[int] = 1,
        norm_layer: Optional[Callable[[int], nn.Module]] = None,
        activation_layer: Optional[Callable[..., nn.Module]] = None,
        dilation: int = 1,
    ) -> None:
        padding = (kernel_size - 1) // 2 * dilation
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if activation_layer is None:
            activation_layer = nn.ReLU6
        # If no normalization is used, set bias to True
        norm = norm_layer(cast(int, out_channels))
        no_normalization = isinstance(norm, nn.Identity)
        blocks: List[nn.Module] = [
            nn.Conv2d(
                cast(int, in_channels),
                cast(int, out_channels),
                cast(int, kernel_size),
                stride,
                cast(int, padding),
                dilation=dilation,
                groups=cast(int, groups),
                bias=no_normalization
            ),
            # Normalization, regardless of batchnorm or identity
            norm,
            # One pytorch implementation adds an SE here, to faithfully reproduce the paper.
            # We follow the more accepted approach of putting SE outside.
            # Reference: https://github.com/d-li14/mobilenetv3.pytorch/issues/18
            activation_layer(inplace=True)
        ]

        super().__init__(*simplify_sequential(blocks))
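
# Usage sketch (illustrative, not part of the original module): with the default layers
# this builds a Conv2d -> BatchNorm2d -> ReLU6 stack; passing nn.Identity as norm_layer
# drops the BN and enables the conv bias instead.
#
#   block = ConvBNReLU(16, 32, kernel_size=3, stride=2)
#   y = block(torch.zeros(1, 16, 224, 224))   # -> shape (1, 32, 112, 112)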

class DepthwiseSeparableConv(nn.Sequential):
    """
    In the original MobileNetV2 implementation, this is InvertedResidual when expand ratio = 1.
    Residual connection is added if input and output shape are the same.

    References:

    - https://github.com/rwightman/pytorch-image-models/blob/b7cb8d03/timm/models/efficientnet_blocks.py#L90
    - https://github.com/ultmaster/AceNAS/blob/46c8895f/searchspace/proxylessnas/utils.py#L100
    """

    def __init__(
        self,
        in_channels: nn.MaybeChoice[int],
        out_channels: nn.MaybeChoice[int],
        kernel_size: nn.MaybeChoice[int] = 3,
        stride: int = 1,
        squeeze_excite: Optional[Callable[[nn.MaybeChoice[int], nn.MaybeChoice[int]], nn.Module]] = None,
        norm_layer: Optional[Callable[[int], nn.Module]] = None,
        activation_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        blocks = [
            # dw
            ConvBNReLU(in_channels, in_channels, stride=stride, kernel_size=kernel_size, groups=in_channels,
                       norm_layer=norm_layer, activation_layer=activation_layer),
            # optional se
            squeeze_excite(in_channels, in_channels) if squeeze_excite else nn.Identity(),
            # pw-linear
            ConvBNReLU(in_channels, out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.Identity)
        ]
        super().__init__(*simplify_sequential(blocks))
        # NOTE: "is" is used here instead of "==" to avoid creating a new value choice.
        self.has_skip = stride == 1 and in_channels is out_channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.has_skip:
            return x + super().forward(x)
        else:
            return super().forward(x)
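
# Usage sketch (illustrative): a depthwise 3x3 conv followed by a pointwise projection;
# the residual add only kicks in when stride is 1 and the very same channel object is
# passed for input and output (the "is" check above), so the same variable is reused here.
#
#   ch = 32
#   conv = DepthwiseSeparableConv(ch, ch, kernel_size=3, stride=1)
#   y = conv(torch.zeros(1, ch, 56, 56))   # skip connection active, shape preserved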

class InvertedResidual(nn.Sequential):
    """
    An Inverted Residual Block, sometimes called an MBConv Block, is a type of residual block used for image models
    that uses an inverted structure for efficiency reasons.

    It was originally proposed for the `MobileNetV2 <https://arxiv.org/abs/1801.04381>`__ CNN architecture.
    It has since been reused for several mobile-optimized CNNs.
    It follows a narrow -> wide -> narrow approach, hence the inversion.
    It first widens with a 1x1 convolution, then uses a 3x3 depthwise convolution (which greatly reduces the number
    of parameters), then a 1x1 convolution is used to reduce the number of channels so input and output can be added.

    This implementation is sort of a mixture between:

    - https://github.com/google-research/google-research/blob/20736344/tunas/rematlib/mobile_model_v3.py#L453
    - https://github.com/rwightman/pytorch-image-models/blob/b7cb8d03/timm/models/efficientnet_blocks.py#L134

    Parameters
    ----------
    in_channels
        The number of input channels. Can be a value choice.
    out_channels
        The number of output channels. Can be a value choice.
    expand_ratio
        The ratio of intermediate channels with respect to input channels. Can be a value choice.
    kernel_size
        The kernel size of the depthwise convolution. Can be a value choice.
    stride
        The stride of the depthwise convolution.
    squeeze_excite
        Callable to create squeeze and excitation layer. Take hidden channels and input channels as arguments.
    norm_layer
        Callable to create normalization layer. Take input channels as argument.
    activation_layer
        Callable to create activation layer. No input arguments.
    """

    def __init__(
        self,
        in_channels: nn.MaybeChoice[int],
        out_channels: nn.MaybeChoice[int],
        expand_ratio: nn.MaybeChoice[float],
        kernel_size: nn.MaybeChoice[int] = 3,
        stride: int = 1,
        squeeze_excite: Optional[Callable[[nn.MaybeChoice[int], nn.MaybeChoice[int]], nn.Module]] = None,
        norm_layer: Optional[Callable[[int], nn.Module]] = None,
        activation_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        self.stride = stride
        self.out_channels = out_channels
        assert stride in [1, 2]

        hidden_ch = cast(int, make_divisible(in_channels * expand_ratio, 8))

        # NOTE: this equivalence check (==) does NOT work for ValueChoice, need to use "is"
        self.has_skip = stride == 1 and in_channels is out_channels

        layers: List[nn.Module] = [
            # point-wise convolution
            # NOTE: some papers omit this point-wise convolution when stride = 1.
            # In our implementation, if this pw convolution is intended to be omitted,
            # please use SepConv instead.
            ConvBNReLU(in_channels, hidden_ch, kernel_size=1, norm_layer=norm_layer, activation_layer=activation_layer),
            # depth-wise
            ConvBNReLU(hidden_ch, hidden_ch, stride=stride, kernel_size=kernel_size, groups=hidden_ch,
                       norm_layer=norm_layer, activation_layer=activation_layer),
            # SE
            squeeze_excite(
                cast(int, hidden_ch),
                cast(int, in_channels)
            ) if squeeze_excite is not None else nn.Identity(),
            # pw-linear
            ConvBNReLU(hidden_ch, out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.Identity),
        ]

        super().__init__(*simplify_sequential(layers))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.has_skip:
            return x + super().forward(x)
        else:
            return super().forward(x)
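
# Usage sketch (illustrative, not part of the original module): with expand_ratio=6 the
# block widens 32 -> make_divisible(32 * 6, 8) = 192 hidden channels, applies the
# depthwise conv, then projects back to 32; the residual add is used because stride is 1
# and the same channel object is reused for input and output.
#
#   ch = 32
#   block = InvertedResidual(ch, ch, expand_ratio=6, kernel_size=3, stride=1)
#   y = block(torch.zeros(1, ch, 56, 56))   # shape preserved: (1, 32, 56, 56)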
def inverted_residual_choice_builder(
    expand_ratios: List[int],
    kernel_sizes: List[int],
    downsample: bool,
    stage_input_width: int,
    stage_output_width: int,
    label: str
):
    def builder(index):
        stride = 1
        inp = stage_output_width

        if index == 0:
            # first layer in stage
            # do downsample and width reshape
            inp = stage_input_width
            if downsample:
                stride = 2

        oup = stage_output_width

        op_choices = {}
        for exp_ratio in expand_ratios:
            for kernel_size in kernel_sizes:
                op_choices[f'k{kernel_size}e{exp_ratio}'] = InvertedResidual(inp, oup, exp_ratio, kernel_size, stride)

        # It can be implemented with ValueChoice, but we use LayerChoice here
        # to be aligned with the intention of the original ProxylessNAS.
        return nn.LayerChoice(op_choices, label=f'{label}_i{index}')

    return builder
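
# Usage sketch (illustrative): the returned builder is meant to be handed to a repeated
# stage constructor such as nn.Repeat; calling it directly shows the candidate set, e.g.
#
#   builder = inverted_residual_choice_builder([3, 6], [3, 5, 7], downsample=True,
#                                              stage_input_width=32, stage_output_width=64,
#                                              label='s2')
#   layer = builder(0)   # LayerChoice over {k3e3, k5e3, k7e3, k3e6, k5e6, k7e6}, stride 2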