import torch.nn as nn

from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
-from timm.layers import trunc_normal_, AvgPool2dSame, DropPath, calculate_drop_path_rates, Mlp, GlobalResponseNormMlp, \
-    LayerNorm2d, LayerNorm, RmsNorm2d, RmsNorm, create_conv2d, get_act_layer, get_norm_layer, make_divisible, to_ntuple
-from timm.layers import SimpleNorm2d, SimpleNorm
-from timm.layers import NormMlpClassifierHead, ClassifierHead
+from timm.layers import (
+    trunc_normal_,
+    AvgPool2dSame,
+    DropPath,
+    calculate_drop_path_rates,
+    Mlp,
+    GlobalResponseNormMlp,
+    LayerNorm2d,
+    LayerNorm,
+    RmsNorm2d,
+    RmsNorm,
+    SimpleNorm2d,
+    SimpleNorm,
+    create_conv2d,
+    get_act_layer,
+    get_norm_layer,
+    make_divisible,
+    to_ntuple,
+    NormMlpClassifierHead,
+    ClassifierHead,
+)
from ._builder import build_model_with_cfg
from ._features import feature_take_indices
from ._manipulate import named_apply, checkpoint_seq
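Every hunk that follows applies the same factory-kwargs pattern: `device` and `dtype` are collected into a `dd` dict and forwarded to each submodule, parameter, and tensor the module constructs, so weights are created directly on the target device and in the target dtype rather than being moved or cast afterwards. A minimal self-contained sketch of the pattern, using only stock PyTorch (the `ToyBlock` module below is illustrative, not part of timm):

```python
import torch
import torch.nn as nn


class ToyBlock(nn.Module):
    """Illustrative module showing the device/dtype factory-kwargs pattern."""

    def __init__(self, dim: int, device=None, dtype=None):
        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        # Submodules and parameters are created with the requested device/dtype
        # up front, so no post-hoc .to() copy is needed.
        self.proj = nn.Linear(dim, dim, **dd)
        self.gamma = nn.Parameter(torch.ones(dim, **dd))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x) * self.gamma


blk = ToyBlock(8, device='cpu', dtype=torch.float32)
print(blk.gamma.dtype)  # torch.float32
```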
@@ -59,7 +76,15 @@
class Downsample(nn.Module):
    """Downsample module for ConvNeXt."""

-    def __init__(self, in_chs: int, out_chs: int, stride: int = 1, dilation: int = 1) -> None:
+    def __init__(
+            self,
+            in_chs: int,
+            out_chs: int,
+            stride: int = 1,
+            dilation: int = 1,
+            device=None,
+            dtype=None,
+    ) -> None:
        """Initialize Downsample module.

        Args:
@@ -68,6 +93,7 @@ def __init__(self, in_chs: int, out_chs: int, stride: int = 1, dilation: int = 1
            stride: Stride for downsampling.
            dilation: Dilation rate.
        """
+        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        avg_stride = stride if dilation == 1 else 1
        if stride > 1 or dilation > 1:
@@ -77,7 +103,7 @@ def __init__(self, in_chs: int, out_chs: int, stride: int = 1, dilation: int = 1
            self.pool = nn.Identity()

        if in_chs != out_chs:
-            self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
+            self.conv = create_conv2d(in_chs, out_chs, 1, stride=1, **dd)
        else:
            self.conv = nn.Identity()
@@ -115,6 +141,8 @@ def __init__(
            act_layer: Union[str, Callable] = 'gelu',
            norm_layer: Optional[Callable] = None,
            drop_path: float = 0.,
+            device=None,
+            dtype=None,
    ):
        """

@@ -133,6 +161,7 @@ def __init__(
            norm_layer: Normalization layer (defaults to LN if not specified).
            drop_path: Stochastic depth probability.
        """
+        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        out_chs = out_chs or in_chs
        dilation = to_ntuple(2)(dilation)
@@ -149,12 +178,18 @@ def __init__(
            dilation=dilation[0],
            depthwise=True,
            bias=conv_bias,
+            **dd,
+        )
+        self.norm = norm_layer(out_chs, **dd)
+        self.mlp = mlp_layer(
+            out_chs,
+            int(mlp_ratio * out_chs),
+            act_layer=act_layer,
+            **dd,
        )
-        self.norm = norm_layer(out_chs)
-        self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
-        self.gamma = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value is not None else None
+        self.gamma = nn.Parameter(ls_init_value * torch.ones(out_chs, **dd)) if ls_init_value is not None else None
        if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
-            self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0])
+            self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0], **dd)
        else:
            self.shortcut = nn.Identity()
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
@@ -196,7 +231,9 @@ def __init__(
            use_grn: bool = False,
            act_layer: Union[str, Callable] = 'gelu',
            norm_layer: Optional[Callable] = None,
-            norm_layer_cl: Optional[Callable] = None
+            norm_layer_cl: Optional[Callable] = None,
+            device=None,
+            dtype=None,
    ) -> None:
        """Initialize ConvNeXt stage.

@@ -216,14 +253,15 @@ def __init__(
            norm_layer: Normalization layer.
            norm_layer_cl: Normalization layer for channels last.
        """
+        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        self.grad_checkpointing = False

        if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
            ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
            pad = 'same' if dilation[1] > 1 else 0  # same padding needed if dilation used
            self.downsample = nn.Sequential(
-                norm_layer(in_chs),
+                norm_layer(in_chs, **dd),
                create_conv2d(
                    in_chs,
                    out_chs,
@@ -232,6 +270,7 @@ def __init__(
                    dilation=dilation[0],
                    padding=pad,
                    bias=conv_bias,
+                    **dd,
                ),
            )
            in_chs = out_chs
@@ -253,6 +292,7 @@ def __init__(
                use_grn=use_grn,
                act_layer=act_layer,
                norm_layer=norm_layer if conv_mlp else norm_layer_cl,
+                **dd,
            ))
            in_chs = out_chs
        self.blocks = nn.Sequential(*stage_blocks)
@@ -324,6 +364,8 @@ def __init__(
            norm_eps: Optional[float] = None,
            drop_rate: float = 0.,
            drop_path_rate: float = 0.,
+            device=None,
+            dtype=None,
    ):
        """
        Args:
@@ -349,6 +391,7 @@ def __init__(
            drop_path_rate: Stochastic depth drop rate.
        """
        super().__init__()
+        dd = {'device': device, 'dtype': dtype}
        assert output_stride in (8, 16, 32)
        kernel_sizes = to_ntuple(4)(kernel_sizes)
        norm_layer, norm_layer_cl = _get_norm_layers(norm_layer, conv_mlp, norm_eps)
@@ -362,17 +405,17 @@ def __init__(
        if stem_type == 'patch':
            # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
            self.stem = nn.Sequential(
-                nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias),
-                norm_layer(dims[0]),
+                nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias, **dd),
+                norm_layer(dims[0], **dd),
            )
            stem_stride = patch_size
        else:
            mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
            self.stem = nn.Sequential(*filter(None, [
-                nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias),
+                nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias, **dd),
                act_layer() if 'act' in stem_type else None,
-                nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias),
-                norm_layer(dims[0]),
+                nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias, **dd),
+                norm_layer(dims[0], **dd),
            ]))
            stem_stride = 4
@@ -406,6 +449,7 @@ def __init__(
                act_layer=act_layer,
                norm_layer=norm_layer,
                norm_layer_cl=norm_layer_cl,
+                **dd,
            ))
            prev_chs = out_chs
            # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
@@ -417,12 +461,13 @@ def __init__(
        # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
        if head_norm_first:
            assert not head_hidden_size
-            self.norm_pre = norm_layer(self.num_features)
+            self.norm_pre = norm_layer(self.num_features, **dd)
            self.head = ClassifierHead(
                self.num_features,
                num_classes,
                pool_type=global_pool,
                drop_rate=self.drop_rate,
+                **dd,
            )
        else:
            self.norm_pre = nn.Identity()
@@ -434,6 +479,7 @@ def __init__(
                drop_rate=self.drop_rate,
                norm_layer=norm_layer,
                act_layer='gelu',
+                **dd,
            )
            self.head_hidden_size = self.head.num_features
        named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
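Taken together, these changes let a ConvNeXt be constructed with its parameters placed and typed up front. A minimal usage sketch, assuming the diff above is applied and that `ConvNeXt` is importable from `timm.models.convnext` as in current timm; the depths/dims shown are just the library defaults (a Tiny-sized config):

```python
import torch
from timm.models.convnext import ConvNeXt

# Use a real accelerator if present; 'cpu' keeps the sketch runnable anywhere.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Parameters are allocated directly on `device` with the requested dtype,
# instead of being materialized on CPU and converted afterwards.
model = ConvNeXt(
    depths=(3, 3, 9, 3),
    dims=(96, 192, 384, 768),
    num_classes=1000,
    device=device,
    dtype=torch.float32,
)

x = torch.randn(2, 3, 224, 224, device=device)
with torch.no_grad():
    logits = model(x)
print(logits.shape)  # torch.Size([2, 1000])
```

Passing a lower-precision dtype such as `torch.bfloat16` follows the same path; whether weight init and inference behave well at that precision depends on the PyTorch build and hardware.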