@@ -121,6 +121,8 @@ def __init__(
             qk_norm: bool = False,
             scale_norm: bool = True,
             rotate_half: bool = False,
+            device=None,
+            dtype=None,
     ):
         """
         Args:
@@ -139,6 +141,7 @@ def __init__(
             scale_norm: Enable normalization (scaling) of attention output with norm_layer
             rotate_half: Use half rotation layout instead of interleaved
         """
+        dd = {'device': device, 'dtype': dtype}
         super().__init__()
         if scale_norm or qk_norm:
             assert norm_layer is not None, 'norm_layer must be provided if qk_norm or scale_norm is True'
@@ -154,25 +157,25 @@ def __init__(
         self.rotate_half = rotate_half

         if qkv_fused:
-            self.qkv = nn.Linear(dim, attn_dim * 3, bias=False)
+            self.qkv = nn.Linear(dim, attn_dim * 3, bias=False, **dd)
             self.q_proj = self.k_proj = self.v_proj = None
             if qkv_bias:
-                self.q_bias = nn.Parameter(torch.zeros(attn_dim))
-                self.register_buffer('k_bias', torch.zeros(attn_dim), persistent=False)
-                self.v_bias = nn.Parameter(torch.zeros(attn_dim))
+                self.q_bias = nn.Parameter(torch.zeros(attn_dim, **dd))
+                self.register_buffer('k_bias', torch.zeros(attn_dim, **dd), persistent=False)
+                self.v_bias = nn.Parameter(torch.zeros(attn_dim, **dd))
             else:
                 self.q_bias = self.k_bias = self.v_bias = None
         else:
-            self.q_proj = nn.Linear(dim, attn_dim, bias=qkv_bias)
-            self.k_proj = nn.Linear(dim, attn_dim, bias=False)
-            self.v_proj = nn.Linear(dim, attn_dim, bias=qkv_bias)
+            self.q_proj = nn.Linear(dim, attn_dim, bias=qkv_bias, **dd)
+            self.k_proj = nn.Linear(dim, attn_dim, bias=False, **dd)
+            self.v_proj = nn.Linear(dim, attn_dim, bias=qkv_bias, **dd)
             self.qkv = None
             self.q_bias = self.k_bias = self.v_bias = None
-        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.q_norm = norm_layer(self.head_dim, **dd) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(self.head_dim, **dd) if qk_norm else nn.Identity()
         self.attn_drop = nn.Dropout(attn_drop)
-        self.norm = norm_layer(attn_dim) if scale_norm else nn.Identity()
-        self.proj = nn.Linear(attn_dim, dim)
+        self.norm = norm_layer(attn_dim, **dd) if scale_norm else nn.Identity()
+        self.proj = nn.Linear(attn_dim, dim, **dd)
         self.proj_drop = nn.Dropout(proj_drop)

     def forward(
@@ -263,6 +266,8 @@ def __init__(
             act_layer: Callable = nn.GELU,
             norm_layer: Callable = LayerNorm,
             attn_head_dim: Optional[int] = None,
+            device=None,
+            dtype=None,
             **kwargs,
     ):
         """ Initialize the EVA transformer block.
@@ -286,8 +291,10 @@ def __init__(
             norm_layer: Normalization layer constructor
             attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
         """
+        dd = {'device': device, 'dtype': dtype}
         super().__init__()
-        self.norm1 = norm_layer(dim)
+
+        self.norm1 = norm_layer(dim, **dd)
         attn_cls = AttentionRope if attn_type == 'rope' else EvaAttention
         self.attn = attn_cls(
             dim,
@@ -301,11 +308,12 @@ def __init__(
             norm_layer=norm_layer,
             scale_norm=scale_attn_inner,
             rotate_half=rotate_half,
+            **dd,
         )
-        self.gamma_1 = nn.Parameter(init_values * torch.ones(dim)) if init_values is not None else None
+        self.gamma_1 = nn.Parameter(init_values * torch.ones(dim, **dd)) if init_values is not None else None
         self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

-        self.norm2 = norm_layer(dim)
+        self.norm2 = norm_layer(dim, **dd)
         hidden_features = int(dim * mlp_ratio)
         if swiglu_mlp:
             if scale_mlp or swiglu_align_to:
@@ -316,6 +324,7 @@ def __init__(
                     norm_layer=norm_layer if scale_mlp else None,
                     drop=proj_drop,
                     align_to=swiglu_align_to,
+                    **dd,
                 )
             else:
                 # w/o any extra norm, an impl with packed weights is used
@@ -326,6 +335,7 @@ def __init__(
                     act_layer=nn.SiLU,
                     gate_last=False,
                     drop=proj_drop,
+                    **dd,
                 )
         else:
             self.mlp = Mlp(
@@ -334,8 +344,9 @@ def __init__(
                 act_layer=act_layer,
                 norm_layer=norm_layer if scale_mlp else None,
                 drop=proj_drop,
+                **dd,
             )
-        self.gamma_2 = nn.Parameter(init_values * torch.ones(dim)) if init_values is not None else None
+        self.gamma_2 = nn.Parameter(init_values * torch.ones(dim, **dd)) if init_values is not None else None
         self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

     def forward(
@@ -376,6 +387,8 @@ def __init__(
             act_layer: Callable = nn.GELU,
             norm_layer: Callable = nn.LayerNorm,
             attn_head_dim: Optional[int] = None,
+            device=None,
+            dtype=None,
     ):
         """ Initialize the post-norm EVA transformer block.

@@ -398,7 +411,9 @@ def __init__(
             norm_layer: Normalization layer constructor
             attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
         """
+        dd = {'device': device, 'dtype': dtype}
         super().__init__()
+
         attn_cls = AttentionRope if attn_type == 'rope' else EvaAttention
         self.attn = attn_cls(
             dim,
@@ -412,8 +427,9 @@ def __init__(
             norm_layer=norm_layer,
             scale_norm=scale_attn_inner,
             rotate_half=rotate_half,
+            **dd,
         )
-        self.norm1 = norm_layer(dim)
+        self.norm1 = norm_layer(dim, **dd)
         self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

         hidden_features = int(dim * mlp_ratio)
@@ -426,6 +442,7 @@ def __init__(
                     norm_layer=norm_layer if scale_mlp else None,
                     drop=proj_drop,
                     align_to=swiglu_align_to,
+                    **dd,
                 )
             else:
                 # w/o any extra norm, an impl with packed fc1 weights is used, matches existing GluMLP
@@ -436,6 +453,7 @@ def __init__(
                     act_layer=nn.SiLU,
                     gate_last=False,
                     drop=proj_drop,
+                    **dd,
                 )
         else:
             self.mlp = Mlp(
@@ -444,8 +462,9 @@ def __init__(
                 act_layer=act_layer,
                 norm_layer=norm_layer if scale_mlp else None,
                 drop=proj_drop,
+                **dd,
             )
-        self.norm2 = norm_layer(dim)
+        self.norm2 = norm_layer(dim, **dd)
         self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

     def forward(
@@ -513,6 +532,8 @@ def __init__(
             dynamic_img_pad: bool = False,
             ref_feat_shape: Optional[Union[Tuple[int, int], int]] = None,
             head_init_scale: float = 0.001,
+            device=None,
+            dtype=None,
     ):
         """Initialize the EVA Vision Transformer model.

@@ -562,6 +583,7 @@ def __init__(
             head_init_scale: Initialization scale for classification head weights
         """
         super().__init__()
+        dd = {'device': device, 'dtype': dtype}
         assert global_pool in ('', 'avg', 'avgmax', 'max', 'token', 'map')
         self.num_classes = num_classes
         self.global_pool = global_pool
@@ -594,16 +616,17 @@ def __init__(
             dynamic_img_pad=dynamic_img_pad,
             bias=not use_pre_transformer_norm,
             **embed_args,
+            **dd,
         )
         num_patches = self.patch_embed.num_patches
         r = self.patch_embed.feat_ratio() if hasattr(self.patch_embed, 'feat_ratio') else patch_size

-        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
-        self.reg_token = nn.Parameter(torch.zeros(1, num_reg_tokens, embed_dim)) if num_reg_tokens else None
+        self.cls_token = nn.Parameter(torch.empty(1, 1, embed_dim, **dd)) if class_token else None
+        self.reg_token = nn.Parameter(torch.empty(1, num_reg_tokens, embed_dim, **dd)) if num_reg_tokens else None
         self.cls_embed = class_token and self.reg_token is None

         num_pos_tokens = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
-        self.pos_embed = nn.Parameter(torch.zeros(1, num_pos_tokens, embed_dim)) if use_abs_pos_emb else None
+        self.pos_embed = nn.Parameter(torch.empty(1, num_pos_tokens, embed_dim, **dd)) if use_abs_pos_emb else None
         self.pos_drop = nn.Dropout(p=pos_drop_rate)
         if patch_drop_rate > 0:
             self.patch_drop = PatchDropoutWithIndices(patch_drop_rate, num_prefix_tokens=self.num_prefix_tokens)
@@ -621,6 +644,7 @@ def __init__(
                 feat_shape=None if dynamic_img_size else self.patch_embed.grid_size,
                 temperature=rope_temperature,
                 grid_indexing=rope_grid_indexing,
+                **dd,
             )
             if rope_type == 'mixed':
                 rope_kwargs.update(dict(depth=depth))
@@ -636,7 +660,7 @@ def __init__(
         else:
            self.rope = None

-        self.norm_pre = norm_layer(embed_dim) if activate_pre_norm else nn.Identity()
+        self.norm_pre = norm_layer(embed_dim, **dd) if activate_pre_norm else nn.Identity()

         dpr = calculate_drop_path_rates(drop_path_rate, depth)  # stochastic depth decay rule
         block_fn = EvaBlockPostNorm if use_post_norm else EvaBlock
@@ -659,12 +683,13 @@ def __init__(
                 drop_path=dpr[i],
                 norm_layer=norm_layer,
                 init_values=init_values,
+                **dd,
             )
             for i in range(depth)])
         self.feature_info = [
             dict(module=f'blocks.{i}', num_chs=embed_dim, reduction=r) for i in range(depth)]

-        self.norm = norm_layer(embed_dim) if activate_post_norm else nn.Identity()
+        self.norm = norm_layer(embed_dim, **dd) if activate_post_norm else nn.Identity()

         if global_pool == 'map':
             self.attn_pool = AttentionPoolLatent(
@@ -673,23 +698,26 @@ def __init__(
                 mlp_ratio=attn_pool_mlp_ratio or mlp_ratio,
                 norm_layer=norm_layer,
                 act_layer=nn.GELU,
+                **dd,
             )
         else:
             self.attn_pool = None
-        self.fc_norm = norm_layer(embed_dim) if activate_fc_norm else nn.Identity()
+        self.fc_norm = norm_layer(embed_dim, **dd) if activate_fc_norm else nn.Identity()
         self.head_drop = nn.Dropout(drop_rate)
-        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        self.head = nn.Linear(embed_dim, num_classes, **dd) if num_classes > 0 else nn.Identity()

+        self.init_weights(head_init_scale=head_init_scale)
+
+    def init_weights(self, head_init_scale=None):
         self.apply(self._init_weights)
         if self.pos_embed is not None:
             trunc_normal_(self.pos_embed, std=.02)
         if self.cls_token is not None:
             trunc_normal_(self.cls_token, std=.02)
         if self.reg_token is not None:
             trunc_normal_(self.reg_token, std=.02)
-
         self.fix_init_weight()
-        if isinstance(self.head, nn.Linear):
+        if head_init_scale and isinstance(self.head, nn.Linear):
             trunc_normal_(self.head.weight, std=.02)
             self.head.weight.data.mul_(head_init_scale)
             self.head.bias.data.mul_(head_init_scale)
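The diff threads PyTorch's factory kwargs (`device`, `dtype`) through every constructor via a `dd = {'device': device, 'dtype': dtype}` dict, so parameters and buffers can be created directly on the target device and in the target dtype instead of being built on CPU in float32 and converted afterwards. A minimal sketch of the same pattern outside of this file (the `TinyBlock` class here is illustrative, not part of the diff):

```python
import torch
import torch.nn as nn


class TinyBlock(nn.Module):
    """Illustrative module using the device/dtype factory-kwargs pattern."""

    def __init__(self, dim: int, device=None, dtype=None):
        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        # Forwarding **dd means the weights are materialized with the
        # requested device/dtype at construction time.
        self.norm = nn.LayerNorm(dim, **dd)
        self.proj = nn.Linear(dim, dim, **dd)
        self.gamma = nn.Parameter(torch.ones(dim, **dd))

    def forward(self, x):
        return x + self.gamma * self.proj(self.norm(x))


# Usage: parameters land directly in bfloat16 (or on a given device),
# avoiding a full float32 CPU init followed by .to(...).
blk = TinyBlock(64, dtype=torch.bfloat16)
print(next(blk.parameters()).dtype)  # torch.bfloat16
```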