@@ -268,16 +268,24 @@ def get_torch_context_manager_or_global_device():
     return device_in_context
 
 
-def get_state_dict_dtype(state_dict):
+def get_state_dict_dtype(state_dict, config_dtype: Optional[torch.dtype] = None):
     """
     Returns the first found floating dtype in `state_dict` if there is one, otherwise returns the first dtype.
+
+    If `config_dtype` is provided (for instance when `dtype="auto"` and the config already carries a dtype), it is used.
     """
+    if config_dtype is not None:
+        return config_dtype
+
+    if len(state_dict) == 0:
+        return torch.get_default_dtype()
+
     for t in state_dict.values():
         if t.is_floating_point():
             return t.dtype
 
     # if no floating dtype was found return whatever the first dtype is
-    return next(state_dict.values()).dtype
+    return next(iter(state_dict.values())).dtype
 
 
 str_to_torch_dtype = {
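Two behavioural changes in this hunk are worth spelling out: `next(state_dict.values())` raised a `TypeError` because `dict.values()` returns a view, not an iterator, hence the `iter(...)` wrapper; and an explicit `config_dtype` now short-circuits the scan of the weights entirely. A minimal standalone sketch of the resulting logic (a reproduction for illustration, not the library function itself):

import torch

# Standalone sketch of the updated helper's behaviour (simplified, not the library code).
def get_state_dict_dtype_sketch(state_dict, config_dtype=None):
    if config_dtype is not None:          # an explicit config dtype wins
        return config_dtype
    if len(state_dict) == 0:              # empty state dict: fall back to torch's default
        return torch.get_default_dtype()
    for t in state_dict.values():
        if t.is_floating_point():         # first floating tensor decides the dtype
            return t.dtype
    return next(iter(state_dict.values())).dtype  # no floating tensors: first dtype

state_dict = {"ids": torch.tensor([1, 2]), "w": torch.ones(2, dtype=torch.bfloat16)}
print(get_state_dict_dtype_sketch(state_dict))                 # torch.bfloat16
print(get_state_dict_dtype_sketch(state_dict, torch.float16))  # torch.float16 (config wins)
print(get_state_dict_dtype_sketch({}))                         # torch.float32 (torch default)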
@@ -722,12 +730,16 @@ def _get_dtype(
                     if is_sharded and "dtype" in sharded_metadata:
                         dtype = sharded_metadata["dtype"]
                     elif state_dict is not None:
-                        dtype = get_state_dict_dtype(state_dict)
+                        dtype = get_state_dict_dtype(state_dict, getattr(config, "dtype", None))
                     else:
                         state_dict = load_state_dict(
                             checkpoint_files[0], map_location="meta", weights_only=weights_only
                         )
-                        dtype = get_state_dict_dtype(state_dict)
+                        dtype = get_state_dict_dtype(state_dict, getattr(config, "dtype", None))
+                    config.dtype = dtype
+                    for sub_config_key in config.sub_configs:
+                        if (sub_config := getattr(config, sub_config_key)) is not None:
+                            sub_config.dtype = dtype
                     logger.info(
                         "Since the `dtype` attribute can't be found in model's config object, "
                         "will use dtype={dtype} as derived from model's weights"
                     )
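The lines added here write the dtype derived from the checkpoint back onto the config and every sub-config. A hedged sketch of that propagation step, using a `SimpleNamespace` stand-in instead of a real `PreTrainedConfig` (the attribute names mirror the diff; the stand-in itself is hypothetical):

import torch
from types import SimpleNamespace

# Stand-in config with one sub-config; real configs expose `sub_configs` as key names.
text_config = SimpleNamespace(dtype=None)
config = SimpleNamespace(dtype=None, sub_configs=["text_config"], text_config=text_config)

dtype = torch.bfloat16  # pretend this was derived from the checkpoint weights
config.dtype = dtype
for sub_config_key in config.sub_configs:
    if (sub_config := getattr(config, sub_config_key)) is not None:
        sub_config.dtype = dtype

print(config.dtype, config.text_config.dtype)  # torch.bfloat16 torch.bfloat16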
@@ -1219,6 +1231,14 @@ def __init__(self, config: PreTrainedConfig, *inputs, **kwargs):
                 f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
             )
         self.config = config
+        if getattr(self.config, "dtype", None) is None:
+            default_dtype = torch.get_default_dtype()
+            self.config.dtype = default_dtype
+            for sub_config_key in self.config.sub_configs:
+                if (sub_config := getattr(self.config, sub_config_key)) is not None and getattr(
+                    sub_config, "dtype", None
+                ) is None:
+                    sub_config.dtype = default_dtype
 
         # Check the attention implementation is supported, or set it if not yet set (on the internal attr, to avoid
         # setting it recursively)
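With this fallback in `__init__`, a model built from a freshly constructed config should end up with `config.dtype` set to `torch.get_default_dtype()` rather than `None`. A small sanity check, assuming the patched behaviour and default torch settings (the tiny Llama hyperparameters below are arbitrary, chosen only to build a model quickly):

import torch
from transformers import LlamaConfig, LlamaForCausalLM

# Arbitrary, tiny hyperparameters just to instantiate a model cheaply.
config = LlamaConfig(
    vocab_size=32, hidden_size=8, intermediate_size=16,
    num_hidden_layers=1, num_attention_heads=2, num_key_value_heads=2,
)
model = LlamaForCausalLM(config)
# Expected with this patch applied: torch.float32, i.e. torch.get_default_dtype().
print(model.config.dtype)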
@@ -3789,7 +3809,8 @@ def from_pretrained(
         output_loading_info = kwargs.pop("output_loading_info", False)
         from_pipeline = kwargs.pop("_from_pipeline", None)
         from_auto_class = kwargs.pop("_from_auto", False)
-        dtype = kwargs.pop("dtype", None)
+        dtype_kwarg_provided = "dtype" in kwargs
+        dtype = kwargs.pop("dtype", "auto")
         torch_dtype = kwargs.pop("torch_dtype", None)  # kept for BC
         device_map = kwargs.pop("device_map", None)
         max_memory = kwargs.pop("max_memory", None)
@@ -3820,8 +3841,8 @@ def from_pretrained(
             _ = kwargs.pop(name, None)
 
         # For BC on torch_dtype argument
-        if torch_dtype is not None:
-            dtype = dtype if dtype is not None else torch_dtype
+        if torch_dtype is not None and (not dtype_kwarg_provided or dtype is None):
+            dtype = torch_dtype
 
         if is_offline_mode() and not local_files_only:
             local_files_only = True
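The last two hunks change the default of `dtype` from `None` to `"auto"` while keeping the deprecated `torch_dtype` working: an explicitly passed `dtype` now wins over `torch_dtype`, and `torch_dtype` only takes effect when `dtype` was not passed at all (or was passed as `None`). A hypothetical helper isolating just that precedence logic:

import torch

# Hypothetical helper mirroring the kwargs handling above (not library code).
def resolve_dtype(**kwargs):
    dtype_kwarg_provided = "dtype" in kwargs
    dtype = kwargs.pop("dtype", "auto")            # `dtype` now defaults to "auto"
    torch_dtype = kwargs.pop("torch_dtype", None)  # kept for BC
    if torch_dtype is not None and (not dtype_kwarg_provided or dtype is None):
        dtype = torch_dtype
    return dtype

print(resolve_dtype())                                                  # "auto"
print(resolve_dtype(torch_dtype=torch.float16))                         # torch.float16 (BC path)
print(resolve_dtype(dtype=torch.bfloat16, torch_dtype=torch.float16))   # torch.bfloat16 (dtype wins)
print(resolve_dtype(dtype=None, torch_dtype=torch.float16))             # torch.float16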