Fix Gradient Accumulation issue #34191
Merged
Changes from 22 commits (58 commits in total).
Commits (58, all by ArthurZucker):

44301cc quick fix
1456088 3 losses
e57f00c oups
7fa8503 fix
b955ea5 nits
07478e0 check how it scales for special models
1b356ef propagate for conditiona detr
4ef45b0 propagate
61da9b1 propagate
2e3f0f7 propagate
c31a3fb fixes
a8cd107 propagate changes
711c357 update
4888cf3 fixup
4323d85 nits
e5e4bbd f string
239a256 fixes
bd298da more fixes
5dfc51c ?
0a1cd2b nit
64f7e29 arg annoying f string
aa01ae9 nits
8c1d68a grumble
846cf1c update
e7e8a20 nit
622290c refactor
91e28aa fix fetch tests
da649b9 nit
df6472a nit
cf1eb7b Update src/transformers/loss/loss_utils.py
dafd11b Merge branch 'quick-fix-ga' of github.com:huggingface/transformers in…
30f27cd update
d0edfad nit
9bcecc3 fixup
2839b3c make pass
557d225 nits
393e178 port code to more models
aac054d fixup
ce32d5e ntis
4dc49ac arf
d221e58 update
f03b193 update
22b6283 nits
64829e3 update
0b6f425 fix
e6f6f52 update
fa691aa nits
66f6eef fine
fcdf13d agjkfslga.jsdlkgjklas
ece5e01 nits
bb236eb fix fx?
7c2b7ce update
0be4379 update
36d76d7 styel
92979e7 fix imports
a55e440 update
b14c3dd update
dbbc3ce fixup to fix the torch fx?
New file: src/transformers/loss/loss_utils.py (@@ -0,0 +1,98 @@)

```python
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from .models.detr.loss_detr import ForObjectDetectionLoss, ForSegmentationLoss


def DefaultCrossEntropyLoss(logits, labels, **kwargs):
    # Upcast to float if we need to compute the loss to avoid potential precision issues
    logits = logits.float()
    # Shift so that tokens < n predict n
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    # Flatten the tokens
    shift_logits = shift_logits.view(-1, kwargs["vocab_size"])
    shift_labels = shift_labels.view(-1)
    # Enable model parallelism
    shift_labels = shift_labels.to(shift_logits.device)

    num_items = kwargs.pop("num_items", None)

    if num_items is not None:
        # Calculate the CrossEntropyLoss manually when using grad accum
        log_probs = nn.functional.log_softmax(shift_logits, dim=-1)
        loss = -log_probs[range(shift_labels.size(0)), shift_labels]
        loss = loss.sum() / num_items
    else:
        loss = nn.functional.cross_entropy(shift_logits, shift_labels, ignore_index=-100)

    return loss


def ForSequenceClassificationLoss(logits, labels, pooled_logits, **kwargs):
    config = kwargs["config"]
    num_labels = config.num_labels
    if config.problem_type is None:
        if num_labels == 1:
            config.problem_type = "regression"
        elif num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
            config.problem_type = "single_label_classification"
        else:
            config.problem_type = "multi_label_classification"

    if config.problem_type == "regression":
        loss_fct = MSELoss()
        if num_labels == 1:
            loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
        else:
            loss = loss_fct(pooled_logits, labels)
    elif config.problem_type == "single_label_classification":
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(pooled_logits.view(-1, num_labels), labels.view(-1))
    elif config.problem_type == "multi_label_classification":
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(pooled_logits, labels)
    return loss


def ForQuestionAnsweringLoss(start_logits, end_logits, start_positions, end_positions):
    total_loss = None
    if start_positions is not None and end_positions is not None:
        # If we are on multi-GPU, split add a dimension
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1).to(start_logits.device)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1).to(end_logits.device)
        # sometimes the start/end positions are outside our model inputs, we ignore these terms
        ignored_index = start_logits.size(1)
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)

        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2
    return total_loss


def ForTokenClassification(logits, labels, config, **kwargs):
    # Upcast to float if we need to compute the loss to avoid potential precision issues
    logits = logits.view(-1, config.num_labels)
    labels = labels.view(-1)
    logits = logits.float()
    # Flatten the tokens
    loss_fct = CrossEntropyLoss()
    return loss_fct(logits, labels)


LOSS_MAPPING = {
    "ForCausalLM": DefaultCrossEntropyLoss,
    "ForQuestionAnswering": ForQuestionAnsweringLoss,
    "ForSequenceClassification": ForSequenceClassificationLoss,
    "ForTokenClassification": ForTokenClassification,
}

LOSS_MAPPING["ForSegmentation"] = ForSegmentationLoss
LOSS_MAPPING["ForObjectDetection"] = ForObjectDetectionLoss
```
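The `num_items` branch in `DefaultCrossEntropyLoss` exists because, under gradient accumulation, averaging each micro-batch's mean loss is not the same as taking the mean over all tokens in the effective batch when micro-batches contain different numbers of label tokens. A standalone sketch of the difference (illustrative only, not the PR's trainer code):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
vocab_size = 8

# Two micro-batches with different numbers of label tokens, as happens
# with padding or variable-length sequences under gradient accumulation.
logits_a, labels_a = torch.randn(3, vocab_size), torch.randint(0, vocab_size, (3,))
logits_b, labels_b = torch.randn(5, vocab_size), torch.randint(0, vocab_size, (5,))

# Naive: average the per-micro-batch means (what default mean-reduction
# cross-entropy gives if losses are simply averaged across steps).
mean_of_means = (
    F.cross_entropy(logits_a, labels_a) + F.cross_entropy(logits_b, labels_b)
) / 2

# Corrected: sum the per-token losses and divide once by the total token
# count, mirroring the `loss.sum() / num_items` branch above.
num_items = labels_a.numel() + labels_b.numel()
global_mean = (
    F.cross_entropy(logits_a, labels_a, reduction="sum")
    + F.cross_entropy(logits_b, labels_b, reduction="sum")
) / num_items

# The corrected value matches the loss computed over the full batch at once.
full = F.cross_entropy(
    torch.cat([logits_a, logits_b]), torch.cat([labels_a, labels_b])
)
assert torch.allclose(global_mean, full)
```

With equal-sized micro-batches the two quantities coincide, which is why the discrepancy only shows up with padding or variable-length sequences.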
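The clamping trick in `ForQuestionAnsweringLoss` can be verified in isolation: positions outside the sequence are clamped to `ignored_index` (the sequence length), which `CrossEntropyLoss(ignore_index=...)` then drops from the average. A minimal standalone sketch:

```python
import torch
from torch.nn import CrossEntropyLoss

seq_len = 4
torch.manual_seed(0)
start_logits = torch.randn(2, seq_len)
# Second answer position lies outside the model inputs.
start_positions = torch.tensor([1, 99])

# Clamp out-of-range positions to ignored_index, as in the PR's loss.
ignored_index = start_logits.size(1)
clamped = start_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
loss = loss_fct(start_logits, clamped)

# Only the in-range example contributes to the mean loss.
expected = loss_fct(start_logits[:1], start_positions[:1])
assert torch.allclose(loss, expected)
```

The mean reduction averages only over non-ignored targets, so a batch where some examples have out-of-range positions still yields the correct per-example loss for the rest.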