
Commit b3cf368

[V1][Molmo] Fix get_multimodal_embeddings() in molmo.py (#14161)
1 parent c8525f0 commit b3cf368
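
Across the files below, this commit replaces the loosely typed Optional[NestedTensors] annotation on get_multimodal_embeddings() (a bare torch.Tensor in deepseek_vl2 and florence2) with an explicit Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]], and updates the SupportsMultiModal protocol in interfaces.py to match. As a rough sketch of what the new contract means for a caller — the helper below is illustrative, not vLLM code:

from typing import Union

import torch

# The annotation the commit standardizes on: a single tensor, or a
# list/tuple of per-item tensors (one entry per multimodal item).
MultiModalEmbeddings = Union[list[torch.Tensor], torch.Tensor,
                             tuple[torch.Tensor, ...]]


def as_per_item_list(embeddings: MultiModalEmbeddings) -> list[torch.Tensor]:
    # Hypothetical helper: normalize the three allowed containers to a list.
    if isinstance(embeddings, torch.Tensor):
        return [embeddings]
    return list(embeddings)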

File tree

22 files changed: +249 -150 lines changed


examples/offline_inference/vision_language.py

Lines changed: 176 additions & 118 deletions
Large diffs are not rendered by default.

vllm/model_executor/models/aria.py

Lines changed: 3 additions & 1 deletion
@@ -602,7 +602,9 @@ def _process_image_input(
 
         return self.multi_modal_projector(image_outputs, image_attn_mask)
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+            self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None

vllm/model_executor/models/blip2.py

Lines changed: 3 additions & 1 deletion
@@ -628,7 +628,9 @@ def _process_image_input(self,
 
         return self.language_projection(query_output)
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+            self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None

vllm/model_executor/models/chameleon.py

Lines changed: 3 additions & 1 deletion
@@ -986,7 +986,9 @@ def _parse_and_validate_image_input(
             data=self._validate_pixel_values(pixel_values),
         )
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+            self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None

vllm/model_executor/models/deepseek_vl2.py

Lines changed: 3 additions & 1 deletion
@@ -606,7 +606,9 @@ def _process_image_input(
         return self._pixel_values_to_embedding(
             pixel_values=pixel_values, images_spatial_crop=images_spatial_crop)
 
-    def get_multimodal_embeddings(self, **kwargs: object) -> torch.Tensor:
+    def get_multimodal_embeddings(
+            self, **kwargs: object
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None

vllm/model_executor/models/florence2.py

Lines changed: 3 additions & 1 deletion
@@ -1037,7 +1037,9 @@ def _process_image_input(
         pixel_values = image_input["data"]
         return self._encode_image(pixel_values)
 
-    def get_multimodal_embeddings(self, **kwargs: object) -> torch.Tensor:
+    def get_multimodal_embeddings(
+            self, **kwargs: object
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None

vllm/model_executor/models/fuyu.py

Lines changed: 4 additions & 2 deletions
@@ -18,7 +18,7 @@
 """ PyTorch Fuyu model."""
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import List, Literal, Optional, Set, Tuple, TypedDict
+from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
 
 import torch
 import torch.nn as nn
@@ -327,7 +327,9 @@ def _process_image_input(
             image_patches_flat)
         return vision_embeddings_flat.split(patches_per_image, dim=0)
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+            self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
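
Fuyu is one model that motivates the tuple member of the new union: _process_image_input returns vision_embeddings_flat.split(patches_per_image, dim=0), and Tensor.split returns a tuple of tensors. A standalone sketch of that behavior (the sizes are made up for illustration):

import torch

patches_per_image = [3, 5]                   # two images, made-up patch counts
vision_embeddings_flat = torch.randn(8, 16)  # 3 + 5 patches, hidden size 16

# Tensor.split along dim 0 yields a tuple, one tensor per image.
per_image = vision_embeddings_flat.split(patches_per_image, dim=0)
assert isinstance(per_image, tuple)
assert [t.shape[0] for t in per_image] == patches_per_image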

vllm/model_executor/models/glm4v.py

Lines changed: 3 additions & 1 deletion
@@ -595,7 +595,9 @@ def _process_image_input(
 
         return self.transformer.vision(pixel_values)
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+            self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None

vllm/model_executor/models/idefics3.py

Lines changed: 3 additions & 1 deletion
@@ -617,7 +617,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.logits_processor = LogitsProcessor(config.text_config.vocab_size)
         self.sampler = get_sampler()
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+            self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self.model._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None

vllm/model_executor/models/interfaces.py

Lines changed: 9 additions & 9 deletions
@@ -4,6 +4,7 @@
                     Protocol, Type, Union, overload, runtime_checkable)
 
 import torch
+from torch import Tensor
 from typing_extensions import TypeIs, TypeVar
 
 from vllm.logger import init_logger
@@ -15,12 +16,11 @@
 
 if TYPE_CHECKING:
     from vllm.attention import AttentionMetadata
-    from vllm.multimodal.inputs import NestedTensors  # noqa: F401
     from vllm.sequence import IntermediateTensors
 
 logger = init_logger(__name__)
 
-T = TypeVar("T", default="NestedTensors")
+T = TypeVar("T", default=Union[list[Tensor], Tensor, tuple[Tensor, ...]])
 
 
 @runtime_checkable
@@ -36,7 +36,7 @@ class SupportsMultiModal(Protocol):
     MRO of your model class.
     """
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
+    def get_multimodal_embeddings(self, **kwargs) -> T:
         """
         Returns multimodal embeddings generated from multimodal kwargs
         to be merged with text embeddings.
@@ -59,18 +59,18 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
     @overload
     def get_input_embeddings(
         self,
-        input_ids: torch.Tensor,
+        input_ids: Tensor,
         multimodal_embeddings: Optional[T] = None,
         attn_metadata: Optional["AttentionMetadata"] = None,
-    ) -> torch.Tensor:
+    ) -> Tensor:
         ...
 
     @overload
     def get_input_embeddings(
         self,
-        input_ids: torch.Tensor,
+        input_ids: Tensor,
         multimodal_embeddings: Optional[T] = None,
-    ) -> torch.Tensor:
+    ) -> Tensor:
         """
         Returns the input embeddings merged from the text embeddings from
         input_ids and the multimodal embeddings generated from multimodal
@@ -210,7 +210,7 @@ def forward(
         self,
         *,
         intermediate_tensors: Optional["IntermediateTensors"],
-    ) -> Union[torch.Tensor, "IntermediateTensors"]:
+    ) -> Union[Tensor, "IntermediateTensors"]:
         """
         Accept :class:`IntermediateTensors` when PP rank > 0.
 
@@ -237,7 +237,7 @@ def forward(
         self,
         *,
         intermediate_tensors: Optional["IntermediateTensors"],
-    ) -> Union[torch.Tensor, "IntermediateTensors"]:
+    ) -> Union[Tensor, "IntermediateTensors"]:
         ...
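
The substantive change above is the TypeVar default: it was the string "NestedTensors", a forward reference to a name imported only under TYPE_CHECKING, and is now a concrete union that exists at runtime. A self-contained sketch of the pattern, assuming typing_extensions is available (the toy class is illustrative, not vLLM's full protocol):

from typing import Protocol, Union, runtime_checkable

import torch
from typing_extensions import TypeVar  # backports the PEP 696 default= argument

# Concrete default: resolvable at runtime, no forward reference required.
T = TypeVar("T",
            default=Union[list[torch.Tensor], torch.Tensor,
                          tuple[torch.Tensor, ...]])


@runtime_checkable
class SupportsMultiModal(Protocol):

    def get_multimodal_embeddings(self, **kwargs) -> T:
        ...


class ToyModel:
    # Structurally satisfies the protocol by returning one union member.
    def get_multimodal_embeddings(self, **kwargs) -> list[torch.Tensor]:
        return [torch.zeros(4, 16)]


assert isinstance(ToyModel(), SupportsMultiModal)  # runtime_checkable check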
