@@ -175,12 +175,21 @@ def _call_hf_processor(
 
             # Original output: (1, num_images, Pn, Px * Py * C)
             # New output: (num_images, Pn, Px * Py * C)
-            assert (isinstance(image_patches, list)
-                    and len(image_patches) == 1)
-            assert (isinstance(image_patches[0], torch.Tensor)
-                    and len(image_patches[0]) == len(images))
-
-            processed_outputs["image_patches"] = image_patches[0]
+            # image_patches is a list with shape:
+            # (1, num_images, Pn, Px * Py * C)
+            # before Transformers 4.53
+            if isinstance(image_patches, list):
+                assert len(image_patches) == 1
+                assert (isinstance(image_patches[0], torch.Tensor)
+                        and len(image_patches[0]) == len(images))
+                processed_outputs["image_patches"] = image_patches[0]
+            # image_patches is a tensor with shape:
+            # (num_images, Pn, Px * Py * C)
+            # after Transformers 4.53
+            elif isinstance(image_patches, torch.Tensor):
+                assert len(image_patches) == len(images)
+            else:
+                raise AssertionError("This line should be unreachable.")
 
         return processed_outputs
 
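For reference, here is a minimal standalone sketch of the version handling in the hunk above, assuming only that `image_patches` arrives either as a one-element list wrapping a tensor (Transformers < 4.53) or as the bare tensor itself (>= 4.53). The helper name `normalize_image_patches` and its signature are illustrative, not part of vLLM:

```python
import torch


def normalize_image_patches(image_patches, num_images: int) -> torch.Tensor:
    """Illustrative helper: collapse both layouts to (num_images, Pn, Px * Py * C)."""
    if isinstance(image_patches, list):
        # Transformers < 4.53: a one-element list wrapping the tensor.
        assert len(image_patches) == 1
        patches = image_patches[0]
    elif isinstance(image_patches, torch.Tensor):
        # Transformers >= 4.53: already the bare tensor.
        patches = image_patches
    else:
        raise TypeError(f"Unexpected image_patches type: {type(image_patches)}")

    assert isinstance(patches, torch.Tensor)
    assert len(patches) == num_images
    return patches
```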
@@ -193,8 +202,10 @@ def _apply_hf_processor_tokens_only(
         vocab = tokenizer.get_vocab()
 
         boa_token_id = vocab["<0x04>"]
+        if prompt_tokens[-1] != boa_token_id:
+            prompt_tokens.append(boa_token_id)
 
-        return prompt_tokens + [boa_token_id]
+        return prompt_tokens
 
     def _get_mm_fields_config(
         self,
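The second hunk makes the tokens-only path idempotent: the beginning-of-answer token `<0x04>` is appended only when it is not already the last prompt token, so re-applying the processor no longer duplicates it. A hedged sketch of that behavior, with a made-up `BOA_TOKEN_ID` standing in for `vocab["<0x04>"]`:

```python
BOA_TOKEN_ID = 4  # illustrative value; the real ID comes from vocab["<0x04>"]


def append_boa(prompt_tokens: list[int]) -> list[int]:
    # Append the BOA token only if it is not already the last token.
    if prompt_tokens[-1] != BOA_TOKEN_ID:
        prompt_tokens.append(BOA_TOKEN_ID)
    return prompt_tokens


tokens = [101, 202, 303]
assert append_boa(tokens) == [101, 202, 303, 4]
assert append_boa(tokens) == [101, 202, 303, 4]  # second call is a no-op
```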