EmbeddedLLM
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
Lines changed: 20 additions & 70 deletions
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
Lines changed: 2 additions & 3 deletions
@@ -384,19 +384,7 @@ def __post_init__(self):
  else:
  self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
- if self.cache_config.cpu_offload_gb > 0 and \
- self.compilation_config.level != CompilationLevel.NO_COMPILATION \
- and not envs.VLLM_USE_V1:
- logger.warning(
- "CPU offload is not supported with `torch.compile` in v0 yet."
- " Disabling `torch.compile`.")
- self.compilation_config.level = CompilationLevel.NO_COMPILATION
-
  if self.cache_config.kv_sharing_fast_prefill:
- if not envs.VLLM_USE_V1:
- raise NotImplementedError(
- "Fast prefill optimization for KV sharing is not supported "
- "in V0 currently.")
 
  if self.speculative_config is not None and \
  self.speculative_config.use_eagle():
@@ -410,14 +398,6 @@ def __post_init__(self):
  "--kv-sharing-fast-prefill requires changes on model side for "
  "correctness and to realize prefill savings. ")
 
- if ((not envs.VLLM_USE_V1) and self.lora_config is not None
- and self.compilation_config.level
- != CompilationLevel.NO_COMPILATION):
- logger.warning(
- "LoRA for V0 is not supported with `torch.compile` yet. "
- "Disabling `torch.compile`.")
- self.compilation_config.level = CompilationLevel.NO_COMPILATION
-
  disable_chunked_prefill_reasons: list[str] = []
 
  if self.model_config:
@@ -604,57 +584,27 @@ def _set_cudagraph_sizes(self):
  """
 
  # calculate the default `batch_size_capture_list`
- if not envs.VLLM_USE_V1:
- batch_size_capture_list = []
- if self.scheduler_config is not None and \
- self.model_config is not None and \
- not self.model_config.enforce_eager:
-
- possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
- if self.parallel_config.tensor_parallel_size > 1 and \
- self.compilation_config.pass_config.enable_sequence_parallelism:
- possible_sizes = self.update_sizes_for_sequence_parallelism(
- possible_sizes)
-
- # find the minimum size that is larger than max_num_seqs,
- # which then becomes the max_batchsize_to_capture
- larger_sizes = [
- x for x in possible_sizes
- if x >= self.scheduler_config.max_num_seqs
- ]
- if larger_sizes:
- max_batchsize_to_capture = larger_sizes[0]
- else:
- max_batchsize_to_capture = possible_sizes[-1]
-
- # filter out the sizes that are
- # larger than max_batchsize_to_capture
- batch_size_capture_list = [
- size for size in possible_sizes
- if size <= max_batchsize_to_capture
- ]
- else:
- batch_size_capture_list = []
- if self.model_config is not None and \
- not self.model_config.enforce_eager:
- cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
- if len(cuda_graph_sizes) == 1:
- batch_size_capture_list = [1, 2, 4] + [
- i for i in range(8, cuda_graph_sizes[0] + 1, 8)
- ]
- elif len(cuda_graph_sizes) > 1:
- batch_size_capture_list = sorted(cuda_graph_sizes)
- else:
- raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
- if self.parallel_config.tensor_parallel_size > 1 and \
- self.compilation_config.pass_config.enable_sequence_parallelism:
- batch_size_capture_list = \
- self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
- max_num_tokens = self.scheduler_config.max_num_batched_tokens
- batch_size_capture_list = [
- size for size in batch_size_capture_list
- if size <= max_num_tokens
+ batch_size_capture_list = []
+ if self.model_config is not None and \
+ not self.model_config.enforce_eager:
+ cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
+ if len(cuda_graph_sizes) == 1:
+ batch_size_capture_list = [1, 2, 4] + [
+ i for i in range(8, cuda_graph_sizes[0] + 1, 8)
  ]
+ elif len(cuda_graph_sizes) > 1:
+ batch_size_capture_list = sorted(cuda_graph_sizes)
+ else:
+ raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
+ if self.parallel_config.tensor_parallel_size > 1 and \
+ self.compilation_config.pass_config.enable_sequence_parallelism:
+ batch_size_capture_list = \
+ self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
+ max_num_tokens = self.scheduler_config.max_num_batched_tokens
+ batch_size_capture_list = [
+ size for size in batch_size_capture_list
+ if size <= max_num_tokens
+ ]
 
  self.compilation_config.init_with_cudagraph_sizes(
  batch_size_capture_list)
 
@@ -10,7 +10,6 @@
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
-import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -75,11 +74,11 @@ class PassConfig:
  don't all have access to full configuration - that would create a cycle as
  the `PassManager` is set as a property of config."""
 
- enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+ enable_fusion: bool = False
  """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
  enable_attn_fusion: bool = False
  """Whether to enable the custom attention+quant fusion pass."""
- enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+ enable_noop: bool = False
  """Whether to enable the custom no-op elimination pass."""
  enable_sequence_parallelism: bool = False
  """Whether to enable sequence parallelism."""