
Commit 71b25b0

[V0 deprecation] Clean up V0 fallback in compilation config (vllm-project#25675)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
1 parent: 0ea80c8 · commit: 71b25b0

File tree

2 files changed: +22 / -73 lines


vllm/config/__init__.py

Lines changed: 20 additions & 70 deletions
@@ -384,19 +384,7 @@ def __post_init__(self):
         else:
             self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
-        if self.cache_config.cpu_offload_gb > 0 and \
-            self.compilation_config.level != CompilationLevel.NO_COMPILATION \
-                and not envs.VLLM_USE_V1:
-            logger.warning(
-                "CPU offload is not supported with `torch.compile` in v0 yet."
-                " Disabling `torch.compile`.")
-            self.compilation_config.level = CompilationLevel.NO_COMPILATION
-
         if self.cache_config.kv_sharing_fast_prefill:
-            if not envs.VLLM_USE_V1:
-                raise NotImplementedError(
-                    "Fast prefill optimization for KV sharing is not supported "
-                    "in V0 currently.")
 
         if self.speculative_config is not None and \
                 self.speculative_config.use_eagle():
@@ -410,14 +398,6 @@ def __post_init__(self):
                 "--kv-sharing-fast-prefill requires changes on model side for "
                 "correctness and to realize prefill savings. ")
 
-        if ((not envs.VLLM_USE_V1) and self.lora_config is not None
-                and self.compilation_config.level
-                != CompilationLevel.NO_COMPILATION):
-            logger.warning(
-                "LoRA for V0 is not supported with `torch.compile` yet. "
-                "Disabling `torch.compile`.")
-            self.compilation_config.level = CompilationLevel.NO_COMPILATION
-
         disable_chunked_prefill_reasons: list[str] = []
 
         if self.model_config:
@@ -604,57 +584,27 @@ def _set_cudagraph_sizes(self):
         """
 
         # calculate the default `batch_size_capture_list`
-        if not envs.VLLM_USE_V1:
-            batch_size_capture_list = []
-            if self.scheduler_config is not None and \
-                self.model_config is not None and \
-                    not self.model_config.enforce_eager:
-
-                possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
-                if self.parallel_config.tensor_parallel_size > 1 and \
-                    self.compilation_config.pass_config.enable_sequence_parallelism:
-                    possible_sizes = self.update_sizes_for_sequence_parallelism(
-                        possible_sizes)
-
-                # find the minimum size that is larger than max_num_seqs,
-                # which then becomes the max_batchsize_to_capture
-                larger_sizes = [
-                    x for x in possible_sizes
-                    if x >= self.scheduler_config.max_num_seqs
-                ]
-                if larger_sizes:
-                    max_batchsize_to_capture = larger_sizes[0]
-                else:
-                    max_batchsize_to_capture = possible_sizes[-1]
-
-                # filter out the sizes that are
-                # larger than max_batchsize_to_capture
-                batch_size_capture_list = [
-                    size for size in possible_sizes
-                    if size <= max_batchsize_to_capture
-                ]
-        else:
-            batch_size_capture_list = []
-            if self.model_config is not None and \
-                not self.model_config.enforce_eager:
-                cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
-                if len(cuda_graph_sizes) == 1:
-                    batch_size_capture_list = [1, 2, 4] + [
-                        i for i in range(8, cuda_graph_sizes[0] + 1, 8)
-                    ]
-                elif len(cuda_graph_sizes) > 1:
-                    batch_size_capture_list = sorted(cuda_graph_sizes)
-                else:
-                    raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
-                if self.parallel_config.tensor_parallel_size > 1 and \
-                    self.compilation_config.pass_config.enable_sequence_parallelism:
-                    batch_size_capture_list = \
-                        self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
-            max_num_tokens = self.scheduler_config.max_num_batched_tokens
-            batch_size_capture_list = [
-                size for size in batch_size_capture_list
-                if size <= max_num_tokens
-            ]
+        batch_size_capture_list = []
+        if self.model_config is not None and \
+            not self.model_config.enforce_eager:
+            cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
+            if len(cuda_graph_sizes) == 1:
+                batch_size_capture_list = [1, 2, 4] + [
+                    i for i in range(8, cuda_graph_sizes[0] + 1, 8)
+                ]
+            elif len(cuda_graph_sizes) > 1:
+                batch_size_capture_list = sorted(cuda_graph_sizes)
+            else:
+                raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
+            if self.parallel_config.tensor_parallel_size > 1 and \
+                self.compilation_config.pass_config.enable_sequence_parallelism:
+                batch_size_capture_list = \
+                    self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
+        max_num_tokens = self.scheduler_config.max_num_batched_tokens
+        batch_size_capture_list = [
+            size for size in batch_size_capture_list
+            if size <= max_num_tokens
+        ]
 
         self.compilation_config.init_with_cudagraph_sizes(
             batch_size_capture_list)
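
Note: after this change, _set_cudagraph_sizes always takes the former V1 path. Below is a minimal standalone sketch of that retained logic; the helper name and arguments are illustrative only (not vLLM API), and the optional sequence-parallelism adjustment via update_sizes_for_sequence_parallelism is omitted.

def build_capture_list(cuda_graph_sizes: list[int], max_num_tokens: int) -> list[int]:
    # Single configured cap: default schedule of 1, 2, 4, then multiples of 8 up to the cap.
    if len(cuda_graph_sizes) == 1:
        sizes = [1, 2, 4] + list(range(8, cuda_graph_sizes[0] + 1, 8))
    # An explicit list of capture sizes is used as given, just sorted.
    elif len(cuda_graph_sizes) > 1:
        sizes = sorted(cuda_graph_sizes)
    else:
        raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
    # Never capture a batch larger than the scheduler's token budget.
    return [size for size in sizes if size <= max_num_tokens]

# Example: a 512 cap filtered by a 64-token budget -> [1, 2, 4, 8, 16, ..., 64]
print(build_capture_list([512], 64))
print(build_capture_list([1, 8, 32], 4096))  # -> [1, 8, 32]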

vllm/config/compilation.py

Lines changed: 2 additions & 3 deletions
@@ -10,7 +10,6 @@
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
-import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -75,11 +74,11 @@ class PassConfig:
     don't all have access to full configuration - that would create a cycle as
     the `PassManager` is set as a property of config."""
 
-    enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_fusion: bool = False
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
     enable_attn_fusion: bool = False
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_noop: bool = False
     """Whether to enable the custom no-op elimination pass."""
     enable_sequence_parallelism: bool = False
     """Whether to enable sequence parallelism."""
