@@ -384,19 +384,7 @@ def __post_init__(self):
384384 else :
385385 self .compilation_config .cudagraph_mode = CUDAGraphMode .NONE
386386
387- if self .cache_config .cpu_offload_gb > 0 and \
388- self .compilation_config .level != CompilationLevel .NO_COMPILATION \
389- and not envs .VLLM_USE_V1 :
390- logger .warning (
391- "CPU offload is not supported with `torch.compile` in v0 yet."
392- " Disabling `torch.compile`." )
393- self .compilation_config .level = CompilationLevel .NO_COMPILATION
394-
395387 if self .cache_config .kv_sharing_fast_prefill :
396- if not envs .VLLM_USE_V1 :
397- raise NotImplementedError (
398- "Fast prefill optimization for KV sharing is not supported "
399- "in V0 currently." )
400388
401389 if self .speculative_config is not None and \
402390 self .speculative_config .use_eagle ():
@@ -410,14 +398,6 @@ def __post_init__(self):
410398 "--kv-sharing-fast-prefill requires changes on model side for "
411399 "correctness and to realize prefill savings. " )
412400
413- if ((not envs .VLLM_USE_V1 ) and self .lora_config is not None
414- and self .compilation_config .level
415- != CompilationLevel .NO_COMPILATION ):
416- logger .warning (
417- "LoRA for V0 is not supported with `torch.compile` yet. "
418- "Disabling `torch.compile`." )
419- self .compilation_config .level = CompilationLevel .NO_COMPILATION
420-
421401 disable_chunked_prefill_reasons : list [str ] = []
422402
423403 if self .model_config :
@@ -604,57 +584,27 @@ def _set_cudagraph_sizes(self):
604584 """
605585
606586 # calculate the default `batch_size_capture_list`
607- if not envs .VLLM_USE_V1 :
608- batch_size_capture_list = []
609- if self .scheduler_config is not None and \
610- self .model_config is not None and \
611- not self .model_config .enforce_eager :
612-
613- possible_sizes = [1 , 2 , 4 ] + [8 * i for i in range (1 , 1025 )]
614- if self .parallel_config .tensor_parallel_size > 1 and \
615- self .compilation_config .pass_config .enable_sequence_parallelism :
616- possible_sizes = self .update_sizes_for_sequence_parallelism (
617- possible_sizes )
618-
619- # find the minimum size that is larger than max_num_seqs,
620- # which then becomes the max_batchsize_to_capture
621- larger_sizes = [
622- x for x in possible_sizes
623- if x >= self .scheduler_config .max_num_seqs
624- ]
625- if larger_sizes :
626- max_batchsize_to_capture = larger_sizes [0 ]
627- else :
628- max_batchsize_to_capture = possible_sizes [- 1 ]
629-
630- # filter out the sizes that are
631- # larger than max_batchsize_to_capture
632- batch_size_capture_list = [
633- size for size in possible_sizes
634- if size <= max_batchsize_to_capture
635- ]
636- else :
637- batch_size_capture_list = []
638- if self .model_config is not None and \
639- not self .model_config .enforce_eager :
640- cuda_graph_sizes = self .scheduler_config .cuda_graph_sizes
641- if len (cuda_graph_sizes ) == 1 :
642- batch_size_capture_list = [1 , 2 , 4 ] + [
643- i for i in range (8 , cuda_graph_sizes [0 ] + 1 , 8 )
644- ]
645- elif len (cuda_graph_sizes ) > 1 :
646- batch_size_capture_list = sorted (cuda_graph_sizes )
647- else :
648- raise TypeError (f"Invalid value for { cuda_graph_sizes = } ." )
649- if self .parallel_config .tensor_parallel_size > 1 and \
650- self .compilation_config .pass_config .enable_sequence_parallelism :
651- batch_size_capture_list = \
652- self .update_sizes_for_sequence_parallelism (batch_size_capture_list )
653- max_num_tokens = self .scheduler_config .max_num_batched_tokens
654- batch_size_capture_list = [
655- size for size in batch_size_capture_list
656- if size <= max_num_tokens
587+ batch_size_capture_list = []
588+ if self .model_config is not None and \
589+ not self .model_config .enforce_eager :
590+ cuda_graph_sizes = self .scheduler_config .cuda_graph_sizes
591+ if len (cuda_graph_sizes ) == 1 :
592+ batch_size_capture_list = [1 , 2 , 4 ] + [
593+ i for i in range (8 , cuda_graph_sizes [0 ] + 1 , 8 )
657594 ]
595+ elif len (cuda_graph_sizes ) > 1 :
596+ batch_size_capture_list = sorted (cuda_graph_sizes )
597+ else :
598+ raise TypeError (f"Invalid value for { cuda_graph_sizes = } ." )
599+ if self .parallel_config .tensor_parallel_size > 1 and \
600+ self .compilation_config .pass_config .enable_sequence_parallelism :
601+ batch_size_capture_list = \
602+ self .update_sizes_for_sequence_parallelism (batch_size_capture_list )
603+ max_num_tokens = self .scheduler_config .max_num_batched_tokens
604+ batch_size_capture_list = [
605+ size for size in batch_size_capture_list
606+ if size <= max_num_tokens
607+ ]
658608
659609 self .compilation_config .init_with_cudagraph_sizes (
660610 batch_size_capture_list )
0 commit comments