from vllm.outputs import RequestOutput
from vllm.transformers_utils.detokenizer_utils import AnyTokenizer
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
-from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequest
from vllm.v1.engine.detokenizer import (DetokenizerOutput,
                                        IncrementalDetokenizer)
from vllm.v1.metrics.stats import IterationStats, RequestStateStats
@@ -106,59 +106,71 @@ def add_request(

    def process_outputs(
        self,
-        engine_core_outputs: List[EngineCoreOutput],
+        engine_core_outputs: EngineCoreOutputs,
+        first: int,
+        last: int,
        iteration_stats: Optional[IterationStats] = None,
    ) -> OutputProcessorOutput:
        """
        Process the EngineCoreOutputs:
        1) Compute stats for logging
        2) Detokenize
        3) Create and handle RequestOutput objects:
-            * If there is a queue (for usage with AsyncLLM), 
+            * If there is a queue (for usage with AsyncLLM),
              put the RequestOutput objects into the queue for
              handling by the per-request generate() tasks.

-            * If there is no queue (for usage with LLMEngine), 
+            * If there is no queue (for usage with LLMEngine),
              return a list of RequestOutput objects.

        ****************** NOTE FOR DEVELOPERS ******************

        VLLM V1 minimizes the number of python loops over the full
-        batch to ensure system overheads are minimized. This is the 
+        batch to ensure system overheads are minimized. This is the
        only function that should loop over EngineCoreOutputs.

        If you need to touch every element of the batch, implement a
        method called XXXClass.update_from_output() to be called
        within the loop below. For examples, see:
            * IterationStats.update_from_output()
            * Detokenizer.update_from_output()
-
+
        TODO(rob): add Protocol makes update_from_output explicit.
-
+
        **********************************************************
        """

        request_outputs: List[RequestOutput] = []
        reqs_to_abort: List[str] = []
        if not iteration_stats:
            iteration_stats = IterationStats(self.log_stats)
-        for engine_core_output in engine_core_outputs:
-            req_id = engine_core_output.request_id
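+        # engine_core_outputs now batches all requests; walk the
+        # [first, last) slice of its request ids, with i indexing into
+        # that slice.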
+        for i, req_id in enumerate(
+                engine_core_outputs.request_ids[first:last]):
            req_state = self.request_states.get(req_id)
            if req_state is None:
                # Ignore output for already-aborted request.
                continue

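+            # All new token ids for the batch are flattened into
+            # new_token_ids; new_token_id_offsets[i] marks where this
+            # request's tokens start (assumed layout of EngineCoreOutputs).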
+            num_tokens = last - first  # might not be robust
+            start = engine_core_outputs.new_token_id_offsets[i]
+            end = (engine_core_outputs.new_token_id_offsets[i + 1]
+                   if i < num_tokens - 1 else None)  # last req: rest of list
+            new_token_ids = engine_core_outputs.new_token_ids[start:end]
+
            # 1) Compute stats for this iteration.
-            iteration_stats.update_from_output(engine_core_output,
+            iteration_stats.update_from_output(num_tokens,
                                               req_state.is_prefilling,
                                               req_state.prompt_len,
                                               req_state.stats)
            req_state.is_prefilling = False

            # 2) Detokenize the token ids into text.
            detokenizer_output = req_state.detokenizer.update_from_output(
-                engine_core_output)
+                new_token_ids,
+                engine_core_outputs.finish_reason.get(req_id),
+                engine_core_outputs.stop_reason[i + first],
+            )

            # 3) Create and handle RequestOutput objects.
            if detokenizer_output is not None:
@@ -177,7 +189,7 @@ def process_outputs(
                assert detokenizer_output.finish_reason is not None

                self.request_states.pop(req_id)
-                if not engine_core_output.finished:
+                if not engine_core_outputs.finished[i]:
                    # If req not finished in EngineCore, but Detokenizer
                    # detected stop string, abort needed in EngineCore.
                    reqs_to_abort.append(req_id)
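
For reference, a minimal sketch of the flattened batch layout the new code appears to assume, and of how per-request token ids are recovered from it. The field names and the offsets arithmetic mirror the diff above; the FlatOutputs dataclass, the tokens_per_request helper, and the field types are illustrative guesses inferred from how the fields are indexed, not the actual vLLM definitions.

```python
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class FlatOutputs:
    """Illustrative stand-in for the batched EngineCoreOutputs fields."""
    # One entry per request in the batch, in batch order.
    request_ids: List[str]
    finished: List[bool]
    stop_reason: List[Optional[str]]
    # Keyed by request id, present once a request has a finish reason.
    finish_reason: Dict[str, Optional[str]]
    # All new token ids for the batch, concatenated in request order.
    new_token_ids: List[int]
    # new_token_id_offsets[i] is where request i's tokens start in
    # new_token_ids; the last request's tokens run to the end of the list.
    new_token_id_offsets: List[int]


def tokens_per_request(outputs: FlatOutputs) -> Dict[str, List[int]]:
    """Recover each request's new token ids from the flat layout."""
    per_request: Dict[str, List[int]] = {}
    num_reqs = len(outputs.request_ids)
    for i, req_id in enumerate(outputs.request_ids):
        start = outputs.new_token_id_offsets[i]
        # The next offset bounds this request's slice; the last request
        # takes the remainder of the flat list.
        end = (outputs.new_token_id_offsets[i + 1]
               if i < num_reqs - 1 else None)
        per_request[req_id] = outputs.new_token_ids[start:end]
    return per_request


# Two requests: "req-a" produced two new tokens, "req-b" produced one
# and finished with a stop reason.
outs = FlatOutputs(
    request_ids=["req-a", "req-b"],
    finished=[False, True],
    stop_reason=[None, "stop"],
    finish_reason={"req-b": "stop"},
    new_token_ids=[11, 12, 21],
    new_token_id_offsets=[0, 2],
)
assert tokens_per_request(outs) == {"req-a": [11, 12], "req-b": [21]}
```

The sketch indexes everything per batch; the diff itself mixes per-slice (new_token_id_offsets[i], finished[i]) and per-batch (stop_reason[i + first]) indexing for the [first, last) window.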