@@ -175,12 +175,21 @@ def _call_hf_processor(
 
             # Original output: (1, num_images, Pn, Px * Py * C)
             # New output: (num_images, Pn, Px * Py * C)
-            assert (isinstance(image_patches, list)
-                    and len(image_patches) == 1)
-            assert (isinstance(image_patches[0], torch.Tensor)
-                    and len(image_patches[0]) == len(images))
-
-            processed_outputs["image_patches"] = image_patches[0]
+            # image_patches is a list with shape:
+            # (1, num_images, Pn, Px * Py * C)
+            # before Transformers 4.53
+            if isinstance(image_patches, list):
+                assert len(image_patches) == 1
+                assert (isinstance(image_patches[0], torch.Tensor)
+                        and len(image_patches[0]) == len(images))
+                processed_outputs["image_patches"] = image_patches[0]
+            # image_patches is a tensor with shape:
+            # (num_images, Pn, Px * Py * C)
+            # after Transformers 4.53
+            elif isinstance(image_patches, torch.Tensor):
+                assert len(image_patches) == len(images)
+            else:
+                raise AssertionError("This line should be unreachable.")
 
         return processed_outputs
 
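For reference, here is a minimal standalone sketch of the version handling in the hunk above, assuming only that `image_patches` arrives either as a one-element list wrapping a tensor (Transformers < 4.53) or as the bare tensor itself (>= 4.53). The helper name `normalize_image_patches` and its signature are illustrative, not part of vLLM:

```python
import torch


def normalize_image_patches(image_patches, num_images: int) -> torch.Tensor:
    """Illustrative helper: collapse both layouts to (num_images, Pn, Px * Py * C)."""
    if isinstance(image_patches, list):
        # Transformers < 4.53: a one-element list wrapping the tensor.
        assert len(image_patches) == 1
        patches = image_patches[0]
    elif isinstance(image_patches, torch.Tensor):
        # Transformers >= 4.53: already the bare tensor.
        patches = image_patches
    else:
        raise TypeError(f"Unexpected image_patches type: {type(image_patches)}")

    assert isinstance(patches, torch.Tensor)
    assert len(patches) == num_images
    return patches
```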
@@ -193,8 +202,10 @@ def _apply_hf_processor_tokens_only(
         vocab = tokenizer.get_vocab()
 
         boa_token_id = vocab["<0x04>"]
+        if prompt_tokens[-1] != boa_token_id:
+            prompt_tokens.append(boa_token_id)
 
-        return prompt_tokens + [boa_token_id]
+        return prompt_tokens
 
     def _get_mm_fields_config(
         self,
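The second hunk makes the tokens-only path idempotent: the beginning-of-answer token `<0x04>` is appended only when it is not already the last prompt token, so re-applying the processor no longer duplicates it. A hedged sketch of that behavior, with a made-up `BOA_TOKEN_ID` standing in for `vocab["<0x04>"]`:

```python
BOA_TOKEN_ID = 4  # illustrative value; the real ID comes from vocab["<0x04>"]


def append_boa(prompt_tokens: list[int]) -> list[int]:
    # Append the BOA token only if it is not already the last token.
    if prompt_tokens[-1] != BOA_TOKEN_ID:
        prompt_tokens.append(BOA_TOKEN_ID)
    return prompt_tokens


tokens = [101, 202, 303]
assert append_boa(tokens) == [101, 202, 303, 4]
assert append_boa(tokens) == [101, 202, 303, 4]  # second call is a no-op
```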