Commit cf1e983
Restore cuda graphs to continuous batching (#41421)
* Type hints and small fixes
* Remove unused params
* Made slice inputs the default
* ruffed
* Updated some var names and moved index slicing
* Logging arg in example
* Added some padding debug var and reformat out cg
* First working CG, fixed size
* Working flexible CG
* CG are compatible with all implementations
* Fixed CG API
* Update example
* Documentation
* Fix padding tokens in FA
* Review compliance
* Better doc around weird bug
* Style
* Fix for sliding with CG
1 parent 6c901bd commit cf1e983

File tree: 8 files changed, +380 / -233 lines changed


examples/pytorch/continuous_batching.py

Lines changed: 32 additions & 24 deletions
@@ -26,22 +26,25 @@
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.generation import GenerationConfig
+from transformers.generation.continuous_batching.requests import logger
 
 
 # MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"
 SLIDING_WINDOW = 0
-MODEL_ID = "google/gemma-2-2b-it" if SLIDING_WINDOW > 0 else "Qwen/Qwen3-4B-Instruct-2507"
+MODEL_ID = "google/gemma-2-2b-it" if SLIDING_WINDOW > 0 else "meta-llama/Meta-Llama-3-8B"
 FORCE_MAX_LENGTH = False  # should be False unless you are debugging sliding window features
+SKIP_SPECIAL_TOKENS = False
 
 
 def generate_simple(
     attn_impl: str, simple_batch_inputs: list[int], generation_config: GenerationConfig
 ) -> dict[str, str]:
     attn_impl = {
-        "sdpa_paged": "sdpa",
-        "eager_paged": "eager",
+        "sdpa": "sdpa",
+        "eager": "eager",
         "paged_attention": "eager",  # TODO: this does not work on AMD docker
         "flash_paged": "flash_attention_2",  # TODO: this does not work on AMD docker
+        "kernels-community/flash-attn": "eager",
     }[attn_impl]
 
     model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.bfloat16, attn_implementation=attn_impl)
@@ -56,7 +59,7 @@ def generate_simple(
         # attention_mask = torch.ones_like(input_ids)
         outputs = model.generate(input_ids, generation_config=generation_config, use_model_defaults=False)
         generated_tokens = outputs[0][input_ids.shape[1] :]
-        decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=SKIP_SPECIAL_TOKENS)
         decoded_outputs[key] = decoded_output
     return decoded_outputs
 
@@ -99,7 +102,6 @@ def batch_generate(
     displayed_samples: int = 0,  # -1: no display, 0: display stats, >0: display inputs and some outputs
     output_file: Optional[str] = None,
     expected_outputs: Optional[list[str]] = None,
-    slice_inputs: bool = True,
 ) -> tuple[float, float]:
     # Actual batch generation
     if displayed_samples >= 0:
@@ -108,7 +110,6 @@
     batch_outputs = model.generate_batch(
         inputs=simple_batch_inputs,
         generation_config=generation_config,
-        slice_inputs=slice_inputs,  # TODO: move this to the generation config
     )
     end_time_simple = time.time()
     if displayed_samples >= 0:
@@ -118,19 +119,21 @@
     token_count = 0
     data = []
     for i, request in enumerate(batch_outputs):
-        input_text = tokenizer.decode(batch_outputs[request].prompt_ids, skip_special_tokens=True)
+        input_text = tokenizer.decode(batch_outputs[request].prompt_ids, skip_special_tokens=SKIP_SPECIAL_TOKENS)
         # The key is used to tie back to the output of unbatched generation
         key = " ".join(map(str, batch_outputs[request].prompt_ids))
         data.append({"input": input_text, "key": key})
 
         # Try to decode the output
         try:
-            output_text = tokenizer.decode(batch_outputs[request].generated_tokens, skip_special_tokens=True)
+            output_text = tokenizer.decode(
+                batch_outputs[request].generated_tokens, skip_special_tokens=SKIP_SPECIAL_TOKENS
+            )
             token_count += len(batch_outputs[request].generated_tokens[1:])
-            data[-1]["output"] = output_text
+            data[-1]["cb_outputs"] = output_text
         except Exception as e:
             print(f"Decoding failed for request {request}: {e}")
-            data[-1]["output"] = "__ERROR__"
+            data[-1]["cb_outputs"] = "__ERROR__"
             continue
 
         # Display sample if asked
@@ -148,7 +151,7 @@ def batch_generate(
         if expected_outputs is not None:
            expected_output = expected_outputs.pop(key)
            matches = output_text == expected_output  # TODO: rework this for a better distance metric
-            data[-1]["ref"] = expected_output
+            data[-1]["without_cb"] = expected_output
            data[-1]["matches"] = matches
            data[-1].pop("key")
            print(f"Request {i} matches" if matches else f"Request {i} does NOT match!")
@@ -186,19 +189,20 @@ def batch_generate(
 
     parser.add_argument("--attn", type=str, default="kernels-community/flash-attn", help="Attention implementation")
     parser.add_argument("--matmul-precision", "-mp", type=str, default="high")  # set to "none" to disable
-    parser.add_argument("--no-slice-inputs", action="store_true")  # slicing is enabled by default because much faster
-    parser.add_argument("--use-cuda-graph", "-cg", action="store_true")
-    parser.add_argument("--compile", action="store_true")
+    parser.add_argument("--cuda-graph", "-cg", help="Use cuda graphs", type=str, default=None)
+    parser.add_argument("--compile", action="store_true", help="Compile the model using torch.compile")
 
-    parser.add_argument("--samples", type=int, default=500)
+    parser.add_argument("--samples", type=int, default=500, help="Number of samples to generate")
     parser.add_argument("--displayed", type=int, default=0, help="Number of samples to display")
+    parser.add_argument("--log-level", type=str, default="INFO")
     parser.add_argument("--output-file", type=str, default=None)
     parser.add_argument("--compare", action="store_true")
     parser.add_argument("--metrics", action="store_true")
     parser.add_argument("--profile", type=str, default=None)
     args = parser.parse_args()
 
-    args.slice_inputs = not args.no_slice_inputs
+    # Set log level
+    logger.setLevel(args.log_level.upper())
 
     # If turned on, we setup metrics
     if args.metrics:
@@ -207,6 +211,15 @@ def batch_generate(
     # Set matmul precision if not none
     if args.matmul_precision != "none":
         torch.set_float32_matmul_precision(args.matmul_precision)
+    # Parse cuda graph argument
+    if args.cuda_graph is not None:
+        use_cuda_graph = {
+            "none": None,
+            "yes": True, "y": True, "true": True, "t": True, "1": True,
+            "no": False, "n": False, "false": False, "f": False, "0": False,
+        }[args.cuda_graph.lower()]  # fmt: skip
+    else:
+        use_cuda_graph = None
 
     # Prepare model
     model = AutoModelForCausalLM.from_pretrained(
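The --cuda-graph option is deliberately a string rather than a store_true switch, so it can express three states: unset/"none" (None, presumably falling back to the default behavior), explicitly on, or explicitly off; the dict lookup above raises a KeyError for anything else. A hypothetical standalone helper with the same behavior, shown only to spell out the mapping (not part of the commit):

from typing import Optional


def parse_cuda_graph_flag(value: Optional[str]) -> Optional[bool]:
    """Map the --cuda-graph CLI string to None (auto), True, or False."""
    if value is None or value.lower() == "none":
        return None  # leave the decision to the generation config defaults
    lowered = value.lower()
    if lowered in {"yes", "y", "true", "t", "1"}:
        return True
    if lowered in {"no", "n", "false", "f", "0"}:
        return False
    raise ValueError(f"Unrecognized --cuda-graph value: {value!r}")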
@@ -222,9 +235,6 @@ def batch_generate(
     # If turned on, we compile the model
     if args.compile:
         model.forward = torch.compile(model.forward, mode="max-autotune-no-cudagraphs")
-    if args.slice_inputs:
-        assert not args.compile, "Slicing inputs requires is not the model to be compiled"
-        assert not args.use_cuda_graph, "Slicing inputs is not compatible with cuda graphs"
 
     # Prepare tokenizer and dataset
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
@@ -237,10 +247,10 @@ def batch_generate(
     # Prepare generation config
     generation_config = GenerationConfig(
         max_new_tokens=512,
-        use_cuda_graph=args.use_cuda_graph,
+        use_cuda_graph=use_cuda_graph,
         eos_token_id=tokenizer.pad_token_id if FORCE_MAX_LENGTH else tokenizer.eos_token_id,
         pad_token_id=tokenizer.pad_token_id,
-        do_sample=True,
+        do_sample=not args.compare,
         temperature=0.8,
         top_p=0.9,
         num_blocks=args.num_blocks,
@@ -265,7 +275,6 @@ def batch_generate(
         generation_config,
         tokenizer,
         displayed_samples=-1,
-        slice_inputs=args.slice_inputs,
     )
 
     if args.profile is not None:
@@ -282,12 +291,11 @@ def batch_generate(
         displayed_samples=args.displayed,
         output_file=args.output_file,
         expected_outputs=expected_outputs,
-        slice_inputs=args.slice_inputs,
     )
     if args.profile is not None:
         filename = args.profile if args.profile.endswith(".json") else args.profile + ".json"
         prof.export_chrome_trace(filename)
 
 # Example usage:
-# python examples/pytorch/continuous_batching.py --attn sdpa_paged -mp none --slice-inputs --samples 3 --compare
+# python examples/pytorch/continuous_batching.py --attn sdpa_paged -mp none --samples 3 --compare
 # python examples/pytorch/continuous_batching.py --num-blocks 369 --max-batch-tokens 23 --attn sdpa_paged -mp none --samples 1 --displayed 0 --output-file sliced.json
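Putting the pieces together, here is a minimal sketch of continuous batching with CUDA graphs turned on, following the updated example. The generate_batch call, the use_cuda_graph generation-config field, and the per-request generated_tokens accessor come from the diff above; the model id, prompts, device placement, and attention implementation are placeholder assumptions.

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

MODEL_ID = "meta-llama/Meta-Llama-3-8B"  # default used by the updated example

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,
    attn_implementation="kernels-community/flash-attn",  # the example's CLI default; assumes the kernel fits your setup
    device_map="cuda",  # assumption: single-GPU placement
)

# Continuous batching consumes lists of token ids, one list per request.
prompts = ["What is continuous batching?", "Write a haiku about KV caches."]
batch_inputs = [tokenizer(p).input_ids for p in prompts]

generation_config = GenerationConfig(
    max_new_tokens=128,
    use_cuda_graph=True,  # the knob this commit restores; None keeps the default behavior
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=False,
)

batch_outputs = model.generate_batch(inputs=batch_inputs, generation_config=generation_config)
for request_id in batch_outputs:
    print(tokenizer.decode(batch_outputs[request_id].generated_tokens, skip_special_tokens=True))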

examples/pytorch/continuous_batching_simple.py

Lines changed: 0 additions & 2 deletions
@@ -68,7 +68,6 @@
 _ = model.generate_batch(
     inputs=simple_batch_inputs[: min(5, args.samples)],
     generation_config=generation_config,
-    slice_inputs=True,
 )
 
 # Actual batch generation
@@ -77,7 +76,6 @@
 batch_outputs = model.generate_batch(
     inputs=simple_batch_inputs,
     generation_config=generation_config,
-    slice_inputs=True,
 )
 end_time = time.time()
 print("Done with batch generation.")

src/transformers/generation/continuous_batching/cache.py

Lines changed: 5 additions & 6 deletions
@@ -204,8 +204,8 @@ def __init__(
         # Initialize the cache
         self.key_cache: list[torch.Tensor] = []
         self.value_cache: list[torch.Tensor] = []
-        # We add one extra token to the cache to handle padding and generally discard unwanted tokens
-        self.cache_shape = (num_blocks * self.block_size + 1, self.num_key_value_heads, self.head_dim)
+        # We add two extra tokens to the cache to handle padding and generally discard unwanted tokens
+        self.cache_shape = (num_blocks * self.block_size + 2, self.num_key_value_heads, self.head_dim)
         for _ in range(group_size):
             new_layer_key_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
             new_layer_value_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
@@ -290,7 +290,6 @@ def update(
         layer_idx: int,
         read_index: list[torch.Tensor],  # shape [num_layer_groups, seqlen_kv + past_length]
         write_index: list[torch.Tensor],  # shape [num_layer_groups, seqlen_q]
-        **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:  # shape [seqlen_kv + past_length, num_kv_heads, head_dim]
         """Update the cache with new key-value states for a specific layer. This method writes new KV states to the
         appropriate cache locations. The behavior differs based on the layer's attention type:
@@ -324,11 +323,11 @@
         # the only case where you may write over cache you need to use
         else:
             # Add the cache to the key and value states
-            mask = layer_read_index == -1  # TODO: can this can be efficiently precomputed?
+            mask = (layer_read_index == -1).unsqueeze(-1).unsqueeze(-1)  # TODO: should this be precomputed?
             key_states_with_cache = k_cache[layer_read_index, :, :]
-            key_states_with_cache[mask] = key_states
+            key_states_with_cache.masked_scatter_(mask, key_states)
             value_states_with_cache = v_cache[layer_read_index, :, :]
-            value_states_with_cache[mask] = value_states
+            value_states_with_cache.masked_scatter_(mask, value_states)
             # Write new KV values to the cache
             k_cache[layer_write_index, :, :] = key_states
             v_cache[layer_write_index, :, :] = value_states
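Two things are worth spelling out about the cache changes. First, the cache now reserves two scratch slots past the last block (previously one), used as write targets for padding and otherwise discarded tokens. Second, the update path replaces boolean-mask assignment with an in-place masked_scatter_ driven by a mask broadcast over the head and dim axes; unlike boolean indexing, which needs the data-dependent count of selected elements, this keeps tensor shapes static, which is presumably what makes the path CUDA-graph capturable. A small self-contained check of the equivalence, with made-up shapes:

import torch

# Hypothetical sizes: 6 cache slots, 2 KV heads, head_dim 4, and 3 new tokens to merge in.
num_slots, num_heads, head_dim, num_new = 6, 2, 4, 3

read_index = torch.tensor([0, -1, 2, -1, 4, -1])        # -1 marks positions to take from the new states
gathered = torch.randn(num_slots, num_heads, head_dim)  # stands in for k_cache[layer_read_index, :, :]
new_states = torch.randn(num_new, num_heads, head_dim)  # stands in for key_states

# Old form: boolean indexing, where the assignment target's shape depends on the mask contents.
ref = gathered.clone()
ref[read_index == -1] = new_states

# New form: broadcastable mask + in-place masked_scatter_, shapes stay static.
out = gathered.clone()
mask = (read_index == -1).unsqueeze(-1).unsqueeze(-1)   # [num_slots, 1, 1], broadcast over heads and dim
out.masked_scatter_(mask, new_states)

assert torch.equal(ref, out)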
