Skip to content

Commit d02fb06

Browse files
committed
Add the ability to write a request timeline.
Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
1 parent 1191555 commit d02fb06

File tree

3 files changed

+27
-1
lines changed

3 files changed

+27
-1
lines changed

tensorrt_llm/bench/benchmark/throughput.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,16 @@
222222
required=False,
223223
help="Path where output should be written to.",
224224
)
225+
@optgroup.option(
226+
"--request_json",
227+
type=click.Path(dir_okay=False,
228+
writable=True,
229+
readable=False,
230+
path_type=Path,
231+
resolve_path=True),
232+
required=False,
233+
help="Path where per request information is written to.",
234+
)
225235
@optgroup.option(
226236
"--enable_chunked_context",
227237
is_flag=True,
@@ -262,6 +272,7 @@ def throughput_command(
262272
# Reporting options
263273
report_json: Path = params.pop("report_json")
264274
output_json: Path = params.pop("output_json")
275+
request_json: Path = params.pop("request_json")
265276
iteration_log: Path = params.pop("iteration_log")
266277
iteration_writer = IterationWriter(iteration_log)
267278

@@ -433,6 +444,10 @@ def throughput_command(
433444
with open(output_json, "w") as f:
434445
output_token_info = report_utility.get_output_tokens(tokenizer)
435446
f.write(json.dumps(output_token_info, indent=4))
447+
if request_json:
448+
logger.info(f"Writing request information to {request_json}.")
449+
with open(request_json, "w") as f:
450+
f.write(json.dumps(report_utility.get_request_info(tokenizer)))
436451
report_utility.report_statistics()
437452
except KeyboardInterrupt:
438453
logger.info("Keyboard interrupt, exiting benchmark...")

tensorrt_llm/bench/benchmark/utils/asynchronous.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ async def process_request(self, request: InferenceRequest,
7878
request_perf_item = PerfItemTuple(
7979
start_timestamp=request_start_timestamp,
8080
end_timestamp=response_end_timestamp,
81-
request_id=response.request_id,
81+
request_id=response.id,
8282
num_input_tokens=len(output.prompt_token_ids),
8383
response_is_final=response.finished,
8484
error=False,

tensorrt_llm/bench/dataclasses/reporting.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def register_request_perf_item(self, request_perf_item: PerfItemTuple):
5959
Register request perf items, used exclusively with LLM API.
6060
"""
6161
record = self.requests[request_perf_item.request_id]
62+
record.id = request_perf_item.request_id
6263
record.num_input_tokens = request_perf_item.num_input_tokens
6364
record.start_timestamp = request_perf_item.start_timestamp
6465
record.register_event(request_perf_item.error,
@@ -220,6 +221,16 @@ def get_output_tokens(self, tokenizer) -> Dict[int, List[str]]:
220221
retval[req_id] = output_str
221222
return dict(sorted(retval.items()))
222223

224+
def get_request_info(self, tokenizer) -> List[Dict[str, Any]]:
    """Build a per-request summary suitable for JSON serialization.

    For each tracked request, dump its record to a plain dict, decode the
    stored token IDs into text, and replace the raw ``tokens`` list with
    the decoded ``output`` string and an ``output_tokens`` count.

    Args:
        tokenizer: Tokenizer exposing ``decode(token_ids) -> str``
            (presumably a HuggingFace-style tokenizer — confirm at caller).

    Returns:
        A list with one dict per request; each dict contains the request
        record's fields plus ``output`` (decoded text) and
        ``output_tokens`` (token count), with ``tokens`` removed.
    """
    # NOTE(review): the original annotation was Dict[int, List[str]], but the
    # method clearly returns a list of dicts — corrected here.
    summaries: List[Dict[str, Any]] = []
    for request in self.raw_statistics.requests.values():
        entry = request.model_dump()
        # Pop the raw token IDs first so the entry never carries both the
        # raw list and its decoded form.
        tokens = entry.pop("tokens")
        entry["output"] = tokenizer.decode(tokens)
        entry["output_tokens"] = len(tokens)
        summaries.append(entry)
    return summaries
233+
223234
def get_statistics_dict(self) -> Dict[str, Any]:
224235
"""Get statistics as a dictionary.
225236

0 commit comments

Comments (0)