
Commit 850b6fa

Shixiaowei02 and kaiyux authored
Update TensorRT-LLM (#1358)
Co-authored-by: Kaiyu <26294424+kaiyux@users.noreply.github.com>
1 parent 66ca337 commit 850b6fa

File tree: 328 files changed, +436786 -6623 lines changed


CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@
   - Optimize AllReduce for parallel attention on Falcon and GPT-J
   - Enable split-k for weight-only cutlass kernel when SM>=75
 * Documentation
-  - Add [documentation for new builder workflow](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/new_workflow.md)
+  - Add [documentation for convert/build workflow](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/checkpoint.md)

 ## Versions 0.6.0 / 0.6.1

README.md

Lines changed: 3 additions & 2 deletions
@@ -282,6 +282,7 @@ The list of supported models is:
 * [InternLM](examples/internlm)
 * [LLaMA](examples/llama)
 * [LLaMA-v2](examples/llama)
+* [Mamba](examples/mamba)
 * [mBART](examples/enc_dec)
 * [Mistral](examples/llama#mistral-v01)
 * [MPT](examples/mpt)
@@ -454,7 +455,7 @@ For example: `mpirun -n 1 python3 examples/run.py ...`
   - Support FP16 fMHA on NVIDIA V100 GPU
 * API
   - Add a set of High-level APIs for end-to-end generation tasks (see examples/high-level-api/README.md)
-  - **[BREAKING CHANGES]** Migrate models to the new build workflow, including LLaMA, Mistral, Mixtral, InternLM, ChatGLM, Falcon, GPT-J, GPT-NeoX, Medusa, MPT, Baichuan and Phi (see docs/source/new_workflow.md)
+  - **[BREAKING CHANGES]** Migrate models to the new build workflow, including LLaMA, Mistral, Mixtral, InternLM, ChatGLM, Falcon, GPT-J, GPT-NeoX, Medusa, MPT, Baichuan and Phi (see docs/source/checkpoint.md)
   - **[BREAKING CHANGES]** Deprecate `LayerNorm` and `RMSNorm` plugins and removed corresponding build parameters
   - **[BREAKING CHANGES]** Remove optional parameter `maxNumSequences` for GPT manager
 * Bug fixes
@@ -482,7 +483,7 @@ For example: `mpirun -n 1 python3 examples/run.py ...`
   - Batch manager arguments documentation updates
   - Add documentation for best practices for tuning the performance of TensorRT-LLM (See docs/source/perf_best_practices.md)
   - Add documentation for Falcon AWQ support (See examples/falcon/README.md)
-  - Update to the `docs/source/new_workflow.md` documentation
+  - Update to the `docs/source/checkpoint.md` documentation
   - Update AWQ INT4 weight only quantization documentation for GPT-J
   - Add blog: Speed up inference with SOTA quantization techniques in TRT-LLM
   - Refine TensorRT-LLM backend README structure #133

benchmarks/cpp/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -19,8 +19,10 @@ set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")

 add_custom_target(benchmarks)

-set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
-add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)
+if(NOT TARGET cxxopts::cxxopts)
+  set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
+  add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)
+endif()

 function(add_benchmark test_name test_src)
   add_executable(${test_name} ${test_src})

benchmarks/cpp/README.md

Lines changed: 128 additions & 0 deletions
@@ -127,6 +127,7 @@ python prepare_dataset.py \

 For `tokenizer`, specifying either the path to a local tokenizer that has already been downloaded, or simply the name of a tokenizer on HuggingFace such as `meta-llama/Llama-2-7b`, will work; in the latter case the tokenizer is downloaded automatically (illustrated right after this hunk).

+
 #### Prepare TensorRT-LLM engines
 Please make sure that the engines are built with the arguments `--use_inflight_batching` and `--remove_input_padding` if you'd like to benchmark inflight batching; for more details, please see the documentation in the TensorRT-LLM examples.
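As an illustrative aside on the `tokenizer` argument described in the hunk above (a minimal sketch, not part of this commit): both forms reuse the `prepare_dataset.py` flags that appear in the LoRA benchmarking script added later in this diff; the output file name, request count, and local tokenizer directory are placeholder assumptions.

```
# Tokenizer passed as a local directory that was downloaded beforehand (placeholder path)
python benchmarks/cpp/prepare_dataset.py \
    --output token-norm-dist.json \
    --request-rate -1 \
    --time-delay-dist constant \
    --tokenizer ./Llama-2-7b-hf \
    token-norm-dist \
    --num-requests 100 \
    --input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24

# Tokenizer passed by HuggingFace name; it is downloaded automatically
python benchmarks/cpp/prepare_dataset.py \
    --output token-norm-dist.json \
    --request-rate -1 \
    --time-delay-dist constant \
    --tokenizer meta-llama/Llama-2-7b \
    token-norm-dist \
    --num-requests 100 \
    --input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
```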

@@ -187,3 +188,130 @@ Take GPT-350M as an example for single GPU with static batching
     --static_emulated_timeout 100 \
     --dataset ../../benchmarks/cpp/tokens-fixed-lengths.json
 ```
+
+#### Benchmarking LoRA
+
+Using either of the `prepare_dataset.py` methods above, add `--rand-task-id <start-id> <end-id>` to the command. This will add a random `task_id` from `<start-id>` to `<end-id>` inclusive.
+You can then use `utils/generate_rand_loras.py` to generate random LoRA weights for benchmarking purposes. `utils/generate_rand_loras.py` takes an example LoRA for the model you are benchmarking.
+Then you can run `gptManagerBenchmark` with `--type IFB` and `--lora_dir /path/to/utils/generate_rand_loras/output`.
+
+End-to-end LoRA benchmarking script:
+
+```
+git-lfs clone https://huggingface.co/meta-llama/Llama-2-13b-hf
+git-lfs clone https://huggingface.co/hfl/chinese-llama-2-lora-13b
+
+MODEL_CHECKPOINT=Llama-2-13b-hf
+CONVERTED_CHECKPOINT=Llama-2-13b-hf-ckpt
+TOKENIZER=Llama-2-13b-hf
+LORA_ENGINE=Llama-2-13b-hf-engine
+
+DTYPE=float16
+TP=2
+PP=1
+MAX_LEN=1024
+MAX_BATCH=32
+MAX_LORA_RANK=32
+
+SOURCE_LORA=chinese-llama-2-lora-13b
+CPP_LORA=chinese-llama-2-lora-13b-cpp
+
+EG_DIR=/tmp/lora-eg
+
+# Build LoRA-enabled engine
+python examples/llama/convert_checkpoint.py --model_dir ${MODEL_CHECKPOINT} \
+    --output_dir ${CONVERTED_CHECKPOINT} \
+    --dtype ${DTYPE} \
+    --tp_size ${TP} \
+    --pp_size 1 \
+    --lora_target_modules attn_qkv \
+    --max_lora_rank ${MAX_LORA_RANK}
+
+${HOME}/.local/bin/trtllm-build \
+    --checkpoint_dir ${CONVERTED_CHECKPOINT} \
+    --output_dir ${LORA_ENGINE} \
+    --max_batch_size ${MAX_BATCH} \
+    --max_input_len $MAX_LEN \
+    --max_output_len $MAX_LEN \
+    --gpt_attention_plugin float16 \
+    --paged_kv_cache enable \
+    --remove_input_padding enable \
+    --gemm_plugin float16 \
+    --lora_plugin float16 \
+    --use_paged_context_fmha enable \
+    --use_custom_all_reduce disable
+
+NUM_LORAS=(8 16 24 32 64 128 256)
+NUM_REQUESTS=1024
+
+# Convert LoRA to cpp format
+python examples/gpt/nemo_lora_convert.py \
+    -i $SOURCE_LORA \
+    --storage-type $DTYPE \
+    --write-cpp-runtime-tensors \
+    -o $CPP_LORA
+
+# Prepare datasets
+mkdir -p $EG_DIR/data
+
+# Prepare dataset without lora_task_id
+python benchmarks/cpp/prepare_dataset.py \
+    --output "${EG_DIR}/data/token-norm-dist.json" \
+    --request-rate -1 \
+    --time-delay-dist constant \
+    --tokenizer $TOKENIZER \
+    token-norm-dist \
+    --num-requests $NUM_REQUESTS \
+    --input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
+
+# Prepare datasets with lora_task_ids from 0 to $nloras - 1
+for nloras in ${NUM_LORAS[@]}; do
+  python benchmarks/cpp/prepare_dataset.py \
+      --output "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
+      --request-rate -1 \
+      --time-delay-dist constant \
+      --rand-task-id 0 $(( $nloras - 1 )) \
+      --tokenizer $TOKENIZER \
+      token-norm-dist \
+      --num-requests $NUM_REQUESTS \
+      --input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
+done
+
+# Generate random LoRA weights for 256 adapters
+python benchmarks/cpp/utils/generate_rand_loras.py ${CPP_LORA} ${EG_DIR}/loras 256
+
+# Perform benchmarking
+
+# First run inference without LoRAs
+mkdir -p ${EG_DIR}/log-base-lora
+mpirun -n ${TP} --output-filename ${EG_DIR}/log-base-lora \
+    cpp/build_Debug/benchmarks/gptManagerBenchmark \
+    --engine_dir $LORA_ENGINE \
+    --type IFB \
+    --dataset "${EG_DIR}/data/token-norm-dist.json" \
+    --lora_host_cache_bytes 8589934592 \
+    --lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
+    --kv_cache_free_gpu_mem_fraction 0.80 \
+    --log_level info \
+    --eos_id ${EOS_ID}
+
+# Now run inference with various numbers of LoRAs.
+# The host cache is set large enough to hold all the LoRAs in lora_dir.
+# The GPU cache is set to hold 32 LoRAs.
+# This benchmark will preload all the LoRAs into the host cache.
+# We run inference on a range of active LoRAs, exercising different cache miss rates.
+for nloras in ${NUM_LORAS[@]}; do
+  mkdir -p ${EG_DIR}/log-lora-${nloras}
+  mpirun -n ${TP} --output-filename "${EG_DIR}/log-lora-${nloras}" \
+      cpp/build_Debug/benchmarks/gptManagerBenchmark \
+      --engine_dir $LORA_ENGINE \
+      --type IFB \
+      --dataset "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
+      --lora_host_cache_bytes 8589934592 \
+      --lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
+      --kv_cache_free_gpu_mem_fraction 0.80 \
+      --log_level info \
+      --eos_id ${EOS_ID} \
+      --lora_dir ${EG_DIR}/loras
+done
+```
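One note on the script in the hunk above: `NUM_LAYERS`, `NUM_LORA_MODS`, and `EOS_ID` are referenced by the `gptManagerBenchmark` invocations but are not defined anywhere in the script, so they must be set before running it. A minimal sketch of plausible values, assuming the Llama-2-13b / `attn_qkv` setup shown above (these are assumptions to verify against your own model and tokenizer, not values taken from this commit):

```
# Assumed values only -- not part of the commit; verify against your model and tokenizer.
NUM_LAYERS=40      # Llama-2-13b has 40 transformer layers
NUM_LORA_MODS=1    # only the fused attn_qkv module is targeted via --lora_target_modules
EOS_ID=2           # end-of-sequence token id of the Llama-2 tokenizer
```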
