
Commit 850b6fa

Shixiaowei02 and kaiyux authored
Update TensorRT-LLM (#1358)
Co-authored-by: Kaiyu <26294424+kaiyux@users.noreply.github.com>
1 parent 66ca337 commit 850b6fa

File tree: 328 files changed, +436786 -6623 lines changed


CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@
   - Optimize AllReduce for parallel attention on Falcon and GPT-J
   - Enable split-k for weight-only cutlass kernel when SM>=75
 * Documentation
-  - Add [documentation for new builder workflow](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/new_workflow.md)
+  - Add [documentation for convert/build workflow](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/checkpoint.md)

 ## Versions 0.6.0 / 0.6.1

README.md

Lines changed: 3 additions & 2 deletions
@@ -282,6 +282,7 @@ The list of supported models is:
 * [InternLM](examples/internlm)
 * [LLaMA](examples/llama)
 * [LLaMA-v2](examples/llama)
+* [Mamba](examples/mamba)
 * [mBART](examples/enc_dec)
 * [Mistral](examples/llama#mistral-v01)
 * [MPT](examples/mpt)
@@ -454,7 +455,7 @@ For example: `mpirun -n 1 python3 examples/run.py ...`
   - Support FP16 fMHA on NVIDIA V100 GPU
 * API
   - Add a set of High-level APIs for end-to-end generation tasks (see examples/high-level-api/README.md)
-  - **[BREAKING CHANGES]** Migrate models to the new build workflow, including LLaMA, Mistral, Mixtral, InternLM, ChatGLM, Falcon, GPT-J, GPT-NeoX, Medusa, MPT, Baichuan and Phi (see docs/source/new_workflow.md)
+  - **[BREAKING CHANGES]** Migrate models to the new build workflow, including LLaMA, Mistral, Mixtral, InternLM, ChatGLM, Falcon, GPT-J, GPT-NeoX, Medusa, MPT, Baichuan and Phi (see docs/source/checkpoint.md)
   - **[BREAKING CHANGES]** Deprecate `LayerNorm` and `RMSNorm` plugins and removed corresponding build parameters
   - **[BREAKING CHANGES]** Remove optional parameter `maxNumSequences` for GPT manager
 * Bug fixes
@@ -482,7 +483,7 @@ For example: `mpirun -n 1 python3 examples/run.py ...`
   - Batch manager arguments documentation updates
   - Add documentation for best practices for tuning the performance of TensorRT-LLM (See docs/source/perf_best_practices.md)
   - Add documentation for Falcon AWQ support (See examples/falcon/README.md)
-  - Update to the `docs/source/new_workflow.md` documentation
+  - Update to the `docs/source/checkpoint.md` documentation
   - Update AWQ INT4 weight only quantization documentation for GPT-J
   - Add blog: Speed up inference with SOTA quantization techniques in TRT-LLM
   - Refine TensorRT-LLM backend README structure #133

benchmarks/cpp/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -19,8 +19,10 @@ set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")

 add_custom_target(benchmarks)

-set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
-add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)
+if(NOT TARGET cxxopts::cxxopts)
+  set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
+  add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)
+endif()

 function(add_benchmark test_name test_src)
   add_executable(${test_name} ${test_src})

benchmarks/cpp/README.md

Lines changed: 128 additions & 0 deletions
@@ -127,6 +127,7 @@ python prepare_dataset.py \

 For `tokenizer`, specifying either the path to a local tokenizer that has already been downloaded, or simply the name of a tokenizer on HuggingFace such as `meta-llama/Llama-2-7b`, will work; in the latter case the tokenizer is downloaded automatically (illustrated right after this hunk).

+
 #### Prepare TensorRT-LLM engines
 Please make sure that the engines are built with the arguments `--use_inflight_batching` and `--remove_input_padding` if you'd like to benchmark inflight batching; for more details, please see the documentation in the TensorRT-LLM examples.
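As an illustrative aside on the `tokenizer` argument described in the hunk above (a minimal sketch, not part of this commit): both forms reuse the `prepare_dataset.py` flags that appear in the LoRA benchmarking script added later in this diff; the output file name, request count, and local tokenizer directory are placeholder assumptions.

```
# Tokenizer passed as a local directory that was downloaded beforehand (placeholder path)
python benchmarks/cpp/prepare_dataset.py \
    --output token-norm-dist.json \
    --request-rate -1 \
    --time-delay-dist constant \
    --tokenizer ./Llama-2-7b-hf \
    token-norm-dist \
    --num-requests 100 \
    --input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24

# Tokenizer passed by HuggingFace name; it is downloaded automatically
python benchmarks/cpp/prepare_dataset.py \
    --output token-norm-dist.json \
    --request-rate -1 \
    --time-delay-dist constant \
    --tokenizer meta-llama/Llama-2-7b \
    token-norm-dist \
    --num-requests 100 \
    --input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
```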

@@ -187,3 +188,130 @@ Take GPT-350M as an example for single GPU with static batching
     --static_emulated_timeout 100 \
     --dataset ../../benchmarks/cpp/tokens-fixed-lengths.json
 ```
+
+#### Benchmarking LoRA
+
+Using either of the `prepare_dataset.py` methods above, add `--rand-task-id <start-id> <end-id>` to the command. This will add a random `task_id` from `<start-id>` to `<end-id>` inclusive.
+You can then use `utils/generate_rand_loras.py` to generate random LoRA weights for benchmarking purposes. `utils/generate_rand_loras.py` takes an example LoRA for the model you are benchmarking.
+Then you can run `gptManagerBenchmark` with `--type IFB` and `--lora_dir /path/to/utils/generate_rand_loras/output`.
+
+End-to-end LoRA benchmarking script:
+
+```
+git-lfs clone https://huggingface.co/meta-llama/Llama-2-13b-hf
+git-lfs clone https://huggingface.co/hfl/chinese-llama-2-lora-13b
+
+MODEL_CHECKPOINT=Llama-2-13b-hf
+CONVERTED_CHECKPOINT=Llama-2-13b-hf-ckpt
+TOKENIZER=Llama-2-13b-hf
+LORA_ENGINE=Llama-2-13b-hf-engine
+
+DTYPE=float16
+TP=2
+PP=1
+MAX_LEN=1024
+MAX_BATCH=32
+MAX_LORA_RANK=32
+
+SOURCE_LORA=chinese-llama-2-lora-13b
+CPP_LORA=chinese-llama-2-lora-13b-cpp
+
+EG_DIR=/tmp/lora-eg
+
+# Build LoRA-enabled engine
+python examples/llama/convert_checkpoint.py --model_dir ${MODEL_CHECKPOINT} \
+    --output_dir ${CONVERTED_CHECKPOINT} \
+    --dtype ${DTYPE} \
+    --tp_size ${TP} \
+    --pp_size 1 \
+    --lora_target_modules attn_qkv \
+    --max_lora_rank ${MAX_LORA_RANK}
+
+${HOME}/.local/bin/trtllm-build \
+    --checkpoint_dir ${CONVERTED_CHECKPOINT} \
+    --output_dir ${LORA_ENGINE} \
+    --max_batch_size ${MAX_BATCH} \
+    --max_input_len $MAX_LEN \
+    --max_output_len $MAX_LEN \
+    --gpt_attention_plugin float16 \
+    --paged_kv_cache enable \
+    --remove_input_padding enable \
+    --gemm_plugin float16 \
+    --lora_plugin float16 \
+    --use_paged_context_fmha enable \
+    --use_custom_all_reduce disable
+
+NUM_LORAS=(8 16 24 32 64 128 256)
+NUM_REQUESTS=1024
+
+# Convert LoRA to cpp format
+python examples/gpt/nemo_lora_convert.py \
+    -i $SOURCE_LORA \
+    --storage-type $DTYPE \
+    --write-cpp-runtime-tensors \
+    -o $CPP_LORA
+
+# Prepare datasets
+mkdir -p $EG_DIR/data
+
+# Prepare dataset without lora_task_id
+python benchmarks/cpp/prepare_dataset.py \
+    --output "${EG_DIR}/data/token-norm-dist.json" \
+    --request-rate -1 \
+    --time-delay-dist constant \
+    --tokenizer $TOKENIZER \
+    token-norm-dist \
+    --num-requests $NUM_REQUESTS \
+    --input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
+
+# Prepare datasets with lora_task_ids from 0 to $nloras - 1
+for nloras in ${NUM_LORAS[@]}; do
+  python benchmarks/cpp/prepare_dataset.py \
+      --output "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
+      --request-rate -1 \
+      --time-delay-dist constant \
+      --rand-task-id 0 $(( $nloras - 1 )) \
+      --tokenizer $TOKENIZER \
+      token-norm-dist \
+      --num-requests $NUM_REQUESTS \
+      --input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
+done
+
+# Generate random LoRA weights for 256 adapters
+python benchmarks/cpp/utils/generate_rand_loras.py ${CPP_LORA} ${EG_DIR}/loras 256
+
+# Perform benchmarking
+
+# First run inference without LoRAs
+mkdir -p ${EG_DIR}/log-base-lora
+mpirun -n ${TP} --output-filename ${EG_DIR}/log-base-lora \
+    cpp/build_Debug/benchmarks/gptManagerBenchmark \
+    --engine_dir $LORA_ENGINE \
+    --type IFB \
+    --dataset "${EG_DIR}/data/token-norm-dist.json" \
+    --lora_host_cache_bytes 8589934592 \
+    --lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
+    --kv_cache_free_gpu_mem_fraction 0.80 \
+    --log_level info \
+    --eos_id ${EOS_ID}
+
+# Now run inference with various numbers of LoRAs.
+# The host cache is set large enough to hold all the LoRAs in lora_dir.
+# The GPU cache is set to hold 32 LoRAs.
+# This benchmark will preload all the LoRAs into the host cache.
+# We run inference on a range of active LoRAs, exercising different cache miss rates.
+for nloras in ${NUM_LORAS[@]}; do
+  mkdir -p ${EG_DIR}/log-lora-${nloras}
+  mpirun -n ${TP} --output-filename "${EG_DIR}/log-lora-${nloras}" \
+      cpp/build_Debug/benchmarks/gptManagerBenchmark \
+      --engine_dir $LORA_ENGINE \
+      --type IFB \
+      --dataset "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
+      --lora_host_cache_bytes 8589934592 \
+      --lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
+      --kv_cache_free_gpu_mem_fraction 0.80 \
+      --log_level info \
+      --eos_id ${EOS_ID} \
+      --lora_dir ${EG_DIR}/loras
+done
+```
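One note on the script in the hunk above: `NUM_LAYERS`, `NUM_LORA_MODS`, and `EOS_ID` are referenced by the `gptManagerBenchmark` invocations but are not defined anywhere in the script, so they must be set before running it. A minimal sketch of plausible values, assuming the Llama-2-13b / `attn_qkv` setup shown above (these are assumptions to verify against your own model and tokenizer, not values taken from this commit):

```
# Assumed values only -- not part of the commit; verify against your model and tokenizer.
NUM_LAYERS=40      # Llama-2-13b has 40 transformer layers
NUM_LORA_MODS=1    # only the fused attn_qkv module is targeted via --lora_target_modules
EOS_ID=2           # end-of-sequence token id of the Llama-2 tokenizer
```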
