1 file changed
+1
-1
lines changedSubmodule tensorrt_llm updated 95 files
- .dockerignore+1
- .github/CODEOWNERS+6-5
- README.md+4-1
- cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h+14-5
- cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp+15-10
- cpp/tensorrt_llm/common/attentionOp.cpp+70-13
- cpp/tensorrt_llm/common/attentionOp.h+16
- cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu+31-6
- cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.h+1
- cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h+2
- cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp+1-2
- cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp+1-2
- cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp+3
- cpp/tensorrt_llm/kernels/gptKernels.cu+9-4
- cpp/tensorrt_llm/kernels/gptKernels.h+4-2
- cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp+2-1
- cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h+2-7
- cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h+43-24
- cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h+3-9
- cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h+40-18
- cpp/tensorrt_llm/kernels/xqaDispatcher.cpp+38-22
- cpp/tensorrt_llm/kernels/xqaDispatcher.h+6-3
- cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp+12
- cpp/tensorrt_llm/nanobind/bindings.cpp+2
- cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp+12
- cpp/tensorrt_llm/pybind/bindings.cpp+2
- cpp/tensorrt_llm/thop/attentionOp.cpp+51-11
- cpp/tensorrt_llm/thop/moeOp.cpp+28-13
- cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp+6-2
- cpp/tests/unit_tests/kernels/ropeTest.cu+4-11
- cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp+260-51
- docker/Dockerfile.multi+2-1
- docker/Makefile+1-1
- docs/source/blogs/media/tech_blog10_baseline_performance_detail.png
- docs/source/blogs/media/tech_blog10_context_wait_performance.png
- docs/source/blogs/media/tech_blog10_full_strategy_performance.png
- docs/source/index.rst+6
- docs/source/legacy/tensorrt_quickstart.md+9
- examples/constraints.txt+1-1
- examples/llm-api/_tensorrt_engine/quickstart_example.py+33
- examples/llm-api/llm_runtime.py+2-3
- examples/llm-api/quickstart_example.py+8-2
- examples/llm-api/quickstart_multimodal.py+83
- examples/models/core/deepseek_v3/README.md+78-18
- examples/models/core/multimodal/README.md+70
- jenkins/L0_Test.groovy+48-20
- tensorrt_llm/_mnnvl_utils.py+6-4
- tensorrt_llm/_torch/attention_backend/trtllm.py+25-12
- tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py+2-4
- tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py+5-4
- tensorrt_llm/_torch/models/modeling_deepseekv3.py+19-3
- tensorrt_llm/_torch/models/modeling_gemma3vl.py-14
- tensorrt_llm/_torch/models/modeling_llama.py+86-57
- tensorrt_llm/_torch/models/modeling_mistral.py+27-6
- tensorrt_llm/_torch/models/modeling_phi4mm.py+65-421
- tensorrt_llm/_torch/models/modeling_speculative.py+1-13
- tensorrt_llm/_torch/modules/attention.py+11-1
- tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py+2-1
- tensorrt_llm/_torch/modules/linear.py+95-12
- tensorrt_llm/_torch/pyexecutor/_util.py+7-2
- tensorrt_llm/_torch/pyexecutor/model_engine.py+45-6
- tensorrt_llm/_torch/pyexecutor/py_executor.py+20-19
- tensorrt_llm/_torch/pyexecutor/py_executor_creator.py+41-10
- tensorrt_llm/_torch/pyexecutor/resource_manager.py+50-8
- tensorrt_llm/_torch/speculative/drafting_loops.py+150
- tensorrt_llm/_torch/speculative/model_drafter.py+34-3
- tensorrt_llm/_utils.py+17-10
- tensorrt_llm/executor/executor.py+2-2
- tensorrt_llm/executor/worker.py+85-43
- tensorrt_llm/llmapi/llm_utils.py+2-2
- tensorrt_llm/scaffolding/contrib/Dynasor/README.md+3-3
- tensorrt_llm/version.py+1-1
- tests/integration/defs/accuracy/references/cnn_dailymail.yaml+6-2
- tests/integration/defs/accuracy/references/gsm8k.yaml+11-1
- tests/integration/defs/accuracy/references/mmlu.yaml+11-1
- tests/integration/defs/accuracy/test_cli_flow.py+1
- tests/integration/defs/accuracy/test_disaggregated_serving.py+13-5
- tests/integration/defs/accuracy/test_llm_api_pytorch.py+58-13
- tests/integration/defs/llmapi/test_llm_examples.py+9-2
- tests/integration/defs/perf/pytorch_model_config.py+3-4
- tests/integration/defs/test_e2e.py+213-6
- tests/integration/test_lists/qa/llm_function_full.txt+6-2
- tests/integration/test_lists/qa/llm_function_l20.txt+1
- tests/integration/test_lists/qa/llm_function_sanity.txt+3-2
- tests/integration/test_lists/qa/llm_perf_sanity.yml+2-2
- tests/integration/test_lists/test-db/l0_b200.yml+2
- tests/integration/test_lists/test-db/l0_h100.yml+1
- tests/integration/test_lists/test-db/l0_sanity_check.yml+1
- tests/integration/test_lists/waives.txt+27-11
- tests/unittest/_torch/executor/test_pytorch_model_engine.py+1
- tests/unittest/_torch/modeling/test_modeling_mistral.py+55
- tests/unittest/_torch/speculative/test_eagle3.py+1
- tests/unittest/llmapi/apps/_test_openai_chat_json.py+1-5
- tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py+34-20
- triton_backend/inflight_batcher_llm/scripts/build.sh+23-11
0 commit comments