Commit 0e2771c

Merge branch 'main' into fdinatale/trtllm-bench/request_timeline

2 parents: 7cc8094 + 1191555

261 files changed: +9087 additions, -4328 deletions


.devcontainer/make_env.py

Lines changed: 3 additions & 1 deletion

@@ -180,9 +180,11 @@ def main():
     env_files = [
         JENKINS_PROPS_PATH,
         DEV_CONTAINER_ENV_PATH,
-        DEV_CONTAINER_USER_ENV_PATH,
     ]
 
+    if DEV_CONTAINER_USER_ENV_PATH.exists():
+        env_files.append(DEV_CONTAINER_USER_ENV_PATH)
+
     env = _load_env(env_files)
     _handle_rootless(env_inout=env)
 
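The change above stops assuming the per-user env file exists and appends it only when it is actually present. A minimal standalone sketch of the same pattern, with hypothetical file names standing in for the real path constants:

from pathlib import Path

# Required env files are always loaded; the user-specific one is optional.
required_env_files = [Path("jenkins.properties"), Path("devcontainer.env")]
user_env_file = Path("devcontainer.user.env")  # hypothetical optional file

env_files = list(required_env_files)
if user_env_file.exists():
    env_files.append(user_env_file)  # only include it when it exists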

.gitattributes

Lines changed: 2 additions & 1 deletion

@@ -1,7 +1,8 @@
 *.a filter=lfs diff=lfs merge=lfs -text
+*.dll filter=lfs diff=lfs merge=lfs -text
 *.lib filter=lfs diff=lfs merge=lfs -text
 *.so filter=lfs diff=lfs merge=lfs -text
-*.dll filter=lfs diff=lfs merge=lfs -text
+*.txz filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text
 *cubin.cpp filter=lfs diff=lfs merge=lfs -text

.github/scripts/label_community_user.py

Lines changed: 116 additions & 24 deletions

@@ -1,5 +1,6 @@
 import os
 import sys
+from datetime import datetime, timedelta, timezone
 
 import requests
 
@@ -97,6 +98,73 @@ def add_label_to_pr(repo_owner: str, repo_name: str, pr_number: str,
         raise e
 
 
+def get_recent_open_prs(repo_owner: str,
+                        repo_name: str,
+                        minutes_back: int = 65):
+    """Get open PRs created or updated in the last N minutes."""
+    cutoff_time = datetime.now(timezone.utc) - timedelta(minutes=minutes_back)
+
+    url = f"{GITHUB_API_URL}/repos/{repo_owner}/{repo_name}/pulls"
+    params = {
+        "state": "open",
+        "sort": "updated",
+        "direction": "desc",
+        "per_page": 100
+    }
+
+    recent_prs = []
+    page = 1
+
+    try:
+        while True:
+            params["page"] = page
+            response = requests.get(url,
+                                    headers=HEADERS,
+                                    params=params,
+                                    timeout=30)
+            response.raise_for_status()
+            page_prs = response.json()
+
+            if not page_prs:  # no more PRs
+                break
+
+            found_old_pr = False
+            for pr in page_prs:
+                created_at = datetime.strptime(
+                    pr["created_at"],
+                    "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+                updated_at = datetime.strptime(
+                    pr["updated_at"],
+                    "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+
+                if created_at >= cutoff_time or updated_at >= cutoff_time:
+                    recent_prs.append(pr)
+                else:
+                    # since sorted by updated desc, once we hit an old PR we can stop
+                    found_old_pr = True
+                    break
+
+            if found_old_pr:
+                break
+
+            page += 1
+            # safety limit to avoid infinite loops
+            if page > 10:  # max 1000 PRs (100 * 10)
+                print(
+                    f"Warning: Hit pagination limit at page {page}, may have missed some PRs"
+                )
+                break
+
+        print(
+            f"Found {len(recent_prs)} PRs created/updated in the last {minutes_back} minutes (checked {page} pages)"
+        )
+        return recent_prs
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching PRs: {e}")
+        raise
+
+
 def main():
     """
     Main function to check user membership and apply community labels.
@@ -106,45 +174,69 @@ def main():
     1 - Failed to determine user membership (API permission issues)
     2 - Failed to add community label (labeling API issues)
     """
-    pr_author = os.environ.get("PR_AUTHOR")
-    assert pr_author, "PR_AUTHOR environment variable not set"
-    pr_number = os.environ.get("PR_NUMBER")
-    assert pr_number, "PR_NUMBER environment variable not set"
     repo_owner = os.environ.get("REPO_OWNER")
     assert repo_owner, "REPO_OWNER environment variable not set"
     repo_name = os.environ.get("REPO_NAME")
     assert repo_name, "REPO_NAME environment variable not set"
     community_label = os.environ.get("COMMUNITY_LABEL")
     assert community_label, "COMMUNITY_LABEL environment variable not set"
+    time_window_minutes = int(os.environ.get("TIME_WINDOW_MINUTES"))
 
     print(
-        f"Starting NVIDIA membership check for PR author '{pr_author}' on PR #{pr_number}."
+        f"Starting community PR labeling sweep for {repo_owner}/{repo_name}. Time window: {time_window_minutes} minutes."
     )
 
     try:
-        is_member = check_user_membership("NVIDIA", pr_author)
-    except RuntimeError as e:
-        print(
-            f"Critical error during NVIDIA membership check for '{pr_author}': {e}"
-        )
-        print("Halting script due to inability to determine membership status.")
+        recent_prs = get_recent_open_prs(repo_owner, repo_name,
+                                         time_window_minutes)
+    except requests.exceptions.RequestException:
+        print("Failed to fetch recent PRs")
         sys.exit(1)
 
-    print(
-        f"User '{pr_author}' is determined to be an NVIDIA member: {is_member}")
+    processed_count = 0
+    labeled_count = 0
+
+    for pr in recent_prs:
+        pr_number = pr["number"]
+        pr_author = pr["user"]["login"]
+        existing_labels = {label["name"] for label in pr["labels"]}
+
+        if community_label in existing_labels:
+            print(
+                f"PR #{pr_number} by {pr_author} already has community label, skipping"
+            )
+            continue
+
+        print(f"Processing PR #{pr_number} by {pr_author}")
+        processed_count += 1
 
-    if not is_member:
-        print(
-            f"User '{pr_author}' is a community user. Adding label '{community_label}'."
-        )
         try:
-            add_label_to_pr(repo_owner, repo_name, pr_number, community_label)
-        except requests.exceptions.RequestException as e:
-            print(f"Failed to add community label: {e}")
-            sys.exit(2)
-    else:
-        print(
-            f"User '{pr_author}' is an NVIDIA member. No label will be added.")
+            is_member = check_user_membership("NVIDIA", pr_author)
+        except RuntimeError as e:
+            print(
+                f"Critical error during NVIDIA membership check for '{pr_author}': {e}"
+            )
+            print("Continuing with next PR...")
+            continue
+
+        if not is_member:
+            print(
+                f"User '{pr_author}' is a community user. Adding label '{community_label}'."
+            )
+            try:
+                add_label_to_pr(repo_owner, repo_name, str(pr_number),
+                                community_label)
+                labeled_count += 1
+            except requests.exceptions.RequestException as e:
+                print(f"Failed to add community label to PR #{pr_number}: {e}")
+                # continue with other PRs instead of exiting
+                continue
+        else:
+            print(f"User '{pr_author}' is an NVIDIA member. No label needed.")
+
+    print(
+        f"Sweep complete: processed {processed_count} PRs, labeled {labeled_count} as community"
+    )
 
 
 if __name__ == "__main__":
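The reworked script no longer reads a single PR from the event payload; it sweeps every open PR created or updated inside the time window and labels the community ones. A hedged sketch of a local invocation, assuming the same environment variables the workflow in the next file exports (owner/repo values here are only illustrative, and the token is whatever secret the workflow passes as AUTO_LABEL_COMMUNITY_TOKEN):

import os
import subprocess

env = dict(os.environ)
env.update({
    "AUTO_LABEL_COMMUNITY_TOKEN": "<token able to read org membership and add labels>",
    "REPO_OWNER": "NVIDIA",                # illustrative
    "REPO_NAME": "TensorRT-LLM",           # illustrative
    "COMMUNITY_LABEL": "Community want to contribute",
    "TIME_WINDOW_MINUTES": "65",
})

# Run the sweep once, exactly as the scheduled workflow would.
subprocess.run(["python", ".github/scripts/label_community_user.py"],
               env=env, check=True)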

.github/workflows/label_community_pr.yml

Lines changed: 11 additions & 5 deletions

@@ -1,8 +1,15 @@
 name: Label Community PR
 
 on:
-  pull_request:
-    types: [opened]
+  schedule:
+    - cron: '0 * * * *' # every hour at minute 0
+  workflow_dispatch: # manual trigger option
+    inputs:
+      time_window_minutes:
+        description: 'Time window in minutes to look back for PRs'
+        required: false
+        default: 65
+        type: number
 
 jobs:
   label_pr:
@@ -22,9 +29,8 @@ jobs:
       - name: Run labeling script
         env:
           AUTO_LABEL_COMMUNITY_TOKEN: ${{ secrets.AUTO_LABEL_COMMUNITY_TOKEN }}
-          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-          REPO_OWNER: ${{ github.event.repository.owner.login }}
+          REPO_OWNER: ${{ github.repository_owner }}
           REPO_NAME: ${{ github.event.repository.name }}
           COMMUNITY_LABEL: "Community want to contribute"
+          TIME_WINDOW_MINUTES: ${{ inputs.time_window_minutes || 65 }}
         run: python .github/scripts/label_community_user.py
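The workflow now runs on an hourly cron while the default look-back window is 65 minutes, presumably so consecutive sweeps overlap by a few minutes and a PR updated right at a trigger boundary is not missed. A small sketch of the cutoff the script derives from that window (mirroring get_recent_open_prs above):

from datetime import datetime, timedelta, timezone

time_window_minutes = 65  # default TIME_WINDOW_MINUTES from the workflow
cutoff_time = datetime.now(timezone.utc) - timedelta(minutes=time_window_minutes)
print(f"Sweeping open PRs created or updated after {cutoff_time.isoformat()}")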

.gitignore

Lines changed: 4 additions & 0 deletions

@@ -40,6 +40,9 @@ tensorrt_llm/libs
 tensorrt_llm/bindings.*.so
 tensorrt_llm/bindings.pyi
 tensorrt_llm/bindings/**/*.pyi
+tensorrt_llm/deep_ep/
+tensorrt_llm/deep_ep_cpp_tllm.*.so
+tensorrt_llm/deep_ep_cpp_tllm.pyi
 *docs/cpp_docs*
 *docs/source/_cpp_gen*
 docs/source/**/*.rst
@@ -55,6 +58,7 @@ llm-test-workspace/
 *.safetensors
 */tllm_debug/**
 *.patch
+!cpp/tensorrt_llm/deep_ep/*.patch
 
 # Generated files
 cpp/include/tensorrt_llm/executor/version.h

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions

@@ -27,6 +27,7 @@ repos:
         args: [--allow-multiple-documents]
         exclude: ".*/gitlab/.*.yml"
       - id: trailing-whitespace
+        exclude: '\.patch$'
       - id: check-toml
       - id: mixed-line-ending
         args: [--fix=lf]

README.md

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.0-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-1.0.0rc1-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.0.0rc2-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)

benchmarks/cpp/README.md

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ python3 prepare_dataset.py \
 ```
 
 For datasets that don't have prompt key, set --dataset-prompt instead.
-Take [cnn_dailymail dataset](https://huggingface.co/datasets/cnn_dailymail) for example:
+Take [cnn_dailymail dataset](https://huggingface.co/datasets/abisee/cnn_dailymail) for example:
 ```
 python3 prepare_dataset.py \
     --tokenizer <path/to/tokenizer> \

cpp/CMakeLists.txt

Lines changed: 40 additions & 10 deletions

@@ -39,6 +39,7 @@ option(FAST_BUILD "Skip compiling some kernels to accelerate compiling" OFF)
 option(FAST_MATH "Compiling in fast math mode" OFF)
 option(INDEX_RANGE_CHECK "Compiling with index range checks" OFF)
 option(COMPRESS_FATBIN "Compress everything in fatbin" ON)
+option(TIMING_NVCC "Enable nvcc build timing report" OFF)
 option(ENABLE_MULTI_DEVICE
        "Enable building with multi device support (requires NCCL, MPI,...)" ON)
 option(ENABLE_UCX "Enable building with UCX (Uniform Communication X) support"
@@ -135,10 +136,15 @@ configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/include/tensorrt_llm/executor/version.h)
 
 setup_cuda_compiler()
-setup_cuda_architectures()
 
 enable_language(C CXX CUDA)
 
+# Configure CUDA Architectures after enabling CUDA.
+
+# Old CMake rejects family conditional architectures during enable_language, But
+# after that CMake handles it just fine.
+setup_cuda_architectures()
+
 find_package(CUDAToolkit 11.2 REQUIRED COMPONENTS cudart_static cuda_driver
                                                   cublas cublasLt curand nvml)
 
@@ -323,6 +329,10 @@ endif()
 if(COMPRESS_FATBIN)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fatbin-options -compress-all")
 endif()
+if(NVCC_TIMING)
+  set(CMAKE_CUDA_FLAGS
+      "${CMAKE_CUDA_FLAGS} --time ${CMAKE_CURRENT_BINARY_DIR}/nvcc-timing.csv")
+endif()
 message("CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
 
 set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR})
@@ -345,15 +355,6 @@ if(NOT WIN32 AND NOT DEFINED USE_CXX11_ABI)
 endif()
 
 if(BUILD_PYT)
-  # Build TORCH_CUDA_ARCH_LIST
-  set(TORCH_CUDA_ARCH_LIST "")
-  foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    string(REGEX REPLACE "^([1-9][0-9]*)([0-9]a?)-real$" "\\1.\\2" TORCH_ARCH
-                         ${CUDA_ARCH})
-    list(APPEND TORCH_CUDA_ARCH_LIST ${TORCH_ARCH})
-  endforeach()
-
-  message(STATUS "TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST}")
   # ignore values passed from the environment
   if(DEFINED ENV{TORCH_CUDA_ARCH_LIST})
     message(
@@ -362,6 +363,20 @@ if(BUILD_PYT)
     )
   endif()
   unset(ENV{TORCH_CUDA_ARCH_LIST})
+  # Torch maintains custom logic to add CUDA architecture flags into
+  # CMAKE_CUDA_FLAGS based on TORCH_CUDA_ARCH_LIST variable, instead of using
+  # the native support introduced in newer CMake versions. And it always tries
+  # to add some flags, even given empty TORCH_CUDA_ARCH_LIST.
+
+  # We prefer CMake's native support to be able to easily customize the CUDA
+  # architectures to be compiled for, for each kernel individually. So we set
+  # TORCH_CUDA_ARCH_LIST to a placeholder value and remove the generated flags
+  # then to effectively prevent Torch from adding CUDA architecture flags.
+  message(
+    STATUS
+      "Set TORCH_CUDA_ARCH_LIST to placeholder value \"8.0\" to make Torch happy. "
+      "This is NOT the list of architectures that will be compiled for.")
+  set(TORCH_CUDA_ARCH_LIST "8.0")
 
   find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
   message(STATUS "Found Python executable at ${Python3_EXECUTABLE}")
@@ -391,7 +406,22 @@ print(os.path.dirname(torch.__file__),end='');"
   list(APPEND CMAKE_PREFIX_PATH ${TORCH_DIR})
   set(USE_SYSTEM_NVTX ON)
   set(nvtx3_dir ${3RDPARTY_DIR}/NVTX/include)
+  set(CMAKE_CUDA_ARCHITECTURES_BACKUP ${CMAKE_CUDA_ARCHITECTURES})
   find_package(Torch REQUIRED)
+  set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_BACKUP})
+  message(
+    STATUS
+      "Removing Torch generated placeholder CUDA architecture flags: -gencode arch=compute_80,code=sm_80."
+  )
+  string(REPLACE "-gencode arch=compute_80,code=sm_80 " "" CMAKE_CUDA_FLAGS_NEW
+                 "${CMAKE_CUDA_FLAGS}")
+  if("${CMAKE_CUDA_FLAGS_NEW}" STREQUAL "${CMAKE_CUDA_FLAGS}")
+    message(
+      FATAL_ERROR
+        "Torch didn't generate expected placeholder CUDA architecture flags.")
+  endif()
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS_NEW}")
+
   add_compile_definitions(TORCH_CUDA=1)
 
   if(DEFINED TORCH_CXX_FLAGS)
