
Commit 3c7931c

Merge pull request #505 from NVlabs/jit
feat: JIT mode for 2-5x inference and 1.5x training speed up
2 parents: d9e5274 + 1317f3b
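
Before the file-by-file diff, here is how the new mode is enabled, quoted from the README changes included in this commit (the surrounding `config` and dimension variables are the README's own placeholders):

```cpp
// C++ API (from the updated README): create a model, then opt into JIT fusion where supported.
auto model = create_from_config(n_input_dims, n_output_dims, config);
model->set_jit_fusion(supports_jit_fusion()); // Optional: accelerate with JIT fusion
```

The PyTorch bindings gain an equivalent switch, `model.jit_fusion = tcnn.supports_jit_fusion()`, visible in the README diff further down.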

103 files changed: +23,472 / -467 lines (large commit; only part of the diff is shown below)

.editorconfig

Lines changed: 2 additions & 1 deletion
@@ -6,10 +6,11 @@ insert_final_newline = true
 indent_style = tab
 indent_size = 4
 trim_trailing_whitespace = true
+max_line_length = 140

 [*.md]
 trim_trailing_whitespace = false

-[*.{clangd,yml}]
+[*.{clangd,nix,yml}]
 indent_style = space
 indent_size = 2

.github/workflows/main.yml

Lines changed: 2 additions & 5 deletions
@@ -30,9 +30,6 @@ jobs:
       - os: ubuntu-22.04
         cuda: "11.8"
         arch: 89
-      - os: ubuntu-22.04
-        cuda: "11.7"
-        arch: 89
       - os: ubuntu-22.04
         cuda: "11.7"
         arch: 86
@@ -67,7 +64,7 @@ jobs:
         run: ./dependencies/cuda-cmake-github-actions/scripts/actions/install_cuda_ubuntu.sh
         shell: bash
       - name: CMake
-        run: cmake . -B ${{ env.build_dir }} -DCMAKE_BUILD_TYPE=${{ env.config }}
+        run: cmake . -B ${{ env.build_dir }} -DCMAKE_BUILD_TYPE=${{ env.config }} -DTCNN_BUILD_TESTS=1
       - name: Build
         working-directory: ${{ env.build_dir }}
         run: cmake --build . --target all --verbose -j `nproc`
@@ -121,7 +118,7 @@ jobs:
         shell: powershell
         run: .\dependencies\cuda-cmake-github-actions\scripts\actions\install_cuda_windows.ps1
       - name: CMake
-        run: cmake . -B ${{ env.build_dir }} -G "${{ matrix.visual_studio }}" -A x64
+        run: cmake . -B ${{ env.build_dir }} -G "${{ matrix.visual_studio }}" -A x64 -DTCNN_BUILD_TESTS=1
       - name: Build
         working-directory: ${{ env.build_dir }}
         run: cmake --build . --config ${{ env.config }} --target ALL_BUILD --verbose

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 __pycache__
 build*
 dist
+rtc
 /.cache
 /.vscode
 /.direnv

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -4,3 +4,6 @@
 [submodule "dependencies/fmt"]
     path = dependencies/fmt
     url = https://github.com/fmtlib/fmt
+[submodule "dependencies/cmrc"]
+    path = dependencies/cmrc
+    url = https://github.com/vector-of-bool/cmrc

CITATION.cff

Lines changed: 1 addition & 1 deletion
@@ -20,5 +20,5 @@ keywords:
 - 'neural network, tiny, tensor cores, cuda'
 license: BSD-3-Clause
 license-url: https://github.com/NVlabs/tiny-cuda-nn/blob/master/LICENSE.txt
-version: 1.7
+version: 2.0
 date-released: '2021-04-21'

CMakeLists.txt

Lines changed: 117 additions & 19 deletions
@@ -1,5 +1,5 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
-#
+# Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
+#
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:
 #     * Redistributions of source code must retain the above copyright notice, this list of
@@ -10,7 +10,7 @@
 #     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 #       to endorse or promote products derived from this software without specific prior written
 #       permission.
-#
+#
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
@@ -24,15 +24,23 @@ cmake_minimum_required(VERSION 3.18)

 project(
     tiny-cuda-nn
-    VERSION 1.7
+    VERSION 2.0
     DESCRIPTION "Lightning fast & tiny C++/CUDA neural network framework"
     LANGUAGES CXX CUDA
 )

+option(TCNN_ALLOW_CUBLAS_CUSOLVER "Allows tiny-cuda-nn to use cuBLAS and cuSolver. Only required for the Shampoo optimizer." OFF)
 option(TCNN_BUILD_BENCHMARK "Build tiny-cuda-nn example benchmark?" ON)
 option(TCNN_BUILD_EXAMPLES "Build tiny-cuda-nn example applications?" ON)
-option(TCNN_ALLOW_CUBLAS_CUSOLVER "Allows tiny-cuda-nn to use cuBLAS and cuSolver. Only required for the Shampoo optimizer." OFF)
+option(TCNN_BUILD_NO_FWD_BWD "Build without offline compiled forward and backward kernels?" OFF)
+option(TCNN_BUILD_TESTS "Build tiny-cuda-nn's tests?" OFF)
+option(TCNN_BUILD_WITH_RTC "Build support for runtime compilation of fully fused kernels?" ON)
+option(TCNN_BUILD_USE_FAST_MATH "Build tiny-cuda-nn with '--use_fast_math' option?" ON)
+
+set(TCNN_EXTERNAL_FMT "" CACHE STRING "If non-empty, the `fmt` target is supplied externally with the given name.")
+
 set(TCNN_CUDA_ARCHITECTURES "" CACHE STRING "Build tiny-cuda-nn for a specific GPU architecture.")
+option(TCNN_LINK_CUDA "Link tiny-cuda-nn to CUDA libraries?" ON)

 ###############################################################################
 # Build type and C++ compiler setup
@@ -57,6 +65,11 @@ if (APPLE)
     set(CMAKE_MACOSX_RPATH ON)
 endif()

+if (CMAKE_EXPORT_COMPILE_COMMANDS)
+    set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES ${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES})
+    set(CMAKE_CUDA_STANDARD_INCLUDE_DIRECTORIES ${CMAKE_CUDA_IMPLICIT_INCLUDE_DIRECTORIES})
+endif()
+
 if (MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
@@ -204,17 +217,44 @@ if (CUDA_VERSION VERSION_LESS 10.2)
     message(FATAL_ERROR "CUDA version too low. tiny-cuda-nn require CUDA 10.2 or higher.")
 endif()

-list(APPEND TCNN_LIBRARIES cuda)
-list(APPEND TCNN_DEFINITIONS -DTCNN_MIN_GPU_ARCH=${MIN_GPU_ARCH})
+list(APPEND TCNN_INCLUDES "include")
+
+if (TCNN_HAS_PARENT)
+    set(TCNN_DEFINITIONS ${TCNN_DEFINITIONS} PARENT_SCOPE)
+endif()
+
+# Only compile the shampoo optimizer if
+# a recent enough cuBLAS version is available.
 if (TCNN_ALLOW_CUBLAS_CUSOLVER AND CUDA_VERSION VERSION_GREATER_EQUAL 11.0)
-    # Only compile the shampoo optimizer if
-    # a new enough cuBLAS version is available.
-    list(APPEND TCNN_LIBRARIES cublas)
+    set(TCNN_BUILD_WITH_SHAMPOO ON)
+else()
+    set(TCNN_BUILD_WITH_SHAMPOO OFF)
+endif()
+
+if (TCNN_BUILD_WITH_SHAMPOO)
     list(APPEND TCNN_DEFINITIONS -DTCNN_SHAMPOO)
 endif()

-if (TCNN_HAS_PARENT)
-    set(TCNN_DEFINITIONS ${TCNN_DEFINITIONS} PARENT_SCOPE)
+if (TCNN_BUILD_WITH_RTC)
+    list(APPEND TCNN_DEFINITIONS -DTCNN_RTC)
+endif()
+
+if (TCNN_BUILD_USE_FAST_MATH)
+    list(APPEND TCNN_DEFINITIONS -DTCNN_RTC_USE_FAST_MATH)
+endif()
+
+if (TCNN_BUILD_NO_FWD_BWD)
+    list(APPEND TCNN_DEFINITIONS -DTCNN_NO_FWD_BWD)
+endif()
+
+if (TCNN_LINK_CUDA)
+    list(APPEND TCNN_LIBRARIES cuda)
+    if (TCNN_BUILD_WITH_SHAMPOO)
+        list(APPEND TCNN_LIBRARIES cublas)
+    endif()
+    if (TCNN_BUILD_WITH_RTC)
+        list(APPEND TCNN_LIBRARIES nvrtc)
+    endif()
 endif()

 if (MSVC)
@@ -224,6 +264,9 @@ else()
     list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-fno-strict-aliasing")
     list(APPEND CUDA_NVCC_FLAGS "-Xcudafe=--diag_suppress=unrecognized_gcc_pragma")
 endif()
+if (TCNN_BUILD_USE_FAST_MATH)
+    list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+endif()
 list(APPEND CUDA_NVCC_FLAGS "--extended-lambda")
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")

@@ -237,7 +280,14 @@ if (NOT MSVC)
 endif()

 set(BUILD_SHARED_LIBS OFF)
-add_subdirectory("dependencies/fmt")
+
+if (TCNN_EXTERNAL_FMT)
+    list(APPEND TCNN_LIBRARIES "${TCNN_EXTERNAL_FMT}")
+else()
+    add_subdirectory("dependencies/fmt")
+    list(APPEND TCNN_LIBRARIES fmt)
+    list(APPEND TCNN_INCLUDES "dependencies/fmt/include")
+endif()

 ###############################################################################
 # tiny-cuda-nn library, samples, and benchmarks
@@ -258,31 +308,79 @@ set(TCNN_SOURCES
     src/object.cu
     src/optimizer.cu
     src/reduce_sum.cu
+    src/rtc_kernel.cu
 )

 if (MIN_GPU_ARCH GREATER 70)
     list(APPEND TCNN_SOURCES src/fully_fused_mlp.cu)
 endif()

+list(APPEND TCNN_DEFINITIONS -DTCNN_MIN_GPU_ARCH=${MIN_GPU_ARCH})
+
 ###############################################################################
 # Linker / library
 ###############################################################################

-add_library(tiny-cuda-nn STATIC ${TCNN_SOURCES})
-target_compile_definitions(tiny-cuda-nn PUBLIC ${TCNN_DEFINITIONS})
-target_compile_options(tiny-cuda-nn PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:${CUDA_NVCC_FLAGS}>)
-target_include_directories(tiny-cuda-nn PUBLIC
+include("${CMAKE_CURRENT_SOURCE_DIR}/dependencies/cmrc/CMakeRC.cmake")
+cmrc_add_resource_library(tiny-cuda-nn-resources NAMESPACE tcnn)
+list(APPEND TCNN_DEFINITIONS -DTCNN_CMRC)
+list(APPEND TCNN_LIBRARIES tiny-cuda-nn-resources)
+
+if (TCNN_BUILD_WITH_RTC)
+    # Fetch CUDA headers and folders that will be required by the runtime compiler
+    # and include those headers with the compiled binary of tcnn.
+    foreach (CUDA_INCLUDE_CANDIDATE IN LISTS CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES)
+        if (EXISTS "${CUDA_INCLUDE_CANDIDATE}/cuda_fp16.h")
+            set(CUDA_INCLUDE "${CUDA_INCLUDE_CANDIDATE}")
+            break()
+        endif()
+    endforeach(CUDA_INCLUDE_CANDIDATE)
+
+    if (NOT CUDA_INCLUDE)
+        # If the CUDA include dir couldn't be found via CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES,
+        # try a relative path w.r.t. the CUDA compiler binary as a last-ditch effort.
+        get_filename_component(CUDA_COMPILER_BIN "${CMAKE_CUDA_COMPILER}" DIRECTORY)
+        get_filename_component(CUDA_DIR "${CUDA_COMPILER_BIN}" DIRECTORY)
+        set(CUDA_INCLUDE "${CUDA_DIR}/include")
+    endif()
+
+    file(GLOB CUDA_HEADERS "${CUDA_INCLUDE}/cuda_fp16*" "${CUDA_INCLUDE}/vector*")
+    if (NOT CUDA_HEADERS)
+        message(WARNING "FP16 headers could not be found. JIT compilation will likely fail.")
+    endif()
+
+    file(GLOB_RECURSE TCNN_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/tiny-cuda-nn/*")
+    file(GLOB PCG32_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/dependencies/pcg32/*")
+
+    cmrc_add_resources(tiny-cuda-nn-resources WHENCE "${CUDA_INCLUDE}" ${CUDA_HEADERS})
+    cmrc_add_resources(tiny-cuda-nn-resources WHENCE "${CMAKE_CURRENT_SOURCE_DIR}/include" ${TCNN_HEADERS})
+    cmrc_add_resources(tiny-cuda-nn-resources WHENCE "${CMAKE_CURRENT_SOURCE_DIR}/dependencies" ${PCG32_HEADERS})
+endif()
+
+list(APPEND TCNN_INCLUDES
     "include"
     "dependencies"
     "dependencies/cutlass/include"
     "dependencies/cutlass/tools/util/include"
-    "dependencies/fmt/include"
 )
-target_link_libraries(tiny-cuda-nn PUBLIC ${CUDA_LIBRARIES} ${TCNN_LIBRARIES} fmt)
+
+add_library(tiny-cuda-nn STATIC ${TCNN_SOURCES})
+target_compile_definitions(tiny-cuda-nn PUBLIC ${TCNN_DEFINITIONS})
+target_compile_options(tiny-cuda-nn PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:${CUDA_NVCC_FLAGS}>)
+target_include_directories(tiny-cuda-nn PUBLIC ${TCNN_INCLUDES})
+target_link_libraries(tiny-cuda-nn PUBLIC ${TCNN_LIBRARIES})

 if (TCNN_BUILD_EXAMPLES)
     add_subdirectory("samples")
 endif()
+
 if (TCNN_BUILD_BENCHMARK)
     add_subdirectory("benchmarks/image")
+    add_subdirectory("benchmarks/mlp")
+endif()
+
+if (TCNN_BUILD_TESTS)
+    enable_testing()
+    add_subdirectory(tests)
+    list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure")
 endif()
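
The CMRC resource library configured above embeds the CUDA, tiny-cuda-nn, and pcg32 headers into the compiled binary so the NVRTC-based runtime compiler can find them without relying on a CUDA toolkit install at runtime. As a rough illustration of the embedding mechanism only (a sketch of the vector-of-bool/cmrc API under the `NAMESPACE tcnn` chosen above, not code from this commit), an embedded header can be read back like so:

```cpp
#include <cmrc/cmrc.hpp>

#include <cstdio>
#include <string>

CMRC_DECLARE(tcnn); // matches `cmrc_add_resource_library(... NAMESPACE tcnn)` above

int main() {
    // Resources are addressed relative to the WHENCE directories passed to
    // `cmrc_add_resources`, e.g. "cuda_fp16.h" from the CUDA include dir.
    auto fs = cmrc::tcnn::get_filesystem();
    auto file = fs.open("cuda_fp16.h");
    std::string contents{file.begin(), file.end()};
    std::printf("Embedded cuda_fp16.h: %zu bytes\n", contents.size());
    return 0;
}
```

Whether `src/rtc_kernel.cu` reads the embedded headers exactly this way is not shown in this excerpt; the snippet only demonstrates the mechanism that the CMakeLists change sets up.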

LICENSE.txt

Lines changed: 21 additions & 21 deletions
@@ -1,21 +1,21 @@
(Whitespace-only change: all 21 lines of the BSD 3-Clause license text, beginning "Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.", are removed and re-added with identical visible content.)

README.md

Lines changed: 9 additions & 1 deletion
@@ -44,6 +44,7 @@ nlohmann::json config = {
 using namespace tcnn;

 auto model = create_from_config(n_input_dims, n_output_dims, config);
+model->set_jit_fusion(supports_jit_fusion()); // Optional: accelerate with JIT fusion

 // Train the model (batch_size must be a multiple of tcnn::BATCH_SIZE_GRANULARITY)
 GPUMatrix<float> training_batch_inputs(n_input_dims, batch_size);
@@ -65,6 +66,11 @@ GPUMatrix<float> inference_outputs(n_output_dims, batch_size);
 model.network->inference(inference_inputs, inference_outputs);
 ```

+**Important**: enabling JIT fusion is a new, optional feature with tiny-cuda-nn v2.0 and later.
+It is recommended to *always* enable it for a performance boost of 1.5x to 2.5x, depending on the model and GPU.
+Newer GPUs exhibit larger speedups.
+Please [open an issue](https://github.com/NVlabs/tiny-cuda-nn/issues) if you encounter a slowdown or other problems with JIT fusion enabled.
+

 ## Example: learning a 2D image

@@ -161,6 +167,8 @@ model = tcnn.NetworkWithInputEncoding(
 encoding = tcnn.Encoding(n_input_dims, config["encoding"])
 network = tcnn.Network(encoding.n_output_dims, n_output_dims, config["network"])
 model = torch.nn.Sequential(encoding, network)
+
+model.jit_fusion = tcnn.supports_jit_fusion() # Optional: accelerate with JIT fusion
 ```

 See `samples/mlp_learning_an_image_pytorch.py` for an example.
@@ -225,7 +233,7 @@ If you use it in your research, we would appreciate a citation via
     month = {4},
     title = {{tiny-cuda-nn}},
     url = {https://github.com/NVlabs/tiny-cuda-nn},
-    version = {1.7},
+    version = {2.0},
     year = {2021}
 }
 ```

benchmarks/image/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are permitted
 # provided that the following conditions are met:

benchmarks/image/bench_ours.cu

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
