# Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright notice, this list of
#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
#       to endorse or promote products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# --- excerpt resumes after: cmake_minimum_required(VERSION 3.18) ---

project(
	tiny-cuda-nn
	VERSION 2.0
	DESCRIPTION "Lightning fast & tiny C++/CUDA neural network framework"
	LANGUAGES CXX CUDA
)

# Build-configuration switches. All cache entries are TCNN_-prefixed because this
# file may be consumed via add_subdirectory() from a parent project and the cache
# is global.
option(TCNN_ALLOW_CUBLAS_CUSOLVER "Allows tiny-cuda-nn to use cuBLAS and cuSolver. Only required for the Shampoo optimizer." OFF)
option(TCNN_BUILD_BENCHMARK "Build tiny-cuda-nn example benchmark?" ON)
option(TCNN_BUILD_EXAMPLES "Build tiny-cuda-nn example applications?" ON)
option(TCNN_BUILD_NO_FWD_BWD "Build without offline compiled forward and backward kernels?" OFF)
option(TCNN_BUILD_TESTS "Build tiny-cuda-nn's tests?" OFF)
option(TCNN_BUILD_WITH_RTC "Build support for runtime compilation of fully fused kernels?" ON)
option(TCNN_BUILD_USE_FAST_MATH "Build tiny-cuda-nn with '--use_fast_math' option?" ON)

# Name of an externally supplied `fmt` target. When empty, the bundled copy in
# dependencies/fmt is built and linked instead.
set(TCNN_EXTERNAL_FMT "" CACHE STRING "If non-empty, the `fmt` target is supplied externally with the given name.")

set(TCNN_CUDA_ARCHITECTURES "" CACHE STRING "Build tiny-cuda-nn for a specific GPU architecture.")
option(TCNN_LINK_CUDA "Link tiny-cuda-nn to CUDA libraries?" ON)

###############################################################################
# Build type and C++ compiler setup
###############################################################################
# --- excerpt resumes inside: if (APPLE) ---

	set(CMAKE_MACOSX_RPATH ON)
endif()

# NOTE(review): presumably makes the compilers' implicit system include paths
# visible in the exported compile_commands.json for external tooling — confirm.
if (CMAKE_EXPORT_COMPILE_COMMANDS)
	set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES ${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES})
	set(CMAKE_CUDA_STANDARD_INCLUDE_DIRECTORIES ${CMAKE_CUDA_IMPLICIT_INCLUDE_DIRECTORIES})
endif()

if (MSVC)
	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS")
	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
# --- excerpt resumes inside: if (CUDA_VERSION VERSION_LESS 10.2) ---

	message(FATAL_ERROR "CUDA version too low. tiny-cuda-nn require CUDA 10.2 or higher.")
endif()

list(APPEND TCNN_INCLUDES "include")

# Propagate accumulated compile definitions to a parent project, if any.
if (TCNN_HAS_PARENT)
	set(TCNN_DEFINITIONS ${TCNN_DEFINITIONS} PARENT_SCOPE)
endif()

# Only compile the shampoo optimizer if
# a recent enough cuBLAS version is available.
if (TCNN_ALLOW_CUBLAS_CUSOLVER AND CUDA_VERSION VERSION_GREATER_EQUAL 11.0)
	set(TCNN_BUILD_WITH_SHAMPOO ON)
else()
	set(TCNN_BUILD_WITH_SHAMPOO OFF)
endif()

if (TCNN_BUILD_WITH_SHAMPOO)
	list(APPEND TCNN_DEFINITIONS -DTCNN_SHAMPOO)
endif()

if (TCNN_BUILD_WITH_RTC)
	list(APPEND TCNN_DEFINITIONS -DTCNN_RTC)
endif()

if (TCNN_BUILD_USE_FAST_MATH)
	list(APPEND TCNN_DEFINITIONS -DTCNN_RTC_USE_FAST_MATH)
endif()

if (TCNN_BUILD_NO_FWD_BWD)
	list(APPEND TCNN_DEFINITIONS -DTCNN_NO_FWD_BWD)
endif()

# Link against CUDA libraries unless the consumer opts out (TCNN_LINK_CUDA=OFF).
if (TCNN_LINK_CUDA)
	list(APPEND TCNN_LIBRARIES cuda)
	if (TCNN_BUILD_WITH_SHAMPOO)
		list(APPEND TCNN_LIBRARIES cublas)
	endif()
	if (TCNN_BUILD_WITH_RTC)
		# nvrtc is needed for runtime compilation of fully fused kernels.
		list(APPEND TCNN_LIBRARIES nvrtc)
	endif()
	# NOTE(review): this closing endif() was missing from the rendered diff;
	# restored so the if (TCNN_LINK_CUDA) block is balanced.
endif()
if (MSVC)
	# [...] MSVC-specific NVCC flag lines elided from this diff excerpt [...]
else()
	list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-fno-strict-aliasing")
	list(APPEND CUDA_NVCC_FLAGS "-Xcudafe=--diag_suppress=unrecognized_gcc_pragma")
endif()
# Fast-math is applied to the offline-compiled kernels here; the RTC path reads
# the -DTCNN_RTC_USE_FAST_MATH definition set earlier instead.
if (TCNN_BUILD_USE_FAST_MATH)
	list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
endif()
list(APPEND CUDA_NVCC_FLAGS "--extended-lambda")
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
# --- excerpt resumes inside: if (NOT MSVC) ---

endif()

# Bundled dependencies are built as static libraries.
set(BUILD_SHARED_LIBS OFF)

# fmt: either consume an externally provided target (named by TCNN_EXTERNAL_FMT)
# or fall back to the bundled copy under dependencies/fmt.
# NOTE(review): the rendered diff showed "${TCNN_EXTERNAL_FMT} " with a trailing
# space inside the quotes — a scraping artifact; removed, as it would corrupt
# the linked target name.
if (TCNN_EXTERNAL_FMT)
	list(APPEND TCNN_LIBRARIES "${TCNN_EXTERNAL_FMT}")
else()
	add_subdirectory("dependencies/fmt")
	list(APPEND TCNN_LIBRARIES fmt)
	list(APPEND TCNN_INCLUDES "dependencies/fmt/include")
endif()

###############################################################################
# tiny-cuda-nn library, samples, and benchmarks
###############################################################################
# --- excerpt resumes inside: set(TCNN_SOURCES — earlier source entries elided ---
	src/object.cu
	src/optimizer.cu
	src/reduce_sum.cu
	src/rtc_kernel.cu
)

if (MIN_GPU_ARCH GREATER 70)
	list(APPEND TCNN_SOURCES src/fully_fused_mlp.cu)
endif()

list(APPEND TCNN_DEFINITIONS -DTCNN_MIN_GPU_ARCH=${MIN_GPU_ARCH})

###############################################################################
# Linker / library
###############################################################################

# Embed resources (headers needed at runtime) into the binary via CMakeRC.
include("${CMAKE_CURRENT_SOURCE_DIR}/dependencies/cmrc/CMakeRC.cmake")
cmrc_add_resource_library(tiny-cuda-nn-resources NAMESPACE tcnn)
list(APPEND TCNN_DEFINITIONS -DTCNN_CMRC)
list(APPEND TCNN_LIBRARIES tiny-cuda-nn-resources)

if (TCNN_BUILD_WITH_RTC)
	# Fetch CUDA headers and folders that will be required by the runtime compiler
	# and include those headers with the compiled binary of tcnn.
	foreach (CUDA_INCLUDE_CANDIDATE IN LISTS CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES)
		if (EXISTS "${CUDA_INCLUDE_CANDIDATE}/cuda_fp16.h")
			set(CUDA_INCLUDE "${CUDA_INCLUDE_CANDIDATE}")
			break()
		endif()
	endforeach()

	if (NOT CUDA_INCLUDE)
		# If the CUDA include dir couldn't be found via CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES,
		# try a relative path w.r.t. the CUDA compiler binary as a last-ditch effort.
		get_filename_component(CUDA_COMPILER_BIN "${CMAKE_CUDA_COMPILER}" DIRECTORY)
		get_filename_component(CUDA_DIR "${CUDA_COMPILER_BIN}" DIRECTORY)
		set(CUDA_INCLUDE "${CUDA_DIR}/include")
	endif()

	file(GLOB CUDA_HEADERS "${CUDA_INCLUDE}/cuda_fp16*" "${CUDA_INCLUDE}/vector*")
	if (NOT CUDA_HEADERS)
		message(WARNING "FP16 headers could not be found. JIT compilation will likely fail.")
	endif()

	file(GLOB_RECURSE TCNN_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/tiny-cuda-nn/*")
	file(GLOB PCG32_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/dependencies/pcg32/*")

	cmrc_add_resources(tiny-cuda-nn-resources WHENCE "${CUDA_INCLUDE}" ${CUDA_HEADERS})
	cmrc_add_resources(tiny-cuda-nn-resources WHENCE "${CMAKE_CURRENT_SOURCE_DIR}/include" ${TCNN_HEADERS})
	cmrc_add_resources(tiny-cuda-nn-resources WHENCE "${CMAKE_CURRENT_SOURCE_DIR}/dependencies" ${PCG32_HEADERS})
endif()

list(APPEND TCNN_INCLUDES
	"include"
	"dependencies"
	"dependencies/cutlass/include"
	"dependencies/cutlass/tools/util/include"
)

# The main static library target. Definitions, options, includes, and libraries
# are PUBLIC so that consumers linking tiny-cuda-nn inherit them.
add_library(tiny-cuda-nn STATIC ${TCNN_SOURCES})
target_compile_definitions(tiny-cuda-nn PUBLIC ${TCNN_DEFINITIONS})
target_compile_options(tiny-cuda-nn PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:${CUDA_NVCC_FLAGS}>)
target_include_directories(tiny-cuda-nn PUBLIC ${TCNN_INCLUDES})
target_link_libraries(tiny-cuda-nn PUBLIC ${TCNN_LIBRARIES})

if (TCNN_BUILD_EXAMPLES)
	add_subdirectory("samples")
endif()

if (TCNN_BUILD_BENCHMARK)
	add_subdirectory("benchmarks/image")
	add_subdirectory("benchmarks/mlp")
endif()

if (TCNN_BUILD_TESTS)
	enable_testing()
	add_subdirectory(tests)
	list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure")
endif()