I'm currently writing code for a system that has multiple processes, some of them CPU- and GPU-intensive. I noticed that the performance I achieve for CUDA kernel functions at runtime is noticeably worse (tens of percent slower) than the results when I simply run those functions by themselves. One of the causes I found is that calls to functions such as "std::this_thread::sleep_for" and "condition_variable::wait" cause subsequent runs of CUDA kernel functions to be much slower.
I created a code snippet for example:
CMakeLists.txt
cmake_minimum_required(VERSION 3.22)
project(CudaSlow)

# nvcc location; must be set before the CUDA language is enabled.
# (Prefer passing -DCMAKE_CUDA_COMPILER=... on the command line.)
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
enable_language(CUDA)

find_package(Threads REQUIRED)
# The FindCUDA module (FIND_PACKAGE(CUDA ...), ${CUDA_LIBRARIES},
# ${CUDA_INCLUDE_DIRS}) is deprecated since CMake 3.10. With
# enable_language(CUDA), use the CUDAToolkit package and its
# imported targets instead.
find_package(CUDAToolkit 10.2 REQUIRED)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Keep host (C++) and device (CUDA) language standards in sync.
set(CMAKE_CUDA_STANDARD 17)

add_executable(CudaSlow main.cu)
# nvcc already provides the CUDA runtime include paths for .cu files;
# CUDA::cudart links the CUDA runtime library.
target_link_libraries(CudaSlow PUBLIC
        Threads::Threads
        CUDA::cudart
        )
main.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <chrono>
#include <thread>

// Abort with file/line and the CUDA error string if a runtime call fails.
// Every CUDA API call returns cudaError_t; silently ignoring it hides
// failures (e.g. a bad launch config) until much later.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,        \
                         __LINE__, cudaGetErrorString(err_));               \
            std::abort();                                                   \
        }                                                                   \
    } while (0)

// Element-wise product C[j] = A[j] * B[j].
// Intended for a single-block launch: each thread strides by blockDim.x
// over [threadIdx.x, N), so <<<1, T>>> covers all N elements.
__global__ void Func(float* A, float* B, float* C, int N) {
    for (int j = threadIdx.x; j < N; j += blockDim.x) {
        C[j] = A[j] * B[j];
    }
}

int main() {
    // Example function variables. The device buffers are deliberately left
    // uninitialized — only launch/sync latency is being measured, not the
    // numeric results.
    const int N = 100000;
    float* A;
    float* B;
    float* C;
    CUDA_CHECK(cudaMalloc(&A, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&B, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&C, N * sizeof(float)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    const int iterations = 2000;

    // Runs `iterations` launches and returns the average wall time per
    // launch + stream-synchronize, in seconds. When `sleepBetween` is true,
    // sleeps 1 ms after each synchronized iteration — outside the timed
    // region — to demonstrate the slowdown the sleep induces on later runs.
    // steady_clock is used because it is monotonic; system_clock can jump
    // (NTP adjustments) and is unsuitable for interval timing.
    auto averageSeconds = [&](bool sleepBetween) {
        double totalTime = 0;
        for (int i = 0; i < iterations; ++i) {
            auto start = std::chrono::steady_clock::now();
            Func<<<1, 512, 0, stream>>>(A, B, C, N);
            CUDA_CHECK(cudaGetLastError());  // catch launch-config errors
            CUDA_CHECK(cudaStreamSynchronize(stream));
            totalTime += std::chrono::duration_cast<std::chrono::duration<double>>(
                    std::chrono::steady_clock::now() - start).count();
            if (sleepBetween) {
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
            }
        }
        return totalTime / static_cast<double>(iterations);
    };

    // Run multiple iterations without std::this_thread::sleep_for
    std::cout << "Average time without std::this_thread::sleep_for is "
              << averageSeconds(false) << std::endl;

    // Run multiple iterations with std::this_thread::sleep_for
    std::cout << "Average time with std::this_thread::sleep_for is "
              << averageSeconds(true) << std::endl;

    // Release device resources (the original leaked these).
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(A));
    CUDA_CHECK(cudaFree(B));
    CUDA_CHECK(cudaFree(C));
    return 0;
}
The output when I run this program on a GeForce RTX 3090 with Cuda 11.4:
Average time without std::this_thread::sleep_for is 7.91676e-05 Average time with std::this_thread::sleep_for is 0.000129487