Conversation

@mgorny (Member) commented Dec 21, 2025

Reverts #172249. The changes introduced a unittest that does not compile, per #172249 (comment).

@llvmbot (Member) commented Dec 21, 2025

@llvm/pr-subscribers-offload

Author: Michał Górny (mgorny)

Changes

Reverts llvm/llvm-project#172249. The changes introduced a unittest that does not compile, per #172249 (comment).


Full diff: https://github.com/llvm/llvm-project/pull/173199.diff

7 Files Affected:

  • (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp (-1)
  • (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h (-1)
  • (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+2-31)
  • (modified) offload/test/offloading/CUDA/basic_launch_multi_arg.cu (+3)
  • (modified) offload/unittests/OffloadAPI/device_code/CMakeLists.txt (-2)
  • (removed) offload/unittests/OffloadAPI/device_code/multiargs.cpp (-3)
  • (modified) offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp (-14)
```diff
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index f630e8d850706..e7a1ca38b3c13 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -81,7 +81,6 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
 DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
-DLWRAP(cuFuncGetParamInfo, 4)
 
 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 7e42c66dddabb..a470d6df1079d 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -390,6 +390,5 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        CUmemAllocationGranularity_flags option);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
                                           CUoccupancyB2DSize, size_t, int);
-CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
 
 #endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 3c41694bf9dc4..a27c6f3de0cd3 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -149,8 +149,7 @@ struct CUDAKernelTy : public GenericKernelTy {
     // The maximum number of threads cannot exceed the maximum of the kernel.
     MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
 
-    // Retrieve the size of the arguments.
-    return initArgsSize();
+    return Plugin::success();
   }
 
   /// Launch the CUDA kernel function.
@@ -174,32 +173,11 @@ struct CUDAKernelTy : public GenericKernelTy {
   }
 
 private:
-  /// Initialize the size of the arguments.
-  Error initArgsSize() {
-    CUresult Res;
-    size_t ArgOffset, ArgSize;
-    size_t Arg = 0;
-
-    ArgsSize = 0;
-
-    // Find the last argument to know the total size of the arguments.
-    while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
-           CUDA_SUCCESS)
-      ArgsSize = ArgOffset + ArgSize;
-
-    if (Res != CUDA_ERROR_INVALID_VALUE)
-      return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
-    return Plugin::success();
-  }
-
   /// The CUDA kernel function to execute.
   CUfunction Func;
 
   /// The maximum amount of dynamic shared memory per thread group. By default,
   /// this is set to 48 KB.
   mutable uint32_t MaxDynCGroupMemLimit = 49152;
-
-  /// The size of the kernel arguments.
-  size_t ArgsSize;
 };
 
 /// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1452,12 +1430,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                AsyncInfoWrapperTy &AsyncInfoWrapper) const {
   CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
 
-  // The args size passed in LaunchParams may have tail padding, which is not
-  // accepted by the CUDA driver.
-  if (ArgsSize > LaunchParams.Size)
-    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                         "mismatch in kernel arguments");
-
   CUstream Stream;
   if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
@@ -1465,10 +1437,9 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   uint32_t MaxDynCGroupMem =
       std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
 
-  size_t ConfigArgsSize = ArgsSize;
   void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,
-                    reinterpret_cast<void *>(&ConfigArgsSize),
+                    reinterpret_cast<void *>(&LaunchParams.Size),
                     CU_LAUNCH_PARAM_END};
 
   // If we are running an RPC server we want to wake up the server thread
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index 4e0f3a41a7a0c..7a32983f51f7c 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -6,6 +6,9 @@
 // clang-format on
 
 // REQUIRES: gpu
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: gpu
 
 #include <stdio.h>
 
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 22ebacf62e83e..1a042e1b38315 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,7 +2,6 @@ add_offload_test_device_code(foo.cpp foo)
 add_offload_test_device_code(bar.cpp bar)
 # Compile with optimizations to eliminate AMDGPU implicit arguments.
 add_offload_test_device_code(noargs.cpp noargs -O3)
-add_offload_test_device_code(multiargs.cpp multiargs -O3)
 add_offload_test_device_code(byte.cpp byte)
 add_offload_test_device_code(localmem.cpp localmem)
 add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -16,7 +15,6 @@ add_custom_target(offload_device_binaries DEPENDS
   foo.bin
   bar.bin
   noargs.bin
-  multiargs.bin
   byte.bin
   localmem.bin
   localmem_reduction.bin
diff --git a/offload/unittests/OffloadAPI/device_code/multiargs.cpp b/offload/unittests/OffloadAPI/device_code/multiargs.cpp
deleted file mode 100644
index 265dad124e91e..0000000000000
--- a/offload/unittests/OffloadAPI/device_code/multiargs.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <gpuintrin.h>
-
-extern "C" __gpu_kernel void multiargs(char, int *, short) { (void)0; }
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 0845b9a1afdb7..c9eca36a4d447 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -55,7 +55,6 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
 
 KERNEL_TEST(Foo, foo)
 KERNEL_TEST(NoArgs, noargs)
-KERNEL_TEST(MultiArgs, multiargs)
 KERNEL_TEST(Byte, byte)
 KERNEL_TEST(LocalMem, localmem)
 KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -136,19 +135,6 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
   ASSERT_SUCCESS(olSyncQueue(Queue));
 }
 
-TEST_P(olLaunchKernelMultiTest, Success) {
-  struct {
-    char A;
-    int *B;
-    short C;
-  } Args{0, nullptr, 0};
-
-  ASSERT_SUCCESS(
-      olLaunchKernel(Queue, Device, Kernel, Args, sizeof(Args), &LaunchArgs));
-
-  ASSERT_SUCCESS(olSyncQueue(Queue));
-}
-
 TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
```
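For reference, the reverted plugin code sized the packed kernel argument buffer by querying `cuFuncGetParamInfo` for successive parameter indices until the driver returns `CUDA_ERROR_INVALID_VALUE`. Below is a minimal standalone sketch of that technique against the CUDA driver API; the helper name and error handling are illustrative only, and it assumes a `CUfunction` already obtained from a loaded module and a driver recent enough to expose `cuFuncGetParamInfo`.

```cpp
#include <cuda.h>

#include <cstdio>

// Sketch (not the plugin's code): walk the kernel's parameters with
// cuFuncGetParamInfo until the driver reports CUDA_ERROR_INVALID_VALUE,
// i.e. the index is past the last parameter. The packed size is the last
// parameter's offset plus its size.
static size_t packedArgsSize(CUfunction Func) {
  size_t Offset = 0, Size = 0, Total = 0;
  CUresult Res;
  for (size_t Idx = 0;
       (Res = cuFuncGetParamInfo(Func, Idx, &Offset, &Size)) == CUDA_SUCCESS;
       ++Idx)
    Total = Offset + Size; // Updated on every iteration; final value wins.
  if (Res != CUDA_ERROR_INVALID_VALUE)
    std::fprintf(stderr, "cuFuncGetParamInfo failed with CUresult %d\n",
                 static_cast<int>(Res));
  return Total;
}
```

The value computed this way is what the reverted code passed as `CU_LAUNCH_PARAM_BUFFER_SIZE` in the `cuLaunchKernel` extra-parameters array; with the revert, the plugin goes back to passing `LaunchParams.Size` directly.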
@kevinsala (Contributor) commented

@mgorny, it should be fixed by #173203. No need to revert.
