My use of shared memory seems to be causing a problem in a program I am writing. It works in deviceemulation mode, but not without. On my linux machine (CUDA 2.0, GTX280) the program crashes X. On my windows machine (CUDA 2.0, GTS8800512) I don’t get a crash, but I do get a memory transfer error copying from the device to the host. On a laptop (CUDA 2.1, 9400M) the program runs fine.
Instead of copying the actual program I’ve produced a small test program that only has the neccesary parts. If inside the kernel I comment out the part reading into shared memory then the program runs fine. Otherwise, we encounter the errors mentioned above.
If anyone has any idea what is causing the crashes I would be grateful. I note it seems to work under CUDA 2.1, so if this has been a bug fix then does anyone know what the bug was in the first place?
#include <stdio.h> #include <cuda.h> struct myStruct { int a; int4 b; float3 c; float3 d; float e; float3 f; float3 g; float3 h; float3 i; float j; int k; }; __global__ void test(myStruct *data) { extern __shared__ myStruct s_data[]; int tid = threadIdx.x; int i = blockIdx.x * blockDim.x + tid; if (i < 300) { s_data[tid] = data[i]; } __syncthreads(); } void algorithm() { size_t size = sizeof(myStruct); myStruct* h_data; h_data = (myStruct *) malloc(300 * size); myStruct bob; bob.a = 1; bob.b = make_int4(1,1,1,1); bob.c = make_float3(1,1,1); bob.d = make_float3(1,1,1); bob.e = 1; bob.f = make_float3(1,1,1); bob.g = make_float3(1,1,1); bob.h = make_float3(1,1,1); bob.i = make_float3(1,1,1); bob.j = 1; bob.k = 1; for (int i = 0; i < 300; i++) h_data[i] = bob; myStruct* d_data; if (cudaMalloc((void **) &d_data, 300 * size) != cudaSuccess) { printf("Error allocating memory on device!\n"); exit(1); } if (cudaMemcpy(d_data, h_data, 300 * size, cudaMemcpyHostToDevice) != cudaSuccess) { printf("Error copying host->device"); exit(1); } test <<< 5, 64, 64*size >>> (d_data); if (cudaMemcpy(h_data, d_data, 300 * size, cudaMemcpyDeviceToHost) != cudaSuccess) { printf("Error copying device->host"); exit(1); }; } int main(int argc, char **argv) { printf("Calling algorithm...\n"); algorithm(); printf("Done!"); return 0; }