Hello,
I am trying to run the following code:
__global__ void testKernel( float* g_idata, int data_size, unsigned int block_size) { // access thread id const unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; // access number of threads in this block //const unsigned int num_threads = blockDim.x; const unsigned int num_threads = block_size; int slices = data_size / num_threads; int js; __shared__ float shared_data[2048]; for(int i = 0; i < 2048; i++){ shared_data[i] = g_idata[i]; } __syncthreads(); for(int i = 0; i < slices; i++){ js = (tid*slices) + i; for(int j = 0; j < 200000; j++){ //g_idata[js] = g_idata[js] + 2; //g_idata[js] = g_idata[js] - 1; shared_data[js] = shared_data[js] + 2; shared_data[js] = shared_data[js] - 1; } } for(int i = 0; i < 2048; i++){ g_idata[i] = shared_data[i]; } } My previous code using g_idata worked fine but now with shared memory only about the last half gets set to the right numbers. What am I doing wrong?