Shared memory problem

MaxFX · March 31, 2009, 5:43am

I have some problem with shared memory in my program.
After for cycle values in shared memory are different every time.
If I use only one block in grid - result is correct.
If I use more then one block - result is invalid.
Shared memory variable written in global space.

navier-stokes · March 31, 2009, 6:32am

Hi,
can you give some more details about your problem? Perhaps a code snipped?

MaxFX · March 31, 2009, 6:50am

I delete some unused code from listing.

I detect problem in process() function (when for cycles is work).

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime_api.h>

#define CUDA_CHECK_ERROR(err) \

if (err != cudaSuccess) { \ printf("Error in file: %s, line: %s\n", __FILE__, __LINE__); \ printf(cudaGetErrorString(err));	\ printf("\n"); \ } \

constant unsigned int md5constant[64] =

{

0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391

};

shared unsigned int partialHash[4];

shared unsigned int waitingData[16];

shared unsigned int bitTotal[2];

shared unsigned char hashSum[16];

shared unsigned int msg[32];

//

global void get_md5_constant(unsigned int* nums);

device unsigned int shiftLeft(unsigned int val, unsigned int step);

global void get_md5_constant(unsigned int* nums)

{

nums[threadIdx.x] = md5constant[threadIdx.x];

};

device void reset()

{

for (int i = 0; i < 16; i++) {	waitingData[i] = 0; } __syncthreads(); for (int i = 0; i < 2; i++) {	bitTotal[i] = 0; } __syncthreads(); partialHash[0] = 0x67452301; partialHash[1] = 0xefcdab89; __syncthreads(); partialHash[2] = ~partialHash[0]; partialHash[3] = ~partialHash[1]; __syncthreads();

};

device void process()

{

unsigned int a = partialHash[0]; unsigned int b = partialHash[1]; unsigned int c = partialHash[2]; unsigned int d = partialHash[3]; __syncthreads(); unsigned int dataValue = 0; for (int i = 0; i < 16; i++) {	dataValue = waitingData[i];	waitingData[i] = 0;	msg[i] = msg[i + 16] = dataValue; } __syncthreads(); int index = 0; for (int i = -16; i != 0; i += 4) {	a += msg[i + 16] + md5constant[index] + (d ^ (b & (c ^ d)));	a = b + shiftLeft(a, 7);	index++;	d += msg[i + 17] + md5constant[index] + (c ^ (a & (b ^ c)));	d = a + shiftLeft(d, 12);	index++;	c += msg[i + 18] + md5constant[index] + (b ^ (d & (a ^ B)));	c = d + shiftLeft(c, 17);	index++;	b += msg[i + 19] + md5constant[index] + (a ^ (c & (d ^ a)));	b = c + shiftLeft(b, 22);	index++; } __syncthreads(); for (int i = -16; i != 0; i += 4) {	a += msg[i + 17] + md5constant[index] + (c ^ (d & (b ^ c)));	a = b + shiftLeft(a, 5);	index++;	d += msg[i + 22] + md5constant[index] + (b ^ (c & (a ^ B)));	d = a + shiftLeft(d, 9);	index++;	c += msg[i + 27] + md5constant[index] + (a ^ (b & (d ^ a)));	c = d + shiftLeft(c, 14);	index++;	b += msg[i + 16] + md5constant[index] + (d ^ (a & (c ^ d)));	b = c + shiftLeft(b, 20);	index++; } __syncthreads(); for (int i = 16; i != 0; i -= 4) {	a += msg[i + 5] + md5constant[index] + ((b ^ c) ^ d);	a = b + shiftLeft(a, 4);	index++;	d += msg[i + 8] + md5constant[index] + (a ^ (b ^ c));	d = a + shiftLeft(d, 11);	index++;	c += msg[i + 11] + md5constant[index] + ((d ^ a) ^ B);	c = d + shiftLeft(c, 16);	index++;	b += msg[i + 14] + md5constant[index] + (c ^ (d ^ a));	b = c + shiftLeft(b, 23);	index++; } __syncthreads(); for (int i = 16; i != 0; i -= 4) {	a += msg[i] + md5constant[index] + (c ^ (~d | B));	a = b + shiftLeft(a, 6);	index++;	d += msg[i + 7] + md5constant[index] + (b ^ (~c | a));	d = a + shiftLeft(d, 10);	index++;	c += msg[i + 14] + md5constant[index] + (a ^ (~b | d));	c = d + shiftLeft(c, 15);	index++;	b += msg[i + 5] + md5constant[index] + (d ^ (~a | c));	b = c + shiftLeft(b, 21);	index++; } __syncthreads(); partialHash[0] = partialHash[0] + a; partialHash[1] = partialHash[1] + b; partialHash[2] = partialHash[2] + c; partialHash[3] = partialHash[3] + d;

};

device void final(unsigned char* B)

{

unsigned bit0 = bitTotal[0]; unsigned bit1 = bitTotal[1]; unsigned occupied = bit0 & 511; waitingData[occupied / 32] = 0x80 << (int)(occupied & 31); occupied += 8; for (int i = 0; i < 4; i++) {	unsigned int hashElement = partialHash[i];	hashSum[4 * i]	= (unsigned char)((hashElement) & 255);	hashSum[4 * i + 1]	= (unsigned char)((hashElement >> 8) & 255);	hashSum[4 * i + 2]	= (unsigned char)((hashElement >> 16) & 255);	hashSum[4 * i + 3]	= (unsigned char)((hashElement >> 24) & 255); } __syncthreads(); for (int i = 0; i < 16; i++) {	b[i] = hashSum[i]; } __syncthreads();

};

device unsigned int shiftLeft(unsigned int val, unsigned int step)

{

unsigned int overflowBits = val >> (32 - step); val <<= step; val |= overflowBits; return val;

};

global void computeMD5Gpu(unsigned char* bytes)

{

reset(); process(); final(bytes);

};

host void ts()

{

unsigned char hash[17]; memset(hash, 0, sizeof(hash)); unsigned char* devHash; cudaMalloc((void**)&devHash, sizeof(hash)); dim3 grid = dim3(1, 1, 1); dim3 block = dim3(16, 16, 1); cudaEvent_t syncEvent; cudaEventCreate(&syncEvent); computeMD5Gpu<<<grid, block>>>(devHash); cudaEventRecord(syncEvent, 0); cudaEventSynchronize(syncEvent); cudaMemcpy(hash, devHash, sizeof(hash), cudaMemcpyDeviceToHost); cudaFree(devHash); cudaEventDestroy(syncEvent); for (int i = 0; i < 16; i++) {	printf("%i\n",hash[i]); } printf("\n");

}

host int main()

{

ts(); return 0;

}

[/codebox]

Jamie_K · March 31, 2009, 11:46pm

All your threads are doing the same thing. They will interfere with each other when they refer to shared memory or global memory.

MaxFX · April 1, 2009, 7:33am

Thanks

Topic		Replies	Views
shared memory problem usage in variables CUDA Programming and Performance	8	2508	September 22, 2010
Shared Memory Problem memory shared only within blocks? CUDA Programming and Performance	4	5986	February 8, 2008
Problems doing shared memory test CUDA Programming and Performance	1	1546	February 17, 2008
Shared memory coherance problem within a thread CUDA Programming and Performance	3	795	November 17, 2010
Shared memory vs global memory CUDA Programming and Performance	6	3496	April 30, 2007
help getting shared memory working CUDA Programming and Performance	11	4369	June 12, 2007
Shared memory problem CUDA Programming and Performance	10	4034	April 20, 2010
problem with shared mamery CUDA Programming and Performance	4	3225	May 11, 2009
Shared memory and running time Results not reproducible CUDA Programming and Performance	10	1811	August 24, 2009
shared memory issue CUDA Programming and Performance	2	2996	February 17, 2010

Shared memory problem

Related topics