I have some problem with shared memory in my program.
After for cycle values in shared memory are different every time.
If I use only one block in grid - result is correct.
If I use more then one block - result is invalid.
Shared memory variable written in global space.
Hi,
can you give some more details about your problem? Perhaps a code snipped?
I delete some unused code from listing.
I detect problem in process() function (when for cycles is work).
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
#define CUDA_CHECK_ERROR(err) \
if (err != cudaSuccess) { \ printf("Error in file: %s, line: %s\n", __FILE__, __LINE__); \ printf(cudaGetErrorString(err)); \ printf("\n"); \ } \ constant unsigned int md5constant[64] =
{
0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };
shared unsigned int partialHash[4];
shared unsigned int waitingData[16];
shared unsigned int bitTotal[2];
shared unsigned char hashSum[16];
shared unsigned int msg[32];
//
//
//
global void get_md5_constant(unsigned int* nums);
device unsigned int shiftLeft(unsigned int val, unsigned int step);
global void get_md5_constant(unsigned int* nums)
{
nums[threadIdx.x] = md5constant[threadIdx.x]; };
device void reset()
{
for (int i = 0; i < 16; i++) { waitingData[i] = 0; } __syncthreads(); for (int i = 0; i < 2; i++) { bitTotal[i] = 0; } __syncthreads(); partialHash[0] = 0x67452301; partialHash[1] = 0xefcdab89; __syncthreads(); partialHash[2] = ~partialHash[0]; partialHash[3] = ~partialHash[1]; __syncthreads(); };
device void process()
{
unsigned int a = partialHash[0]; unsigned int b = partialHash[1]; unsigned int c = partialHash[2]; unsigned int d = partialHash[3]; __syncthreads(); unsigned int dataValue = 0; for (int i = 0; i < 16; i++) { dataValue = waitingData[i]; waitingData[i] = 0; msg[i] = msg[i + 16] = dataValue; } __syncthreads(); int index = 0; for (int i = -16; i != 0; i += 4) { a += msg[i + 16] + md5constant[index] + (d ^ (b & (c ^ d))); a = b + shiftLeft(a, 7); index++; d += msg[i + 17] + md5constant[index] + (c ^ (a & (b ^ c))); d = a + shiftLeft(d, 12); index++; c += msg[i + 18] + md5constant[index] + (b ^ (d & (a ^ B))); c = d + shiftLeft(c, 17); index++; b += msg[i + 19] + md5constant[index] + (a ^ (c & (d ^ a))); b = c + shiftLeft(b, 22); index++; } __syncthreads(); for (int i = -16; i != 0; i += 4) { a += msg[i + 17] + md5constant[index] + (c ^ (d & (b ^ c))); a = b + shiftLeft(a, 5); index++; d += msg[i + 22] + md5constant[index] + (b ^ (c & (a ^ B))); d = a + shiftLeft(d, 9); index++; c += msg[i + 27] + md5constant[index] + (a ^ (b & (d ^ a))); c = d + shiftLeft(c, 14); index++; b += msg[i + 16] + md5constant[index] + (d ^ (a & (c ^ d))); b = c + shiftLeft(b, 20); index++; } __syncthreads(); for (int i = 16; i != 0; i -= 4) { a += msg[i + 5] + md5constant[index] + ((b ^ c) ^ d); a = b + shiftLeft(a, 4); index++; d += msg[i + 8] + md5constant[index] + (a ^ (b ^ c)); d = a + shiftLeft(d, 11); index++; c += msg[i + 11] + md5constant[index] + ((d ^ a) ^ B); c = d + shiftLeft(c, 16); index++; b += msg[i + 14] + md5constant[index] + (c ^ (d ^ a)); b = c + shiftLeft(b, 23); index++; } __syncthreads(); for (int i = 16; i != 0; i -= 4) { a += msg[i] + md5constant[index] + (c ^ (~d | B)); a = b + shiftLeft(a, 6); index++; d += msg[i + 7] + md5constant[index] + (b ^ (~c | a)); d = a + shiftLeft(d, 10); index++; c += msg[i + 14] + md5constant[index] + (a ^ (~b | d)); c = d + shiftLeft(c, 15); index++; b += msg[i + 5] + md5constant[index] + (d ^ (~a | c)); b = c + shiftLeft(b, 21); index++; } __syncthreads(); partialHash[0] = partialHash[0] + a; partialHash[1] = partialHash[1] + b; partialHash[2] = partialHash[2] + c; partialHash[3] = partialHash[3] + d; };
device void final(unsigned char* B)
{
unsigned bit0 = bitTotal[0]; unsigned bit1 = bitTotal[1]; unsigned occupied = bit0 & 511; waitingData[occupied / 32] = 0x80 << (int)(occupied & 31); occupied += 8; for (int i = 0; i < 4; i++) { unsigned int hashElement = partialHash[i]; hashSum[4 * i] = (unsigned char)((hashElement) & 255); hashSum[4 * i + 1] = (unsigned char)((hashElement >> 8) & 255); hashSum[4 * i + 2] = (unsigned char)((hashElement >> 16) & 255); hashSum[4 * i + 3] = (unsigned char)((hashElement >> 24) & 255); } __syncthreads(); for (int i = 0; i < 16; i++) { b[i] = hashSum[i]; } __syncthreads(); };
device unsigned int shiftLeft(unsigned int val, unsigned int step)
{
unsigned int overflowBits = val >> (32 - step); val <<= step; val |= overflowBits; return val; };
global void computeMD5Gpu(unsigned char* bytes)
{
reset(); process(); final(bytes); };
host void ts()
{
unsigned char hash[17]; memset(hash, 0, sizeof(hash)); unsigned char* devHash; cudaMalloc((void**)&devHash, sizeof(hash)); dim3 grid = dim3(1, 1, 1); dim3 block = dim3(16, 16, 1); cudaEvent_t syncEvent; cudaEventCreate(&syncEvent); computeMD5Gpu<<<grid, block>>>(devHash); cudaEventRecord(syncEvent, 0); cudaEventSynchronize(syncEvent); cudaMemcpy(hash, devHash, sizeof(hash), cudaMemcpyDeviceToHost); cudaFree(devHash); cudaEventDestroy(syncEvent); for (int i = 0; i < 16; i++) { printf("%i\n",hash[i]); } printf("\n"); }
host int main()
{
ts(); return 0; }
[/codebox]
All your threads are doing the same thing. They will interfere with each other when they refer to shared memory or global memory.
Thanks