Hi,
I am using CUDA1.0 and i want to do some atomic operation in a memory location at global area. My display card is 8800GTS. I changed the custom build setup to
$(CUDA_BIN_PATH)\nvcc.exe -arch sm_11 -ccbin “$(VCInstallDir)bin” -c -DWIN32 -D_CONSOLE -D_MBCS -Xcompiler /EHsc,/W3,/nologo,/Wp64,/O2,/Zi,/MT -I"$(CUDA_INC_PATH)" -I./ -I…/…/common/inc -o $(ConfigurationName)\template.obj template.cu
But the program is giving some different output than what i expect.
kernel
global void
testKernel( int* g_odata)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index int tx = threadIdx.x; int ty = threadIdx.y; if(tx==0&&bx==0) { g_odata[0]=0; } int nBlocksize = 16; int nStart = bx * ceil((float)65536/nBlocksize) + tx * ceil((float)((65536/nBlocksize)/nBlocksize)); for( int i = nStart; i <= nStart+ceil((float)((65536/nBlocksize)/nBlocksize)); i=i+1 ) { g_odata[0] = 1.0f; //__syncthreads(); } }
host
void
runTest( int argc, char** argv)
{
CUT_DEVICE_INIT();
int* pCpuOutData = (int*)malloc( 256256sizeof(float));
int* pOutData;
CUDA_SAFE_CALL( cudaMalloc( (void**) &pOutData, 256 * 256 * sizeof(int)));
dim3 grid(16,1);
dim3 thread(16,1);
testKernel<<<grid,thread>>>(pOutData);
CUDA_SAFE_CALL( cudaMemcpy( pCpuOutData, pOutData, 256 * 256 * sizeof(int), cudaMemcpyDeviceToHost) ); printf("%d",pCpuOutData[0]); CUDA_SAFE_CALL( cudaFree(pOutData)); free( pCpuOutData ); }
The output is some junk value like 11731320…
Please help me.