Hi all,
I read a program to find the Multiplication of SquareMatrix Using shared Memory.But It gives the wrong result(outputsnapshot)and my expected result is other(shapshot expect).
Tell me?
How Can I get a exacted output?
Thanks in Advance
#include<stdio.h> #include<cuda.h> __global__ void Shar(float *a,float *b,float *c,int n) { __shared__ float aTile[4][4],bTile[4][4]; int row=blockIdx.y*blockDim.y+threadIdx.y; int col=blockIdx.x*blockDim.x+threadIdx.x; float sum=0.0; aTile[threadIdx.y][threadIdx.x]=a[row*n+threadIdx.x]; bTile[threadIdx.y][threadIdx.x]=b[threadIdx.y*n+col]; __syncthreads(); for(int i=0;i<n;i++) { sum+=aTile[threadIdx.y][i]*bTile[i][threadIdx.x]; } c[row*n+col]=sum; } int main() { float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d; int i,n; n=4; size_t size=sizeof(float)*(n*n); a_h=(float*)malloc(size); b_h=(float*)malloc(size); c_h=(float*)malloc(size); cudaMalloc((void**)&a_d,size); cudaMalloc((void**)&b_d,size); cudaMalloc((void**)&c_d,size); for(i=0;i<(n*n);i++) { a_h[i]=1; } cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice); for(i=0;i<(n*n);i++) { b_h[i]=1; } cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice); int blocksize=4; int nblock=n/blocksize+(n%blocksize==0?0:1); int TILE_DIM=4; Shar<<<nblock,blocksize,TILE_DIM>>>(a_d,b_d,c_d,n); cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost); printf("\nMultiplication Of Matrix"); for(i=0;i<(n*n);i++) { printf("\n%f",c_h[i]); } free(a_h); free(b_h); free(c_h); cudaFree(a_d); cudaFree(b_d); cudaFree(c_d); return 0; } 
