#define SUBVOL_DIM 128 cudaExtent volSize = make_cudaExtent(...); cudaArray *d_volArray = 0; // subvolume bound to texture cudaPitchedPtr d_volPPtr;// subvolume in device memory float* h_vol = NULL; // The full volume on host int iDivUp(int a, int b) { return ((a % b) != 0)? (a / b + 1): (a / b); } //Initialization and mem allocations void initCuda() { ... cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); cudaExtent subvolSize = make_cudaExtent(SUBVOL_DIM, SUBVOL_DIM, SUBVOL_DIM); CUDA_SAFE_CALL(cudaMalloc3DArray(&d_volArray, &channelDesc, subvolSize)); ... cudaExtent pitchedVolSize = make_cudaExtent(SUBVOL_DIM*sizeof(float), SUBVOL_DIM, SUBVOL_DIM); CUDA_SAFE_CALL(cudaMalloc3D(&d_volPPtr, pitchedVolSize)); ... }
Host to array copy:
void copy3DHostToArray(float *_src, cudaArray *_dst, cudaExtent copy_extent, cudaPos src_offset) { cudaMemcpy3DParms copyParams = {0}; float *h_source = _src + src_offset.x + src_offset.y*volSize.width + src_offset.z*volSize.width*volSize.height; copyParams.srcPtr = make_cudaPitchedPtr((void*)h_source, volSize.width*sizeof(float), volSize.width, volSize.height); copyParams.dstArray = _dst; copyParams.kind = cudaMemcpyHostToDevice; copyParams.extent = copy_extent; CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); CUT_CHECK_ERROR("Host -> Array Memcpy failed\n"); }
Device mem to array copy:
void copy3DMemToArray(cudaPitchedPtr _src, cudaArray *_dst) { cudaMemcpy3DParms copyParams = {0}; copyParams.srcPtr = _src; copyParams.dstArray = _dst; copyParams.kind = cudaMemcpyDeviceToDevice; copyParams.extent = make_cudaExtent(SUBVOL_DIM, SUBVOL_DIM, SUBVOL_DIM); CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); CUT_CHECK_ERROR("Mem -> Array Memcpy failed\n"); }
Device mem to host mem copy:
void copy3DMemToHost(cudaPitchedPtr _src, float *_dst, cudaExtent copy_extent, cudaExtent dst_extent, cudaPos src_offset, cudaPos dst_offset) { cudaMemcpy3DParms copyParams = {0}; copyParams.srcPtr = _src; float *h_target = _dst + dst_offset.x + dst_offset.y*dst_extent.width + dst_offset.z*dst_extent.width*dst_extent.height;//For some reason, using copyParams.dstPos doesn't give correct results, so we set the offset here. copyParams.dstPtr = make_cudaPitchedPtr((void*)h_target, dst_extent.width*sizeof(float), dst_extent.width, dst_extent.height); copyParams.kind = cudaMemcpyDeviceToHost; copyParams.extent = make_cudaExtent(copy_extent.width*sizeof(float), copy_extent.height, copy_extent.depth); copyParams.srcPos = make_cudaPos(src_offset.x*sizeof(float), src_offset.y, src_offset.z); // We want to copy copy_extent sized volume starting at (x_off, y_off, z_off). CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); CUT_CHECK_ERROR("Mem -> Host Memcpy failed\n"); }
Memory management (note that there is a one voxel border around every subvolume which is shared with other subvolumes):
cudaExtent subvolIndicesExtents = make_cudaExtent(iDivUp(volSize.width-2, SUBVOL_DIM-2), iDivUp(volSize.height-2, SUBVOL_DIM-2), iDivUp(volSize.depth-2, SUBVOL_DIM-2)); for(int _z = 0; _z< subvolIndicesExtents.depth; _z++) for(int _y = 0; _y< subvolIndicesExtents.height; _y++) for(int _x = 0; _x< subvolIndicesExtents.width; _x++) { //copy the subvolume to texture copy3DHostToArray(h_vol, d_volArray, make_cudaExtent(SUBVOL_DIM, SUBVOL_DIM, SUBVOL_DIM), make_cudaPos(_x*(SUBVOL_DIM-2), _y*(SUBVOL_DIM-2), _z*(SUBVOL_DIM-2))); //fprintf(stderr, "->%s", cudaGetErrorString(cudaGetLastError())); //run a kernel on subvolume. reads from texture (via d_volArray)and writes to d_volPPtr d_kernel<<<gridSize, blockSize>>>(d_volPPtr, ...); CUT_CHECK_ERROR("Kernel failed"); //fprintf(stderr, "---%d-%d-%d %s---", _x, _y, _z, cudaGetErrorString(cudaGetLastError())); cudaThreadSynchronize(); //Copy results back to host mem from device mem dst_off.x = 1 + _x*(SUBVOL_DIM-2); dst_off.y = 1 + _y*(SUBVOL_DIM-2); dst_off.z = 1 + _z*(SUBVOL_DIM-2); copy3DMemToHost(d_volPPtr, h_phi, copyvol, volSize, src_off, dst_off); //fprintf(stderr, "%s<-\n", cudaGetErrorString(cudaGetLastError())); }
Notes:
-
copy3DMemToHost() is the most generic of the three functions, but t shouldn’t be difficult to do the same with other two. I have hard-coded some values in the other two.
-
In my experience, setting a copy position (offset) on the host pitched ptr (host memory) never worked. So, I set the correct offset using pointer arithmetic myself and it works. Does anyone know why? Setting offsets on device memory and arrays always works.
-
In case of normal 3D memory on device and host, always set the first argument of cudaPos, cudaExtent to the offset along X in “bytes”. While, with arrays, this has to be in number of elements in X direction. Anyway, this is documented in the API guide.
Most of the code is trivial and taken from cuda examples.
Hope it helps.
Cheers,
Ojaswa