@@ -62,7 +62,7 @@ enum MemoryAccess {
 
 template <typename T> __device__ T myrand(T i);
 
-// from wikipedia - glibc LCG constants
+// glibc LCG constants - taken from public domain
 // x_n+1 = (a*x_n + c) mod m
 // a = 1103515245
 // m = 2^31
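
Only the forward declaration of myrand appears above; the LCG body itself is outside this diff. A minimal sketch of a glibc-style specialization matching the constants in these comments (the increment c = 12345 is glibc's value and is an assumption here, since the diff lists only a and m):

    // Hypothetical specialization, not the file's actual definition.
    template <>
    __device__ unsigned int myrand(unsigned int i)
    {
        // x_{n+1} = (a * x_n + c) mod 2^31, a = 1103515245, c = 12345 (assumed)
        return (1103515245u * i + 12345u) & 0x7fffffffu;
    }
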
@@ -97,7 +97,7 @@ template<typename data_type>
 __global__ void read_thread_blocksync(data_type *ptr, const size_t size)
 {
     size_t n = size / sizeof(data_type);
-    data_type accum = 0; // ToDo: check PTX that accum is not optimized out
+    data_type accum = 0;
 
     size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
     while (1) {
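
The hunk cuts off inside the while loop, so the loop body is not part of this diff. A plausible grid-stride sketch of what a streaming read kernel like this typically does (an assumption, not the file's actual body; any per-pass block synchronization implied by the kernel's name is omitted):

    // Hypothetical continuation: stride over the buffer, accumulating
    // reads so the loads have a visible side effect.
    size_t stride = (size_t)blockDim.x * gridDim.x;
    while (tid < n) {
        accum += ptr[tid];
        tid += stride;
    }
    ptr[0] = accum; // dummy write keeps accum, and thus the reads, live
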
@@ -195,18 +195,26 @@ __global__ void cta_random_warp_streaming_read(data_type *ptr, const size_t size
     ptr[0] = accum;
 }
 
-int main(int argc, char *argv[]) {
 
+typedef struct {
+    std::string header_string = "";
+    size_t page_size;
+    float oversubscription_factor;
+    int loop_count;
+    int block_size;
+    KernelOp k_op;
+    UVMBehavior uvm_behavior;
+    MemoryAccess memory_access;
+} cmdline_params;
+
+cmdline_params parse_arguments(int argc, char *argv[]) {
     KernelOp k_op = READ;
     UVMBehavior uvm_behavior = PAGE_FAULT;
     MemoryAccess memory_access = STREAMING;
     float oversubscription_factor = 1.0f; // 1.0 - 100%
     size_t page_size = TWO_MB;
     int loop_count = 3;
-    int num_gpus = 0;
     int block_size = 128;
-    CUDA_CHECK(cudaGetDeviceCount(&num_gpus));
-
     std::string header_string = "";
 
     int cur_pos = 1;
@@ -258,7 +266,6 @@ int main(int argc, char *argv[]) {
         }
         else if (flag == "-lc") {
             loop_count = std::atoi(argv[cur_pos++]);
-            header_string += "loop_count=";
         }
         else if (flag == "-blocksize") {
             block_size = std::atoi(argv[cur_pos++]);
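
These branches sit inside a flag-dispatch loop driven by cur_pos; the loop itself is outside this diff. A sketch of its likely shape, inferred from the visible branches and labeled as an assumption:

    // Hypothetical surrounding loop: walk argv, dispatch on each flag,
    // consume the flag's value with argv[cur_pos++].
    while (cur_pos < argc) {
        std::string flag = argv[cur_pos++];
        if (flag == "-lc") {
            loop_count = std::atoi(argv[cur_pos++]);
        }
        else if (flag == "-blocksize") {
            block_size = std::atoi(argv[cur_pos++]);
        }
        // ... remaining flags elided ...
    }
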
@@ -304,49 +311,98 @@ int main(int argc, char *argv[]) {
     header_string += "loop_count=";
     header_string += std::to_string(loop_count);
 
+    cmdline_params args;
+    args.header_string = header_string;
+    args.page_size = page_size;
+    args.oversubscription_factor = oversubscription_factor;
+    args.loop_count = loop_count;
+    args.block_size = block_size;
+    args.k_op = k_op;
+    args.uvm_behavior = uvm_behavior;
+    args.memory_access = memory_access;
+
+    return args;
+}
+
+void setup_memory_allocation(cmdline_params params, size_t vidmem_size, bool is_P9, void **blockMemory, void **uvm_alloc_ptr, size_t &allocation_size) {
+
     // determine cudaMallocManaged size
     int current_device = 0;
     CUDA_CHECK(cudaSetDevice(current_device));
-    cudaDeviceProp prop;
-    CUDA_CHECK(cudaGetDeviceProperties(&prop, current_device));
-    bool is_P9 = (prop.pageableMemoryAccessUsesHostPageTables == 1);
-
-    size_t allocation_size = (size_t)(oversubscription_factor * prop.totalGlobalMem);
+    allocation_size = (size_t)(params.oversubscription_factor * vidmem_size);
 
-    void *blockMemory = nullptr;
-    if (memory_access != STREAMING && uvm_behavior) {
+    if (params.memory_access == RANDOM_WARP) {
         // reduce test working memory
         // cudaMalloc 2/3 GPU
-        size_t cudaMallocSize = AlignSize(size_t(prop.totalGlobalMem * 0.67), page_size);
-        allocation_size = AlignSize(size_t(prop.totalGlobalMem * 0.33 * oversubscription_factor),
-                                    page_size);
-        CUDA_CHECK(cudaMalloc(&blockMemory, cudaMallocSize));
+        size_t cudaMallocSize = AlignSize(size_t(vidmem_size * 0.67), params.page_size);
+        allocation_size = AlignSize(size_t(vidmem_size * 0.33 * params.oversubscription_factor),
+                                    params.page_size);
+        CUDA_CHECK(cudaMalloc(blockMemory, cudaMallocSize));
     }
     // pad allocation to page_size
-    allocation_size = AlignSize(allocation_size, page_size);
-    size_t num_pages = allocation_size / page_size;
-
-    size_t avail_phy_vidmem = 0, total_phy_vidmem = 0;
-    // allocate memory - add hints etc as needed
-    void *uvm_alloc_ptr = NULL;
+    allocation_size = AlignSize(allocation_size, params.page_size);
 
     // For P9 we need to allocate and free in the benchmark loop,
     // as evicted memory keeps remote mappings that don't trigger a page fault
-    if (!(is_P9 && uvm_behavior == PAGE_FAULT)) {
-        CUDA_CHECK(cudaMallocManaged(&uvm_alloc_ptr, allocation_size));
-        CUDA_CHECK(cudaMemGetInfo(&avail_phy_vidmem, &total_phy_vidmem));
+    if (!(is_P9 && params.uvm_behavior == PAGE_FAULT)) {
+        CUDA_CHECK(cudaMallocManaged(uvm_alloc_ptr, allocation_size));
+
         // populate pages on GPU
-        CUDA_CHECK(cudaMemPrefetchAsync(uvm_alloc_ptr, allocation_size, current_device));
+        CUDA_CHECK(cudaMemPrefetchAsync(*uvm_alloc_ptr, allocation_size, current_device));
+    }
+}
+
+dim3 get_grid_config(cmdline_params params, int multiProcessorCount) {
+    dim3 grid(1, 1, 1);
+    int num_blocks_per_sm = 1; // placeholder value
+
+    if (params.k_op == READ) {
+        if (params.memory_access == STREAMING) {
+            cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
+                read_thread_blocksync<float>, params.block_size, 0);
+            grid.x = multiProcessorCount * num_blocks_per_sm;
+        }
+        else if (params.memory_access == BLOCK_STREAMING) {
+            cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
+                read_thread_blockCont_blocksync<float>, params.block_size, 0);
+            grid.x = multiProcessorCount * num_blocks_per_sm;
+        }
+        else if (params.memory_access == RANDOM_WARP) {
+            cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
+                cta_random_warp_streaming_read<float>, params.block_size, 0);
+            grid.x = multiProcessorCount * num_blocks_per_sm;
+        }
     }
+    return grid;
+}
+
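
To make the launch math concrete: get_grid_config sizes the grid so every SM carries as many resident blocks as the occupancy calculator reports. With hypothetical numbers:

    // Assumed figures for illustration: 80 SMs, and the occupancy query
    // reporting num_blocks_per_sm = 16 at block_size = 128 gives
    // grid.x = 80 * 16 = 1280 blocks, i.e. 1280 * 128 = 163840 resident threads.
    dim3 grid = get_grid_config(args, /*multiProcessorCount=*/80);
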
+int main(int argc, char *argv[]) {
+    cmdline_params args = parse_arguments(argc, argv);
+
+    int current_device = 0;
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, current_device));
+    bool is_P9 = (prop.pageableMemoryAccessUsesHostPageTables == 1);
+    void *blockMemory = nullptr;
+    void *uvm_alloc_ptr = nullptr;
+    size_t allocation_size = 0;
+    size_t avail_phy_vidmem = 0, total_phy_vidmem = 0;
+    CUDA_CHECK(cudaMemGetInfo(&avail_phy_vidmem, &total_phy_vidmem));
+
+    setup_memory_allocation(args, prop.totalGlobalMem, is_P9, &blockMemory, &uvm_alloc_ptr, allocation_size);
+    size_t num_pages = allocation_size / args.page_size;
+
+    if (args.memory_access == RANDOM_WARP)
+        CUDA_CHECK(cudaMemGetInfo(&avail_phy_vidmem, &total_phy_vidmem));
+
 
     // P9 needs more state space on vidmem - size in MB
     size_t state_space_size = (prop.pageableMemoryAccessUsesHostPageTables == 1) ? 320 : 128;
-    size_t permissible_phys_pages_count = avail_phy_vidmem / page_size;
-    permissible_phys_pages_count -= (state_space_size * 1024 * 1024 / page_size);
+    size_t permissible_phys_pages_count = avail_phy_vidmem / args.page_size;
+    permissible_phys_pages_count -= (state_space_size * 1024 * 1024 / args.page_size);
 
-    dim3 block(block_size, 1, 1);
-    dim3 grid((prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor) / block.x, 1, 1);
-    int num_blocks_per_sm = 1; // placeholder value
+    dim3 block(args.block_size, 1, 1);
+    dim3 grid = get_grid_config(args, prop.multiProcessorCount);
 
     cudaStream_t task_stream;
     CUDA_CHECK(cudaStreamCreate(&task_stream));
@@ -359,21 +415,23 @@ int main(int argc, char *argv[]) {
     float accum_kernel_time = 0.0f;
     float accum_bw = 0.0f;
 
-    for (int itr = 0; itr < loop_count; itr++) {
-        if (is_P9 && uvm_behavior == PAGE_FAULT) {
+    for (int itr = 0; itr < args.loop_count; itr++) {
+        // on P9, memory is allocated inside the loop for fault-based allocations
+        // to avoid access-counter-initiated mapping
+        if (is_P9 && args.uvm_behavior == PAGE_FAULT) {
             CUDA_CHECK(cudaMallocManaged(&uvm_alloc_ptr, allocation_size));
         }
         // prefetch to CPU as starting point
-        if (uvm_behavior != PREFETCH_ONCE_AND_HINTS)
+        if (args.uvm_behavior != PREFETCH_ONCE_AND_HINTS)
             CUDA_CHECK(cudaMemPrefetchAsync(uvm_alloc_ptr, allocation_size, cudaCpuDeviceId,
                                             task_stream));
 
-        switch (uvm_behavior) {
+        switch (args.uvm_behavior) {
         case STRIPE_GPU_CPU:
         {
             // distribute pages across GPU0 and CPU
             // get page-split ratios
-            float cpu_factor = oversubscription_factor - 1.0;
+            float cpu_factor = args.oversubscription_factor - 1.0;
             if (cpu_factor < 0.0f)
                 cpu_factor = 0.0f;
             int mod_zero_devId = cudaCpuDeviceId;
@@ -382,10 +440,10 @@ int main(int argc, char *argv[]) {
             if (cpu_factor > 1.0) {
                 mod_zero_devId = current_device;
                 flip_devId = cudaCpuDeviceId;
-                mod_scale = int(std::round(oversubscription_factor));
+                mod_scale = int(std::round(args.oversubscription_factor));
             }
             else if (cpu_factor != 0.0f) {
-                mod_scale = int(std::round(oversubscription_factor / cpu_factor));
+                mod_scale = int(std::round(args.oversubscription_factor / cpu_factor));
             }
             int gpu_page_count = 0, cpu_page_count = 0;
             void *running_ptr = uvm_alloc_ptr;
@@ -398,22 +456,22 @@ int main(int argc, char *argv[]) {
                 if (gpu_page_count == permissible_phys_pages_count)
                     device = cudaCpuDeviceId;
 
-                CUDA_CHECK(cudaMemPrefetchAsync(running_ptr, page_size, device, task_stream));
+                CUDA_CHECK(cudaMemPrefetchAsync(running_ptr, args.page_size, device, task_stream));
 
                 if (device == cudaCpuDeviceId)
                     cpu_page_count++;
                 else
                     gpu_page_count++;
 
                 if (itr == 0) {
-                    CUDA_CHECK(cudaMemAdvise(running_ptr, page_size, cudaMemAdviseSetPreferredLocation,
+                    CUDA_CHECK(cudaMemAdvise(running_ptr, args.page_size, cudaMemAdviseSetPreferredLocation,
                                              device));
 
                     if (device == cudaCpuDeviceId)
-                        CUDA_CHECK(cudaMemAdvise(running_ptr, page_size, cudaMemAdviseSetAccessedBy,
+                        CUDA_CHECK(cudaMemAdvise(running_ptr, args.page_size, cudaMemAdviseSetAccessedBy,
                                                  current_device));
                 }
-                running_ptr = reinterpret_cast<void *>((size_t)running_ptr + page_size);
+                running_ptr = reinterpret_cast<void *>((size_t)running_ptr + args.page_size);
             }
         }
         break;
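
Worked examples of the split those mod_scale branches produce (the per-page index % mod_scale placement test sits in elided lines and is assumed):

    // oversubscription_factor = 1.5: cpu_factor = 0.5,
    //   mod_scale = round(1.5 / 0.5) = 3, mod_zero_devId stays the CPU,
    //   so one page in three goes to the CPU: the 0.5/1.5 overflow share.
    // oversubscription_factor = 3.0: cpu_factor = 2.0 > 1.0, roles flip:
    //   mod_scale = round(3.0) = 3, mod_zero_devId becomes the GPU,
    //   so one page in three stays on the GPU: the 1.0/3.0 share that fits.
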
@@ -446,28 +504,19 @@ int main(int argc, char *argv[]) {
         CUDA_CHECK(cudaEventRecord(startE, task_stream));
 
         // run read/write kernel for streaming/random access
-        if (k_op == READ) {
-            if (memory_access == STREAMING) {
-                cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
-                    read_thread_blocksync<float>, block.x, 0);
-                grid.x = prop.multiProcessorCount * num_blocks_per_sm;
+        if (args.k_op == READ) {
+            if (args.memory_access == STREAMING) {
                 read_thread_blocksync<float><<<grid, block, 0, task_stream>>>((float *)uvm_alloc_ptr,
                                                                               allocation_size);
             }
-            else if (memory_access == BLOCK_STREAMING) {
-                cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
-                    read_thread_blockCont_blocksync<float>, block.x, 0);
-                grid.x = prop.multiProcessorCount * num_blocks_per_sm;
+            else if (args.memory_access == BLOCK_STREAMING) {
                 read_thread_blockCont_blocksync<float><<<grid, block, 0, task_stream>>>((float *)uvm_alloc_ptr,
                                                                                         allocation_size);
             }
-            else if (memory_access == RANDOM_WARP) {
-                cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
-                    cta_random_warp_streaming_read<float>, block.x, 0);
-                grid.x = prop.multiProcessorCount * num_blocks_per_sm;
+            else if (args.memory_access == RANDOM_WARP) {
                 cta_random_warp_streaming_read<float><<<grid, block, 0, task_stream>>>(
                     (float *)uvm_alloc_ptr, allocation_size,
-                    num_pages, page_size);
+                    num_pages, args.page_size);
             }
         }
 
@@ -480,7 +529,7 @@ int main(int argc, char *argv[]) {
         float bw_meas = allocation_size / (1024.0f * 1024.0f * 1024.0f) / (kernel_time / 1000.0f);
         accum_bw += bw_meas;
 
-        if (is_P9 && uvm_behavior == PAGE_FAULT) {
+        if (is_P9 && args.uvm_behavior == PAGE_FAULT) {
             CUDA_CHECK(cudaFree(uvm_alloc_ptr));
         }
     }
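
A quick sanity check of the bandwidth formula above, with hypothetical numbers:

    // allocation_size = 32 GiB, kernel_time = 250 ms:
    // bw_meas = 32 / (250 / 1000) = 128 GB/s
    // (strictly GiB/s, since the divisor is 1024^3 rather than 1e9)
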
@@ -490,9 +539,9 @@ int main(int argc, char *argv[]) {
     CUDA_CHECK(cudaStreamDestroy(task_stream));
 
     // report per-run averages: kernel time and bandwidth (average of per-run bandwidths, not total bytes over total time)
-    printf("%s, %f ms, %f GB/s\n", header_string.c_str(), accum_kernel_time / loop_count, accum_bw / loop_count);
+    printf("%s, %f ms, %f GB/s\n", args.header_string.c_str(), accum_kernel_time / args.loop_count, accum_bw / args.loop_count);
 
-    if (!(is_P9 && uvm_behavior == PAGE_FAULT)) {
+    if (!(is_P9 && args.uvm_behavior == PAGE_FAULT)) {
         CUDA_CHECK(cudaFree(uvm_alloc_ptr));
     }
 