Commit 5a00355

refactor main function, resolve review comments
1 parent 4aba949 commit 5a00355

File tree

3 files changed: +124 -65 lines changed

posts/unified-memory-oversubscription/Makefile

Lines changed: 3 additions & 1 deletion
@@ -29,7 +29,9 @@ NVCC ?= nvcc
 all: uvm_oversubs
 
 uvm_oversubs: uvm_oversubs.cu
-	$(NVCC) $^ -o $@ -std=c++11 -arch=sm_70
+	$(NVCC) $^ -o $@ -std=c++11 -gencode arch=compute_70,code=sm_70 \
+	                            -gencode arch=compute_80,code=sm_80 \
+	                            -gencode arch=compute_80,code=compute_80
 
 clean:
 	$(RM) uvm_oversubs
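Note on the flag change (context, not part of the diff): `-arch=sm_70` embeds SASS for Volta only, while the three `-gencode` clauses build a fat binary with SASS for sm_70 and sm_80 plus PTX for compute_80, which the driver can JIT-compile for newer GPUs. A minimal sketch for checking which embedded target a device will match, using only the standard runtime API:

```
// Minimal sketch (not part of the commit): query the device's compute
// capability to see which embedded -gencode target it will use.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return 1;
  // sm_70 SASS serves 7.0 devices, sm_80 SASS serves 8.0 devices;
  // anything newer falls back to JIT compilation of the compute_80 PTX.
  printf("device 0: sm_%d%d\n", prop.major, prop.minor);
  return 0;
}
```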

posts/unified-memory-oversubscription/README.md

Lines changed: 11 additions & 3 deletions
@@ -2,7 +2,7 @@
 
 Benchmark for UVM oversubscription tests
 
-Build command: `nvcc uvm_oversubs.cu -gencode arch=compute_70,code=sm_70 -o uvm_oversubs`
+Application build: run `make` with the provided Makefile to build the executable.
 ## Command line options
 
 ```
@@ -19,7 +19,7 @@ Float value.
 Eg: 1.1 - 110% GPU allocation
 Default: 1.0
 
--s - Page Size
+-s - Software-abstracted page size for memory striping experiments
 2M/64K/4K
 Default: 2M
 
@@ -28,4 +28,12 @@ Default: 128
 
 -lc - LoopCount - Benchmark iteration count
 integer value
-```
+Default: 3
+```
+
+## Sample commands (with test description)
+
+`uvm_oversubs -p 2.0 -a streaming -m zero_copy` - Tests oversubscription with a working set twice the GPU memory size, using zero-copy access (data placed in CPU memory and read directly by the GPU) and a streaming access pattern (see the corresponding developer blog for details).
+
+`uvm_oversubs -p 0.5 -a block_streaming -m fault` - Tests half of GPU memory allocated with Unified Memory (`cudaMallocManaged`); a block-strided kernel reads the data with page-fault-driven migration.
+
+`uvm_oversubs -p 1.5 -a stripe_gpu_cpu -m random_warp` - Tests oversubscription with a 1.5x GPU memory working set, with memory pages striped between GPU and CPU. A random-warp kernel accesses a different 128-byte region of the allocation in each loop iteration.
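Context for the first sample command (not part of the commit): in zero_copy mode the working set stays in pinned system memory and the GPU dereferences it in place over PCIe or NVLink instead of migrating pages. A self-contained sketch of that access mode using `cudaMallocHost`; the names are illustrative and this is not necessarily how uvm_oversubs.cu sets it up:

```
// Illustrative zero-copy read: the buffer lives in pinned CPU memory and the
// kernel reads it directly over the interconnect (no page migration).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void read_pinned(const float *p, size_t n, float *out) {
  size_t tid = threadIdx.x + blockIdx.x * (size_t)blockDim.x;
  float acc = 0.0f;
  for (size_t i = tid; i < n; i += (size_t)gridDim.x * blockDim.x)
    acc += p[i];          // each load is serviced from pinned CPU memory
  out[tid] = acc;         // the store keeps the loads observable
}

int main() {
  const size_t n = 1 << 24;                      // 64 MiB of floats
  const int blocks = 256, threads = 128;
  float *host_buf = nullptr, *out = nullptr;
  cudaMallocHost(&host_buf, n * sizeof(float));  // pinned, GPU-accessible
  cudaMalloc(&out, blocks * threads * sizeof(float));
  read_pinned<<<blocks, threads>>>(host_buf, n, out);
  cudaDeviceSynchronize();
  printf("status: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaFreeHost(host_buf);
  cudaFree(out);
  return 0;
}
```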

posts/unified-memory-oversubscription/uvm_oversubs.cu

Lines changed: 110 additions & 61 deletions
@@ -62,7 +62,7 @@ enum MemoryAccess {
 
 template <typename T> __device__ T myrand(T i);
 
-// from wikipedia - glibc LCG constants
+// glibc LCG constants - taken from public domain
 // x_n+1 = (a*x_n + c) mod m
 // a = 1103515245
 // m = 2^31
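For reference (not in the diff), the recurrence those constants define is x_{n+1} = (1103515245 * x_n + 12345) mod 2^31, where 12345 is glibc's increment c. A hypothetical device-side sketch; the commit does not show myrand's actual body:

```
// Hypothetical sketch of the glibc-style LCG the comment describes:
// x_{n+1} = (a*x_n + c) & 0x7fffffff, with a = 1103515245, c = 12345.
template <typename T>
__device__ T myrand_sketch(T x) {
  return static_cast<T>(
      (1103515245u * static_cast<unsigned int>(x) + 12345u) & 0x7fffffffu);
}
```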
@@ -97,7 +97,7 @@ template<typename data_type>
 __global__ void read_thread_blocksync(data_type *ptr, const size_t size)
 {
   size_t n = size / sizeof(data_type);
-  data_type accum = 0; // ToDo: check PTX that accum is not optimized out
+  data_type accum = 0;
 
   size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   while (1) {
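On the dropped ToDo (my reading, not stated in the commit): each read kernel ends by storing accum to global memory (see the `ptr[0] = accum;` context line in the next hunk), and that observable store prevents the compiler from eliminating the loads that feed it, so the PTX check is unnecessary. A generic illustration of the pattern, not code from this file:

```
// Generic keep-alive pattern: the global-memory store makes the sum
// observable, so the compiler cannot dead-code-eliminate the loads.
__global__ void sum_keepalive(const float *in, size_t n, float *out) {
  float accum = 0.0f;
  for (size_t i = 0; i < n; i++)
    accum += in[i];
  *out = accum;  // side effect anchoring the accumulation
}
```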
@@ -195,18 +195,26 @@ __global__ void cta_random_warp_streaming_read(data_type *ptr, const size_t size
   ptr[0] = accum;
 }
 
-int main(int argc, char *argv[]) {
 
+typedef struct {
+  std::string header_string = "";
+  size_t page_size;
+  float oversubscription_factor;
+  int loop_count;
+  int block_size;
+  KernelOp k_op;
+  UVMBehavior uvm_behavior;
+  MemoryAccess memory_access;
+} cmdline_params;
+
+cmdline_params parse_arguments(int argc, char *argv[]) {
   KernelOp k_op = READ;
   UVMBehavior uvm_behavior = PAGE_FAULT;
   MemoryAccess memory_access = STREAMING;
   float oversubscription_factor = 1.0f; // 1.0 - 100%
   size_t page_size = TWO_MB;
   int loop_count = 3;
-  int num_gpus = 0;
   int block_size = 128;
-  CUDA_CHECK(cudaGetDeviceCount(&num_gpus));
-
   std::string header_string = "";
 
   int cur_pos = 1;
@@ -258,7 +266,6 @@ int main(int argc, char *argv[]) {
   }
   else if (flag == "-lc") {
     loop_count = std::atoi(argv[cur_pos++]);
-    header_string += "loop_count=";
   }
   else if (flag == "-blocksize") {
     block_size = std::atoi(argv[cur_pos++]);
@@ -304,49 +311,98 @@ int main(int argc, char *argv[]) {
   header_string += "loop_count=";
   header_string += std::to_string(loop_count);
 
+  cmdline_params args;
+  args.header_string = header_string;
+  args.page_size = page_size;
+  args.oversubscription_factor = oversubscription_factor;
+  args.loop_count = loop_count;
+  args.block_size = block_size;
+  args.k_op = k_op;
+  args.uvm_behavior = uvm_behavior;
+  args.memory_access = memory_access;
+
+  return args;
+}
+
+void setup_memory_allocation(cmdline_params params, size_t vidmem_size, bool is_P9, void** blockMemory, void **uvm_alloc_ptr, size_t& allocation_size) {
   // determine cudaMallocManaged size
   int current_device = 0;
   CUDA_CHECK(cudaSetDevice(current_device));
-  cudaDeviceProp prop;
-  CUDA_CHECK(cudaGetDeviceProperties(&prop, current_device));
-  bool is_P9 = (prop.pageableMemoryAccessUsesHostPageTables == 1);
-
-  size_t allocation_size = (size_t)(oversubscription_factor * prop.totalGlobalMem);
+  allocation_size = (size_t)(params.oversubscription_factor * vidmem_size);
 
-  void *blockMemory = nullptr;
-  if (memory_access != STREAMING && uvm_behavior) {
+  if (params.memory_access == RANDOM_WARP) {
     // reduce test working memory
     // cudaMalloc 2/3 GPU
-    size_t cudaMallocSize = AlignSize(size_t(prop.totalGlobalMem * 0.67), page_size);
-    allocation_size = AlignSize(size_t(prop.totalGlobalMem * 0.33 * oversubscription_factor),
-                                page_size);
-    CUDA_CHECK(cudaMalloc(&blockMemory, cudaMallocSize));
+    size_t cudaMallocSize = AlignSize(size_t(vidmem_size * 0.67), params.page_size);
+    allocation_size = AlignSize(size_t(vidmem_size * 0.33 * params.oversubscription_factor),
+                                params.page_size);
+    CUDA_CHECK(cudaMalloc(blockMemory, cudaMallocSize));
   }
   // pad allocation to page_size
-  allocation_size = AlignSize(allocation_size, page_size);
-  size_t num_pages = allocation_size / page_size;
-
-  size_t avail_phy_vidmem = 0, total_phy_vidmem = 0;
-  // allocate memory - add hints etc as needed
-  void *uvm_alloc_ptr = NULL;
+  allocation_size = AlignSize(allocation_size, params.page_size);
 
   // For P9 we need to allocate and free in-benchmark loop
   // as evicted memory has remote mappings don't trigger a page-fault
-  if (!(is_P9 && uvm_behavior == PAGE_FAULT)) {
-    CUDA_CHECK(cudaMallocManaged(&uvm_alloc_ptr, allocation_size));
-    CUDA_CHECK(cudaMemGetInfo(&avail_phy_vidmem, &total_phy_vidmem));
+  if (!(is_P9 && params.uvm_behavior == PAGE_FAULT)) {
+    CUDA_CHECK(cudaMallocManaged(uvm_alloc_ptr, allocation_size));
+
     // populate pages on GPU
-    CUDA_CHECK(cudaMemPrefetchAsync(uvm_alloc_ptr, allocation_size, current_device));
+    CUDA_CHECK(cudaMemPrefetchAsync(*uvm_alloc_ptr, allocation_size, current_device));
   }
+}
+
+dim3 get_grid_config(cmdline_params params, int multiProcessorCount) {
+  dim3 grid(1,1,1);
+  int num_blocks_per_sm = 1; // placeholder value
+
+  if (params.k_op == READ) {
+    if (params.memory_access == STREAMING) {
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
+          read_thread_blocksync<float>, params.block_size, 0);
+      grid.x = multiProcessorCount * num_blocks_per_sm;
+    }
+    else if (params.memory_access == BLOCK_STREAMING) {
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
+          read_thread_blockCont_blocksync<float>, params.block_size, 0);
+      grid.x = multiProcessorCount * num_blocks_per_sm;
+    }
+    else if (params.memory_access == RANDOM_WARP) {
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
+          cta_random_warp_streaming_read<float>, params.block_size, 0);
+      grid.x = multiProcessorCount * num_blocks_per_sm;
+    }
   }
+  return grid;
+}
+
+int main(int argc, char *argv[]) {
+  cmdline_params args = parse_arguments(argc, argv);
+
+  int current_device = 0;
+  cudaDeviceProp prop;
+  CUDA_CHECK(cudaGetDeviceProperties(&prop, current_device));
+  bool is_P9 = (prop.pageableMemoryAccessUsesHostPageTables == 1);
+  void *blockMemory = nullptr;
+  void *uvm_alloc_ptr = nullptr;
+  size_t allocation_size = 0;
+  size_t avail_phy_vidmem = 0, total_phy_vidmem = 0;
+  CUDA_CHECK(cudaMemGetInfo(&avail_phy_vidmem, &total_phy_vidmem));
+
+  setup_memory_allocation(args, prop.totalGlobalMem, is_P9, &blockMemory, &uvm_alloc_ptr, allocation_size);
+  size_t num_pages = allocation_size / args.page_size;
+
+  if (args.memory_access == RANDOM_WARP)
+    CUDA_CHECK(cudaMemGetInfo(&avail_phy_vidmem, &total_phy_vidmem));
 
   // P9 need more state space on vidmem - size in MB
   size_t state_space_size = (prop.pageableMemoryAccessUsesHostPageTables == 1) ? 320 : 128;
-  size_t permissible_phys_pages_count = avail_phy_vidmem / page_size;
-  permissible_phys_pages_count -= (state_space_size * 1024 * 1024 / page_size);
+  size_t permissible_phys_pages_count = avail_phy_vidmem / args.page_size;
+  permissible_phys_pages_count -= (state_space_size * 1024 * 1024 / args.page_size);
 
-  dim3 block(block_size,1,1);
-  dim3 grid((prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor) / block.x, 1, 1);
-  int num_blocks_per_sm = 1; // placeholder value
+  dim3 block(args.block_size,1,1);
+  dim3 grid = get_grid_config(args, prop.multiProcessorCount);
 
   cudaStream_t task_stream;
   CUDA_CHECK(cudaStreamCreate(&task_stream));
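Note on the hoisted occupancy logic (context, not from the diff): `cudaOccupancyMaxActiveBlocksPerMultiprocessor` reports how many blocks of a given kernel and block size (with the given dynamic shared-memory budget) can be resident per SM, so `grid.x = SM count * blocks-per-SM` launches exactly one full wave of blocks across the device. A standalone sketch of the pattern, with an illustrative kernel:

```
// Illustrative occupancy-sized launch (names are placeholders, not from
// uvm_oversubs.cu): one resident wave of blocks covers the whole GPU.
#include <cuda_runtime.h>

__global__ void touch(float *p, size_t n) {
  for (size_t i = threadIdx.x + blockIdx.x * (size_t)blockDim.x; i < n;
       i += (size_t)gridDim.x * blockDim.x)
    p[i] += 1.0f;
}

dim3 one_wave_grid(int block_size, const cudaDeviceProp &prop) {
  int blocks_per_sm = 1;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, touch,
                                                block_size, /*dynSmem=*/0);
  // e.g. 80 SMs x 8 resident blocks/SM -> grid.x = 640
  return dim3(prop.multiProcessorCount * blocks_per_sm, 1, 1);
}
```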
@@ -359,21 +415,23 @@ int main(int argc, char *argv[]) {
   float accum_kernel_time = 0.0f;
   float accum_bw = 0.0f;
 
-  for (int itr = 0; itr < loop_count; itr++) {
-    if (is_P9 && uvm_behavior == PAGE_FAULT) {
+  for (int itr = 0; itr < args.loop_count; itr++) {
+    // on P9, memory is allocated in loop for fault based allocations
+    // to avoid access counter initiated mapping
+    if (is_P9 && args.uvm_behavior == PAGE_FAULT) {
       CUDA_CHECK(cudaMallocManaged(&uvm_alloc_ptr, allocation_size));
     }
     // prefetch to CPU as starting point
-    if (uvm_behavior != PREFETCH_ONCE_AND_HINTS)
+    if (args.uvm_behavior != PREFETCH_ONCE_AND_HINTS)
       CUDA_CHECK(cudaMemPrefetchAsync(uvm_alloc_ptr, allocation_size, cudaCpuDeviceId,
                                       task_stream));
 
-    switch(uvm_behavior) {
+    switch(args.uvm_behavior) {
       case STRIPE_GPU_CPU:
       {
         // distribute pages across GPU0 and CPU
         // get page-split ratios
-        float cpu_factor = oversubscription_factor - 1.0;
+        float cpu_factor = args.oversubscription_factor - 1.0;
         if (cpu_factor < 0.0f)
           cpu_factor = 0.0f;
         int mod_zero_devId = cudaCpuDeviceId;
@@ -382,10 +440,10 @@ int main(int argc, char *argv[]) {
         if (cpu_factor > 1.0) {
           mod_zero_devId = current_device;
           flip_devId = cudaCpuDeviceId;
-          mod_scale = int(std::round(oversubscription_factor));
+          mod_scale = int(std::round(args.oversubscription_factor));
         }
         else if (cpu_factor != 0.0f) {
-          mod_scale = int(std::round(oversubscription_factor / cpu_factor));
+          mod_scale = int(std::round(args.oversubscription_factor / cpu_factor));
         }
         int gpu_page_count = 0, cpu_page_count = 0;
         void *running_ptr = uvm_alloc_ptr;
@@ -398,22 +456,22 @@ int main(int argc, char *argv[]) {
           if (gpu_page_count == permissible_phys_pages_count)
             device = cudaCpuDeviceId;
 
-          CUDA_CHECK(cudaMemPrefetchAsync(running_ptr, page_size, device, task_stream));
+          CUDA_CHECK(cudaMemPrefetchAsync(running_ptr, args.page_size, device, task_stream));
 
           if (device == cudaCpuDeviceId)
             cpu_page_count++;
           else
             gpu_page_count++;
 
           if (itr == 0) {
-            CUDA_CHECK(cudaMemAdvise(running_ptr, page_size, cudaMemAdviseSetPreferredLocation,
+            CUDA_CHECK(cudaMemAdvise(running_ptr, args.page_size, cudaMemAdviseSetPreferredLocation,
                                      device));
 
             if (device == cudaCpuDeviceId)
-              CUDA_CHECK(cudaMemAdvise(running_ptr, page_size, cudaMemAdviseSetAccessedBy,
+              CUDA_CHECK(cudaMemAdvise(running_ptr, args.page_size, cudaMemAdviseSetAccessedBy,
                                        current_device));
           }
-          running_ptr = reinterpret_cast<void*>((size_t)running_ptr + page_size);
+          running_ptr = reinterpret_cast<void*>((size_t)running_ptr + args.page_size);
         }
       }
       break;
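Worked example of the split this case sets up (my arithmetic; it assumes the page loop, partly outside these hunks, sends a page to mod_zero_devId when its index is a multiple of mod_scale): with `-p 1.5`, cpu_factor = 1.5 - 1.0 = 0.5 and mod_scale = round(1.5 / 0.5) = 3, so every third page is prefetched to the CPU and pinned there by the advice, leaving 1.0x of GPU memory resident on the device. A host-only sketch:

```
// Host-only sketch of the page-split arithmetic (the index % mod_scale
// assignment is an assumption; that test is outside the shown hunks).
#include <cmath>
#include <cstdio>

int main() {
  float oversubscription_factor = 1.5f;               // -p 1.5
  float cpu_factor = oversubscription_factor - 1.0f;  // 0.5x lives on CPU
  int mod_scale = int(std::round(oversubscription_factor / cpu_factor)); // 3
  for (int page = 0; page < 6; page++)
    printf("page %d -> %s\n", page, page % mod_scale == 0 ? "CPU" : "GPU");
  return 0;
}
```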
@@ -446,28 +504,19 @@ int main(int argc, char *argv[]) {
     CUDA_CHECK(cudaEventRecord(startE, task_stream));
 
     // run read/write kernel for streaming/random access
-    if (k_op == READ) {
-      if (memory_access == STREAMING) {
-        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
-            read_thread_blocksync<float>, block.x, 0);
-        grid.x = prop.multiProcessorCount * num_blocks_per_sm;
+    if (args.k_op == READ) {
+      if (args.memory_access == STREAMING) {
         read_thread_blocksync<float><<<grid, block, 0, task_stream>>>((float*)uvm_alloc_ptr,
                                                                       allocation_size);
       }
-      else if (memory_access == BLOCK_STREAMING) {
-        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
-            read_thread_blockCont_blocksync<float>, block.x, 0);
-        grid.x = prop.multiProcessorCount * num_blocks_per_sm;
+      else if (args.memory_access == BLOCK_STREAMING) {
         read_thread_blockCont_blocksync<float><<<grid, block, 0, task_stream>>>((float*)uvm_alloc_ptr,
                                                                                 allocation_size);
       }
-      else if (memory_access == RANDOM_WARP) {
-        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
-            cta_random_warp_streaming_read<float>, block.x, 0);
-        grid.x = prop.multiProcessorCount * num_blocks_per_sm;
+      else if (args.memory_access == RANDOM_WARP) {
         cta_random_warp_streaming_read<float><<<grid, block, 0, task_stream>>>(
             (float*)uvm_alloc_ptr, allocation_size,
-            num_pages, page_size);
+            num_pages, args.page_size);
       }
     }
 
@@ -480,7 +529,7 @@ int main(int argc, char *argv[]) {
     float bw_meas = allocation_size / (1024.0f * 1024.0f * 1024.0f) / (kernel_time / 1000.0f );
     accum_bw += bw_meas;
 
-    if (is_P9 && uvm_behavior == PAGE_FAULT) {
+    if (is_P9 && args.uvm_behavior == PAGE_FAULT) {
      CUDA_CHECK(cudaFree(uvm_alloc_ptr));
     }
   }
@@ -490,9 +539,9 @@ int main(int argc, char *argv[]) {
   CUDA_CHECK(cudaStreamDestroy(task_stream));
 
   // avg time, comp bw, print numbers, avg bw per run or total run/total sizes??, avg kernel time, avg bw
-  printf("%s, %f ms, %f GB/s\n", header_string.c_str(), accum_kernel_time / loop_count, accum_bw / loop_count);
+  printf("%s, %f ms, %f GB/s\n", args.header_string.c_str(), accum_kernel_time / args.loop_count, accum_bw / args.loop_count);
 
-  if (!(is_P9 && uvm_behavior == PAGE_FAULT)) {
+  if (!(is_P9 && args.uvm_behavior == PAGE_FAULT)) {
     CUDA_CHECK(cudaFree(uvm_alloc_ptr));
   }
 
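Side note on the reported number (mine, not the author's): the printed GB/s comes from the bw_meas expression above, GiB touched divided by kernel seconds; for example, a 32 GiB working set read in 400 ms reports 80 GB/s. Factored out:

```
// The bandwidth expression from bw_meas above, factored out for clarity.
#include <cstddef>

float bw_gb_per_s(size_t bytes, float kernel_ms) {
  return bytes / (1024.0f * 1024.0f * 1024.0f) / (kernel_ms / 1000.0f);
}
// e.g. bw_gb_per_s(32ULL << 30, 400.0f) == 80.0f
```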
