52 changes: 52 additions & 0 deletions H3-Multi-GPU-parallelization/Instructions.md
@@ -0,0 +1,52 @@
# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale

- Time: Sunday, 14 November 2021 8AM - 5PM CST
- Location: *online*
- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188


## Hands-On 5: Multi-GPU Parallelization with CUDA-aware MPI

## Task 1: Parallelize Jacobi Solver for Multiple GPUs using CUDA-aware MPI

#### Description
The purpose of this task is to use CUDA-aware MPI to parallelize a Jacobi solver. The starting point is the skeleton `jacobi.cu`, in which the CUDA kernel and some basic setup functions are already defined.
A single-GPU version is also provided; performance and numerical results are compared against it.
Take some time to get familiar with the code. Some features (e.g. the NVTX `PUSH` and `POP` macros) will be explained in the next tutorial and can be ignored for now.
Once you are familiar with the code, work on the `TODO`s in `jacobi.cu`:

- Initialize the MPI application (see the first sketch below)
  - Include the MPI header file
  - Determine the local rank and the number of MPI processes
  - Query the number of GPUs visible to the calling process
  - Use a local communicator to assign one GPU to each MPI process
  - Finalize MPI at the end of the application
- Compute the 1-D domain decomposition (see the second sketch below)
  - Compute the local chunk size so that the (ny-2) interior rows are distributed among the processes
  - In case `(ny-2)%size != 0`, the last process should calculate the remaining rows
  - Determine the global (`iy_start_global`, `iy_end_global`) and local (`iy_start`, `iy_end`) start and end points in the 2-dimensional grid
- Use MPI to exchange the boundaries (see the third sketch below)
  - Compute the top and the bottom neighbor
  - We use reflecting/periodic boundaries on top and bottom, so rank 0's top neighbor is rank (size-1) and rank (size-1)'s bottom neighbor is rank 0
  - Use `MPI_Sendrecv` to exchange data between the neighbors
  - Use the self-defined `MPI_REAL_TYPE`; this allows an easy switch between single and double precision
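
A minimal sketch of the MPI setup and GPU assignment, assuming the skeleton's usual `main(argc, argv)` entry point; all names here are illustrative and not prescribed by `jacobi.cu`:

``` {.cpp}
// Sketch only: initialize MPI, then use a node-local communicator to map
// one GPU to each MPI process.
#include <mpi.h>
#include <cuda_runtime.h>

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Node-local communicator: ranks on the same node share it, so the
    // local rank can be used to select a GPU.
    MPI_Comm local_comm;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank,
                        MPI_INFO_NULL, &local_comm);
    int local_rank = 0;
    MPI_Comm_rank(local_comm, &local_rank);
    MPI_Comm_free(&local_comm);

    // Query the GPUs visible to this process and pick one per rank.
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    cudaSetDevice(local_rank % num_devices);

    // ... solver ...

    MPI_Finalize();
    return 0;
}
```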
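
For the naive 1-D decomposition, the arithmetic could look like the following fragment; whether `iy_end` is inclusive or exclusive must match the conventions used in the skeleton:

``` {.cpp}
// Naive decomposition of the (ny - 2) interior rows; the last rank picks up the remainder.
int chunk_size = (ny - 2) / size;
if (rank == size - 1) chunk_size += (ny - 2) % size;

// Global rows owned by this rank (row 0 and row ny-1 are the fixed boundary rows).
int iy_start_global = rank * ((ny - 2) / size) + 1;
int iy_end_global   = iy_start_global + chunk_size - 1;

// Local indexing: row 0 and row chunk_size + 1 hold the halo rows.
int iy_start = 1;
int iy_end   = iy_start + chunk_size;  // exclusive end of the interior rows
```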
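
The neighbor computation and halo exchange might then be written as sketched below; `a_new`, `nx`, `iy_start`, and `iy_end` are assumed names from the skeleton and may differ. With CUDA-aware MPI the buffers passed to `MPI_Sendrecv` can point to device memory directly:

``` {.cpp}
// Periodic boundaries in y: rank 0 wraps around to the last rank and vice versa.
const int top    = rank > 0        ? rank - 1 : size - 1;
const int bottom = rank < size - 1 ? rank + 1 : 0;

// Send the first interior row up and receive the bottom halo row, then
// send the last interior row down and receive the top halo row.
MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top,    0,
             a_new + iy_end * nx,   nx, MPI_REAL_TYPE, bottom, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0,
             a_new,                     nx, MPI_REAL_TYPE, top,    0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
```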

Compile with

``` {.bash}
make
```

Submit your compiled application to the batch system with

``` {.bash}
make run
```

## Advanced Task: Optimize Load Balancing

- The work distribution of the first task is not ideal: the process with the last rank may have to calculate significantly more rows than all the others. In this task the load distribution is therefore optimized (see the sketch after this list).
- Compute the `chunk_size` such that each rank gets either `(ny - 2) / size` or `(ny - 2) / size + 1` rows.
- Compute how many processes get `(ny - 2) / size` resp. `(ny - 2) / size + 1` rows.
- Adapt the computation of `iy_start_global`.
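
A possible way to compute the balanced distribution is sketched below; the variable names (`chunk_size_low`, `num_ranks_low`) are illustrative, only the arithmetic matters:

``` {.cpp}
// Each rank gets either chunk_size_low or chunk_size_low + 1 rows so that the
// row counts add up to exactly (ny - 2) interior rows.
int chunk_size_low  = (ny - 2) / size;
int chunk_size_high = chunk_size_low + 1;

// Number of ranks with the smaller chunk, chosen so that
// num_ranks_low * chunk_size_low + (size - num_ranks_low) * chunk_size_high == ny - 2
int num_ranks_low = size * chunk_size_low + size - (ny - 2);

int chunk_size = (rank < num_ranks_low) ? chunk_size_low : chunk_size_high;

// Global start row: the first num_ranks_low ranks contribute chunk_size_low rows
// each, all later ranks contribute chunk_size_high rows each.
int iy_start_global =
    (rank < num_ranks_low)
        ? rank * chunk_size_low + 1
        : num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
```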
40 changes: 40 additions & 0 deletions H3-Multi-GPU-parallelization/Makefile
@@ -0,0 +1,40 @@
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
NP ?= 4
NVCC=nvcc
JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4
CUDA_HOME ?= /usr/local/cuda
ifndef MPI_HOME
$(error MPI_HOME is not set)
endif
GENCODE_SM30 := -gencode arch=compute_30,code=sm_30
GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
GENCODE_SM37 := -gencode arch=compute_37,code=sm_37
GENCODE_SM50 := -gencode arch=compute_50,code=sm_50
GENCODE_SM52 := -gencode arch=compute_52,code=sm_52
GENCODE_SM60 := -gencode arch=compute_60,code=sm_60
GENCODE_SM70 := -gencode arch=compute_70,code=sm_70
GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80)
ifdef DISABLE_CUB
NVCC_FLAGS = -Xptxas --optimize-float-atomics
else
NVCC_FLAGS = -DHAVE_CUB
endif
NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(MPI_HOME)/include
NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME) -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt
jacobi: Makefile jacobi.cu
	$(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o
	$(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS)

.PHONY: clean
clean:
	rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log

sanitize: jacobi
	$(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10

run: jacobi
	$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi

profile: jacobi
	$(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10
51 changes: 51 additions & 0 deletions H3-Multi-GPU-parallelization/copy.mk
@@ -0,0 +1,51 @@
#!/usr/bin/make -f
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
TASKDIR = ../../tasks/H3-Multi-GPU-parallelization
SOLUTIONDIR = ../../solutions/H3-Multi-GPU-parallelization
OPT_SOLUTIONDIR = ../../solutions/H3-Multi-GPU-parallelization_opt

PROCESSFILES = jacobi.cu
COPYFILES = Makefile Instructions.ipynb Instructions.md


TASKPROCCESFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES))
TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES))
SOLUTIONPROCCESFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES))
OPT_SOLUTIONPROCCESFILES = $(addprefix $(OPT_SOLUTIONDIR)/,$(PROCESSFILES))
SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES))
OPT_SOLUTIONCOPYFILES = $(addprefix $(OPT_SOLUTIONDIR)/,$(COPYFILES))


.PHONY: all task
all: task
task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} ${OPT_SOLUTIONPROCCESFILES} ${OPT_SOLUTIONCOPYFILES}


${TASKPROCCESFILES}: $(PROCESSFILES)
	mkdir -p $(TASKDIR)/
	cppp -USOLUTION -USOLUTION_OPT $(notdir $@) $@

${SOLUTIONPROCCESFILES}: $(PROCESSFILES)
	mkdir -p $(SOLUTIONDIR)/
	cppp -DSOLUTION -USOLUTION_OPT $(notdir $@) $@

${OPT_SOLUTIONPROCCESFILES}: $(PROCESSFILES)
	mkdir -p $(OPT_SOLUTIONDIR)/
	cppp -DSOLUTION -DSOLUTION_OPT $(notdir $@) $@

${TASKCOPYFILES}: $(COPYFILES)
	mkdir -p $(TASKDIR)/
	cp $(notdir $@) $@

${SOLUTIONCOPYFILES}: $(COPYFILES)
	mkdir -p $(SOLUTIONDIR)/
	cp $(notdir $@) $@

${OPT_SOLUTIONCOPYFILES}: $(COPYFILES)
	mkdir -p $(OPT_SOLUTIONDIR)/
	cp $(notdir $@) $@

%.ipynb: %.md
	pandoc $< -o $@
	# add metadata so this is seen as python
	jq -s '.[0] * .[1]' $@ ../template.json | sponge $@