Commit 0ebdd3b

Add cuda-vmm examples
1 parent 4c18ff8 commit 0ebdd3b

File tree

5 files changed: +881 −0 lines changed


posts/cuda-vmm/Makefile

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

NVCC ?= nvcc

all: vector_example sync_example

vector_example: vector_main.cpp cuvector.cpp
	$(NVCC) $^ -o $@ -lcuda -std=c++11

sync_example: sync_main.cu
	$(NVCC) $^ -o $@ -lcuda -std=c++11

clean:
	$(RM) vector_example sync_example
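
Build note (not part of the commit): both targets link against the CUDA driver API via -lcuda, so running `make` assumes a CUDA toolkit is installed with nvcc on the PATH; the compiler can be overridden with, e.g., `make NVCC=/usr/local/cuda/bin/nvcc`, and `make clean` removes the two binaries.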

posts/cuda-vmm/cuvector.cpp

Lines changed: 359 additions & 0 deletions
@@ -0,0 +1,359 @@
/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <cuda.h>
#include <assert.h>
#include "cuvector.h"

// **************
// VectorMemAlloc
// **************

namespace cuda_utils {

VectorMemAlloc::VectorMemAlloc(CUcontext context) : ctx(context), d_p(0ULL), alloc_sz(0ULL)
{
}

VectorMemAlloc::~VectorMemAlloc()
{
    (void)cuMemFree(d_p);
}

// Although we're not supposed to "commit" memory in a reserve call,
// doing so for this sample demonstrates why reserve is so important.
CUresult
VectorMemAlloc::reserve(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUdeviceptr new_ptr = 0ULL;
    CUcontext prev_ctx;

    if (new_sz <= alloc_sz) {
        return CUDA_SUCCESS;
    }
    (void)cuCtxGetCurrent(&prev_ctx);
    // Make sure we allocate on the correct context
    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
        return status;
    }
    // Allocate the bigger buffer
    if ((status = cuMemAlloc(&new_ptr, new_sz)) == CUDA_SUCCESS) {
        // Copy over to the bigger buffer. We'll explicitly use the per-thread
        // stream to ensure we don't add false dependencies on other threads
        // using the null stream, but we may have issues with other prior
        // work on this stream. Luckily, that's not the case in our sample.
        //
        // We only want to copy alloc_sz bytes here, as that's what's
        // actually committed at the moment.
        if ((status = cuMemcpyAsync(new_ptr, d_p, alloc_sz, CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
            // Free the smaller buffer. We don't need to synchronize
            // CU_STREAM_PER_THREAD, since cuMemFree synchronizes for us.
            (void)cuMemFree(d_p);
            d_p = new_ptr;
            alloc_sz = new_sz;
        }
        else {
            // Failed to populate the bigger buffer, so free it
            (void)cuMemFree(new_ptr);
        }
    }
    // Make sure to always return to the previous context the caller had
    (void)cuCtxSetCurrent(prev_ctx);

    return status;
}

// *********************
// VectorMemAllocManaged
// *********************

VectorMemAllocManaged::VectorMemAllocManaged(CUcontext context) : ctx(context), dev(CU_DEVICE_INVALID), d_p(0ULL),
                                                                  alloc_sz(0ULL), reserve_sz(0ULL)
{
    CUcontext prev_ctx;
    (void)cuCtxGetCurrent(&prev_ctx);
    if (cuCtxSetCurrent(context) == CUDA_SUCCESS) {
        (void)cuCtxGetDevice(&dev);
        (void)cuCtxSetCurrent(prev_ctx);
    }
}

VectorMemAllocManaged::~VectorMemAllocManaged()
{
    (void)cuMemFree(d_p);
}

CUresult
VectorMemAllocManaged::reserve(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUcontext prev_ctx;
    CUdeviceptr new_ptr = 0ULL;

    if (new_sz <= reserve_sz) {
        return CUDA_SUCCESS;
    }

    (void)cuCtxGetCurrent(&prev_ctx);
    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
        return status;
    }

    // Allocate the bigger buffer
    if ((status = cuMemAllocManaged(&new_ptr, new_sz, CU_MEM_ATTACH_GLOBAL)) == CUDA_SUCCESS) {
        // Set the preferred location for this managed allocation, to bias
        // any migration requests ("pinning" it under most circumstances to
        // the requested device)
        (void)cuMemAdvise(new_ptr, new_sz, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev);
        // Copy over to the bigger buffer. We'll explicitly use the per-thread
        // stream to ensure we don't add false dependencies on other threads
        // using the null stream, but we may have issues with other prior
        // work on this stream. Luckily, that's not the case in our sample.
        //
        // We only want to copy alloc_sz bytes here, as that's what's
        // actually committed at the moment.
        if (alloc_sz > 0) {
            if ((status = cuMemcpyAsync(new_ptr, d_p, alloc_sz, CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
                // Free the smaller buffer. We don't need to synchronize
                // CU_STREAM_PER_THREAD, since cuMemFree synchronizes for us.
                (void)cuMemFree(d_p);
            }
            else {
                // Failed to populate the bigger buffer, so free it
                (void)cuMemFree(new_ptr);
            }
        }
        if (status == CUDA_SUCCESS) {
            d_p = new_ptr;
            reserve_sz = new_sz;
        }
    }

    // Make sure to always return to the previous context the caller had
    (void)cuCtxSetCurrent(prev_ctx);

    return status;
}

// Actually commits the additional memory needed to back new_sz bytes
CUresult
VectorMemAllocManaged::grow(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUcontext prev_ctx;

    if (new_sz <= alloc_sz) {
        return CUDA_SUCCESS;
    }
    if ((status = reserve(new_sz)) != CUDA_SUCCESS) {
        return status;
    }

    (void)cuCtxGetCurrent(&prev_ctx);
    // Make sure we allocate on the correct context
    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
        return status;
    }
    // Actually commit the needed memory.
    // We explicitly use the per-thread stream here to ensure we're not
    // conflicting with other uses of the null stream from other threads.
    if ((status = cuMemPrefetchAsync(d_p + alloc_sz, (new_sz - alloc_sz), dev,
                                     CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
        // Not strictly necessary, but this ensures the prefetch is complete
        // and prevents future runtime faults. It also makes for a fairer
        // benchmark comparison.
        if ((status = cuStreamSynchronize(CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
            alloc_sz = new_sz;
        }
    }
    // Make sure to always return to the previous context the caller had
    (void)cuCtxSetCurrent(prev_ctx);
    return status;
}

// *********************
// VectorMemMap
// *********************

VectorMemMap::VectorMemMap(CUcontext context) : d_p(0ULL), prop(), handles(), alloc_sz(0ULL), reserve_sz(0ULL), chunk_sz(0ULL)
{
    CUdevice device;
    CUcontext prev_ctx;
    CUresult status = CUDA_SUCCESS;
    (void)status;

    status = cuCtxGetCurrent(&prev_ctx);
    assert(status == CUDA_SUCCESS);
    if (cuCtxSetCurrent(context) == CUDA_SUCCESS) {
        status = cuCtxGetDevice(&device);
        assert(status == CUDA_SUCCESS);
        status = cuCtxSetCurrent(prev_ctx);
        assert(status == CUDA_SUCCESS);
    }

    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = (int)device;
    prop.win32HandleMetaData = NULL;

    accessDesc.location = prop.location;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    status = cuMemGetAllocationGranularity(&chunk_sz, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    assert(status == CUDA_SUCCESS);
}

VectorMemMap::~VectorMemMap()
{
    CUresult status = CUDA_SUCCESS;
    (void)status;
    if (d_p != 0ULL) {
        status = cuMemUnmap(d_p, alloc_sz);
        assert(status == CUDA_SUCCESS);
        for (size_t i = 0ULL; i < va_ranges.size(); i++) {
            status = cuMemAddressFree(va_ranges[i].start, va_ranges[i].sz);
            assert(status == CUDA_SUCCESS);
        }
        for (size_t i = 0ULL; i < handles.size(); i++) {
            status = cuMemRelease(handles[i]);
            assert(status == CUDA_SUCCESS);
        }
    }
}

CUresult
VectorMemMap::reserve(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUdeviceptr new_ptr = 0ULL;

    if (new_sz <= reserve_sz) {
        return CUDA_SUCCESS;
    }

    const size_t aligned_sz = ((new_sz + chunk_sz - 1) / chunk_sz) * chunk_sz;

    // Try to reserve an address just after what we already have reserved
    status = cuMemAddressReserve(&new_ptr, (aligned_sz - reserve_sz), 0ULL, d_p + reserve_sz, 0ULL);

    if (status != CUDA_SUCCESS || (new_ptr != d_p + reserve_sz)) {
        if (new_ptr != 0ULL) {
            (void)cuMemAddressFree(new_ptr, (aligned_sz - reserve_sz));
        }
        // Slow path - try to find a new address reservation big enough for us
        status = cuMemAddressReserve(&new_ptr, aligned_sz, 0ULL, 0U, 0);
        if (status == CUDA_SUCCESS && d_p != 0ULL) {
            CUdeviceptr ptr = new_ptr;
            // Found one, now unmap our previous allocations
            status = cuMemUnmap(d_p, alloc_sz);
            assert(status == CUDA_SUCCESS);
            for (size_t i = 0ULL; i < handles.size(); i++) {
                const size_t hdl_sz = handle_sizes[i];
                // And remap them, enabling their access
                if ((status = cuMemMap(ptr, hdl_sz, 0ULL, handles[i], 0ULL)) != CUDA_SUCCESS)
                    break;
                if ((status = cuMemSetAccess(ptr, hdl_sz, &accessDesc, 1ULL)) != CUDA_SUCCESS)
                    break;
                ptr += hdl_sz;
            }
            if (status != CUDA_SUCCESS) {
                // Failed the mapping somehow... clean up!
                status = cuMemUnmap(new_ptr, aligned_sz);
                assert(status == CUDA_SUCCESS);
                status = cuMemAddressFree(new_ptr, aligned_sz);
                assert(status == CUDA_SUCCESS);
            }
            else {
                // Clean up our old VA reservations!
                for (size_t i = 0ULL; i < va_ranges.size(); i++) {
                    (void)cuMemAddressFree(va_ranges[i].start, va_ranges[i].sz);
                }
                va_ranges.clear();
            }
        }
        // Assuming everything went well, update everything
        if (status == CUDA_SUCCESS) {
            Range r;
            d_p = new_ptr;
            reserve_sz = aligned_sz;
            r.start = new_ptr;
            r.sz = aligned_sz;
            va_ranges.push_back(r);
        }
    }
    else {
        Range r;
        r.start = new_ptr;
        r.sz = aligned_sz - reserve_sz;
        va_ranges.push_back(r);
        if (d_p == 0ULL) {
            d_p = new_ptr;
        }
        reserve_sz = aligned_sz;
    }

    return status;
}

CUresult
VectorMemMap::grow(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUmemGenericAllocationHandle handle;
    if (new_sz <= alloc_sz) {
        return CUDA_SUCCESS;
    }

    const size_t size_diff = new_sz - alloc_sz;
    // Round up to the next chunk size
    const size_t sz = ((size_diff + chunk_sz - 1) / chunk_sz) * chunk_sz;

    if ((status = reserve(alloc_sz + sz)) != CUDA_SUCCESS) {
        return status;
    }

    if ((status = cuMemCreate(&handle, sz, &prop, 0)) == CUDA_SUCCESS) {
        if ((status = cuMemMap(d_p + alloc_sz, sz, 0ULL, handle, 0ULL)) == CUDA_SUCCESS) {
            if ((status = cuMemSetAccess(d_p + alloc_sz, sz, &accessDesc, 1ULL)) == CUDA_SUCCESS) {
                handles.push_back(handle);
                handle_sizes.push_back(sz);
                alloc_sz += sz;
            }
            if (status != CUDA_SUCCESS) {
                (void)cuMemUnmap(d_p + alloc_sz, sz);
            }
        }
        if (status != CUDA_SUCCESS) {
            (void)cuMemRelease(handle);
        }
    }

    return status;
}

} // namespace cuda_utils
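
A minimal usage sketch for the VMM-backed vector above (not part of this commit; it assumes cuvector.h declares cuda_utils::VectorMemMap exactly as defined in this file, constructed from a CUcontext and exposing a public CUresult grow(size_t)):

// usage_sketch.cpp - hypothetical driver, NOT part of this commit.
// Assumed build line: nvcc usage_sketch.cpp cuvector.cpp -lcuda -std=c++11
#include <cuda.h>
#include <cstdio>
#include "cuvector.h"

int main()
{
    CUdevice dev;
    CUcontext ctx;
    // Create a context the usual driver-API way
    if (cuInit(0) != CUDA_SUCCESS ||
        cuDeviceGet(&dev, 0) != CUDA_SUCCESS ||
        cuCtxCreate(&ctx, 0, dev) != CUDA_SUCCESS) {
        fprintf(stderr, "failed to create a CUDA context\n");
        return 1;
    }
    {
        cuda_utils::VectorMemMap vec(ctx);
        // Doubling growth: each grow() commits only the chunk-aligned tail
        // and extends the VA reservation in place when it can, so the base
        // address rarely moves and no copies are issued.
        for (size_t sz = 1 << 20; sz <= (1 << 26); sz <<= 1) {
            if (vec.grow(sz) != CUDA_SUCCESS) {
                fprintf(stderr, "grow(%zu) failed\n", sz);
                break;
            }
        }
    } // ~VectorMemMap unmaps, frees the VA ranges, and releases the handles
    (void)cuCtxDestroy(ctx);
    return 0;
}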
