/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <cuda.h>
#include <assert.h>
#include "cuvector.h"

// **************
// VectorMemAlloc
// **************

namespace cuda_utils {

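// VectorMemAlloc grows by allocating a whole new, larger buffer with
// cuMemAlloc, copying the committed data into it, and freeing the old
// buffer. Since cuMemAlloc both reserves and commits memory, every resize
// pays for a full device-to-device copy of the existing data.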
VectorMemAlloc::VectorMemAlloc(CUcontext context) : ctx(context), d_p(0ULL), alloc_sz(0ULL)
{

}

VectorMemAlloc::~VectorMemAlloc()
{
    (void)cuMemFree(d_p);
}

// Although we're not supposed to "commit" memory in a reserve call,
// doing so for this sample demonstrates why reserve is so important
CUresult
VectorMemAlloc::reserve(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUdeviceptr new_ptr = 0ULL;
    CUcontext prev_ctx;

    if (new_sz <= alloc_sz) {
        return CUDA_SUCCESS;
    }
    (void)cuCtxGetCurrent(&prev_ctx);
    // Make sure we allocate on the correct context
    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
        return status;
    }
    // Allocate the bigger buffer
    if ((status = cuMemAlloc(&new_ptr, new_sz)) == CUDA_SUCCESS) {
        // Copy the old data into the bigger buffer. We explicitly use the
        // per-thread stream to ensure we don't add false dependencies on
        // other threads using the null stream, but we may have issues with
        // other prior work on this stream. Luckily, that's not the case in
        // our sample.
        //
        // We only copy alloc_sz bytes here, as that's what is actually
        // committed at the moment.
        if ((status = cuMemcpyAsync(new_ptr, d_p, alloc_sz, CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
            // Free the smaller buffer. We don't need to synchronize
            // CU_STREAM_PER_THREAD, since cuMemFree synchronizes for us
            (void)cuMemFree(d_p);
            d_p = new_ptr;
            alloc_sz = new_sz;
        }
        else {
            // Failed to copy into the bigger buffer, so free it and keep
            // the smaller one
            (void)cuMemFree(new_ptr);
        }
    }
    // Make sure to always return to the previous context the caller had
    (void)cuCtxSetCurrent(prev_ctx);

    return status;
}

// *********************
// VectorMemAllocManaged
// *********************

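// VectorMemAllocManaged uses managed memory to split reservation from
// commitment: reserve() allocates a larger managed buffer (copying any
// committed data over and advising a preferred location), while grow()
// commits pages by prefetching the newly needed region to the device.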
VectorMemAllocManaged::VectorMemAllocManaged(CUcontext context) : ctx(context), dev(CU_DEVICE_INVALID), d_p(0ULL),
    alloc_sz(0ULL), reserve_sz(0ULL)
{
    CUcontext prev_ctx;
    (void)cuCtxGetCurrent(&prev_ctx);
    if (cuCtxSetCurrent(context) == CUDA_SUCCESS) {
        (void)cuCtxGetDevice(&dev);
        (void)cuCtxSetCurrent(prev_ctx);
    }
}

VectorMemAllocManaged::~VectorMemAllocManaged()
{
    (void)cuMemFree(d_p);
}

CUresult
VectorMemAllocManaged::reserve(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUcontext prev_ctx;
    CUdeviceptr new_ptr = 0ULL;

    if (new_sz <= reserve_sz) {
        return CUDA_SUCCESS;
    }

    (void)cuCtxGetCurrent(&prev_ctx);
    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
        return status;
    }

    // Allocate the bigger buffer
    if ((status = cuMemAllocManaged(&new_ptr, new_sz, CU_MEM_ATTACH_GLOBAL)) == CUDA_SUCCESS) {
        // Set the preferred location for this managed allocation, to bias
        // any migration requests ("pinning" it under most circumstances to
        // the requested device)
        (void)cuMemAdvise(new_ptr, new_sz, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev);
        // Copy the old data into the bigger buffer. We explicitly use the
        // per-thread stream to ensure we don't add false dependencies on
        // other threads using the null stream, but we may have issues with
        // other prior work on this stream. Luckily, that's not the case in
        // our sample.
        //
        // We only copy alloc_sz bytes here, as that's what is actually
        // committed at the moment.
        if (alloc_sz > 0) {
            if ((status = cuMemcpyAsync(new_ptr, d_p, alloc_sz, CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
                // Free the smaller buffer. We don't need to synchronize
                // CU_STREAM_PER_THREAD, since cuMemFree synchronizes for us
                (void)cuMemFree(d_p);
            }
            else {
                // Failed to copy into the bigger buffer, so free it and keep
                // the smaller one
                (void)cuMemFree(new_ptr);
            }
        }
        if (status == CUDA_SUCCESS) {
            d_p = new_ptr;
            reserve_sz = new_sz;
        }
    }

    // Make sure to always return to the previous context the caller had
    (void)cuCtxSetCurrent(prev_ctx);

    return status;
}

// Actually commits the additional memory needed to back new_sz bytes
CUresult
VectorMemAllocManaged::grow(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUcontext prev_ctx;

    if (new_sz <= alloc_sz) {
        return CUDA_SUCCESS;
    }
    if ((status = reserve(new_sz)) != CUDA_SUCCESS) {
        return status;
    }

    (void)cuCtxGetCurrent(&prev_ctx);
    // Make sure we allocate on the correct context
    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
        return status;
    }
    // Actually commit the needed memory.
    // We explicitly use the per-thread stream here to ensure we're not
    // conflicting with other uses of the null stream from other threads
    if ((status = cuMemPrefetchAsync(d_p + alloc_sz, (new_sz - alloc_sz), dev,
            CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
        // Not strictly necessary, but this ensures the prefetch is complete
        // and prevents future runtime faults. It also makes for a fairer
        // benchmark comparison.
        if ((status = cuStreamSynchronize(CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
            alloc_sz = new_sz;
        }
    }
    // Make sure to always return to the previous context the caller had
    (void)cuCtxSetCurrent(prev_ctx);
    return status;
}

// ************
// VectorMemMap
// ************

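// VectorMemMap uses the CUDA virtual memory management APIs directly:
// reserve() extends the virtual address range with cuMemAddressReserve
// (remapping the existing physical handles only when the range cannot be
// extended in place), and grow() commits physical memory with cuMemCreate,
// maps it at the end of the range with cuMemMap, and enables access with
// cuMemSetAccess. Growing therefore never copies the existing data.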
VectorMemMap::VectorMemMap(CUcontext context) : d_p(0ULL), prop(), handles(), alloc_sz(0ULL), reserve_sz(0ULL), chunk_sz(0ULL)
{
    CUdevice device = CU_DEVICE_INVALID;
    CUcontext prev_ctx;
    CUresult status = CUDA_SUCCESS;
    (void)status;

    status = cuCtxGetCurrent(&prev_ctx);
    assert(status == CUDA_SUCCESS);
    if (cuCtxSetCurrent(context) == CUDA_SUCCESS) {
        status = cuCtxGetDevice(&device);
        assert(status == CUDA_SUCCESS);
        status = cuCtxSetCurrent(prev_ctx);
        assert(status == CUDA_SUCCESS);
    }

    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = (int)device;
    prop.win32HandleMetaData = NULL;

    accessDesc.location = prop.location;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

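    // chunk_sz is the minimum granularity the driver supports for physical
    // allocations with these properties; all reservation and commit sizes
    // below are rounded up to a multiple of it.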
    status = cuMemGetAllocationGranularity(&chunk_sz, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    assert(status == CUDA_SUCCESS);
}

VectorMemMap::~VectorMemMap()
{
    CUresult status = CUDA_SUCCESS;
    (void)status;
    if (d_p != 0ULL) {
        status = cuMemUnmap(d_p, alloc_sz);
        assert(status == CUDA_SUCCESS);
        for (size_t i = 0ULL; i < va_ranges.size(); i++) {
            status = cuMemAddressFree(va_ranges[i].start, va_ranges[i].sz);
            assert(status == CUDA_SUCCESS);
        }
        for (size_t i = 0ULL; i < handles.size(); i++) {
            status = cuMemRelease(handles[i]);
            assert(status == CUDA_SUCCESS);
        }
    }
}

CUresult
VectorMemMap::reserve(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUdeviceptr new_ptr = 0ULL;

    if (new_sz <= reserve_sz) {
        return CUDA_SUCCESS;
    }

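    // Round the requested size up to the allocation granularity queried in
    // the constructor; the physical chunks created in grow() (and the VA
    // ranges backing them) are sized in multiples of chunk_sz.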
    const size_t aligned_sz = ((new_sz + chunk_sz - 1) / chunk_sz) * chunk_sz;

    // Try to reserve an address just after what we already have reserved
    status = cuMemAddressReserve(&new_ptr, (aligned_sz - reserve_sz), 0ULL, d_p + reserve_sz, 0ULL);
    if (status != CUDA_SUCCESS || (new_ptr != d_p + reserve_sz)) {
        if (new_ptr != 0ULL) {
            (void)cuMemAddressFree(new_ptr, (aligned_sz - reserve_sz));
        }
        // Slow path - try to find a new address reservation big enough for us
        status = cuMemAddressReserve(&new_ptr, aligned_sz, 0ULL, 0U, 0);
        if (status == CUDA_SUCCESS && d_p != 0ULL) {
            CUdeviceptr ptr = new_ptr;
            // Found one, now unmap our previous allocations
            status = cuMemUnmap(d_p, alloc_sz);
            assert(status == CUDA_SUCCESS);
            for (size_t i = 0ULL; i < handles.size(); i++) {
                const size_t hdl_sz = handle_sizes[i];
                // And remap them, enabling their access
                if ((status = cuMemMap(ptr, hdl_sz, 0ULL, handles[i], 0ULL)) != CUDA_SUCCESS)
                    break;
                if ((status = cuMemSetAccess(ptr, hdl_sz, &accessDesc, 1ULL)) != CUDA_SUCCESS)
                    break;
                ptr += hdl_sz;
            }
            if (status != CUDA_SUCCESS) {
                // Failed the mapping somehow... clean up! Use a separate
                // variable so the original failure status is still returned
                // to the caller.
                CUresult cleanup_status = cuMemUnmap(new_ptr, aligned_sz);
                assert(cleanup_status == CUDA_SUCCESS);
                cleanup_status = cuMemAddressFree(new_ptr, aligned_sz);
                assert(cleanup_status == CUDA_SUCCESS);
                (void)cleanup_status;
            }
            else {
                // Clean up our old VA reservations!
                for (size_t i = 0ULL; i < va_ranges.size(); i++) {
                    (void)cuMemAddressFree(va_ranges[i].start, va_ranges[i].sz);
                }
                va_ranges.clear();
            }
        }
        // Assuming everything went well, update everything
        if (status == CUDA_SUCCESS) {
            Range r;
            d_p = new_ptr;
            reserve_sz = aligned_sz;
            r.start = new_ptr;
            r.sz = aligned_sz;
            va_ranges.push_back(r);
        }
    }
    else {
        Range r;
        r.start = new_ptr;
        r.sz = aligned_sz - reserve_sz;
        va_ranges.push_back(r);
        if (d_p == 0ULL) {
            d_p = new_ptr;
        }
        reserve_sz = aligned_sz;
    }

    return status;
}

CUresult
VectorMemMap::grow(size_t new_sz)
{
    CUresult status = CUDA_SUCCESS;
    CUmemGenericAllocationHandle handle;
    if (new_sz <= alloc_sz) {
        return CUDA_SUCCESS;
    }

    const size_t size_diff = new_sz - alloc_sz;
    // Round up to the next chunk size
    const size_t sz = ((size_diff + chunk_sz - 1) / chunk_sz) * chunk_sz;

    if ((status = reserve(alloc_sz + sz)) != CUDA_SUCCESS) {
        return status;
    }

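    // Commit in three steps: create the physical allocation, map it at the
    // end of the reserved range, then enable read/write access for the
    // device described by accessDesc.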
    if ((status = cuMemCreate(&handle, sz, &prop, 0)) == CUDA_SUCCESS) {
        if ((status = cuMemMap(d_p + alloc_sz, sz, 0ULL, handle, 0ULL)) == CUDA_SUCCESS) {
            if ((status = cuMemSetAccess(d_p + alloc_sz, sz, &accessDesc, 1ULL)) == CUDA_SUCCESS) {
                handles.push_back(handle);
                handle_sizes.push_back(sz);
                alloc_sz += sz;
            }
            if (status != CUDA_SUCCESS) {
                (void)cuMemUnmap(d_p + alloc_sz, sz);
            }
        }
        if (status != CUDA_SUCCESS) {
            (void)cuMemRelease(handle);
        }
    }

    return status;
}

}   // namespace cuda_utils
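
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original sample, so it is
// guarded out of the build). It relies only on the interface visible above --
// a constructor taking a CUcontext plus reserve()/grow() -- and assumes a
// context created with the plain driver API; adjust to the real cuvector.h
// declarations as needed.
// ---------------------------------------------------------------------------
#if 0
#include <cstdio>

int main()
{
    CUdevice dev = 0;
    CUcontext ctx = NULL;
    if (cuInit(0) != CUDA_SUCCESS ||
            cuDeviceGet(&dev, 0) != CUDA_SUCCESS ||
            cuCtxCreate(&ctx, 0, dev) != CUDA_SUCCESS) {
        std::printf("Failed to create a CUDA context\n");
        return 1;
    }
    {
        cuda_utils::VectorMemMap vec(ctx);
        // Reserve a large virtual address range up front...
        CUresult status = vec.reserve(64ULL << 20);
        // ...then commit physical memory in smaller steps as needed.
        if (status == CUDA_SUCCESS) status = vec.grow(1ULL << 20);
        if (status == CUDA_SUCCESS) status = vec.grow(16ULL << 20);
        std::printf("reserve/grow %s\n",
                    status == CUDA_SUCCESS ? "succeeded" : "failed");
    }   // ~VectorMemMap unmaps, releases the handles, and frees the VA ranges
    (void)cuCtxDestroy(ctx);
    return 0;
}
#endif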