
Commit bd4471c

Replaced gpuAtomicAdd by fastAtomicAdd (#7596)
1 parent 6ccc712 commit bd4471c

File tree: 5 files changed, +95 −42 lines

  torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu
  torchvision/csrc/ops/cuda/ps_roi_align_kernel.cu
  torchvision/csrc/ops/cuda/ps_roi_pool_kernel.cu
  torchvision/csrc/ops/cuda/roi_align_kernel.cu
  torchvision/csrc/ops/cuda/roi_pool_kernel.cu
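All five backward kernels make the same substitution: the pointer-offset call gpuAtomicAdd(ptr + offset, value) from <ATen/cuda/Atomic.cuh> becomes at::native::fastAtomicAdd(base, index, numel, value, true) from <ATen/native/cuda/KernelUtils.cuh>, which takes the base pointer, a flat element index, and the total element count of the destination tensor (passed in below as grad_im_numel or memory_span). The element count is what allows the helper, for half-precision types, to use a packed atomic when the paired neighbouring element is known to be in bounds, and to fall back to an ordinary atomic add otherwise; the exact dispatch is internal to KernelUtils.cuh. The sketch below shows only the call-shape change; the kernel, tensor names, and shapes are made up for illustration and are not part of the diff.

    // Illustrative toy kernel, not from this commit: it shows the old and new
    // call shapes side by side. Only the fastAtomicAdd argument order
    // (base pointer, flat index, destination numel, value, fast-path flag)
    // mirrors the calls introduced by this change.
    #include <ATen/native/cuda/KernelUtils.cuh>

    template <typename scalar_t>
    __global__ void scatter_grad_example(
        const scalar_t* grad_out, // n incoming gradient values
        scalar_t* grad_in,        // destination gradient buffer
        int64_t n,
        int64_t grad_in_numel) {  // total element count of grad_in
      const int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i >= n)
        return;
      const int64_t pos = i; // flat destination index, computed per thread

      // Old form: offset the pointer, then add atomically.
      //   gpuAtomicAdd(grad_in + pos, grad_out[i]);

      // New form: base pointer + flat index + destination size; the final
      // `true` opts in to the packed half-precision fast path when available.
      at::native::fastAtomicAdd(grad_in, pos, grad_in_numel, grad_out[i], true);
    }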

torchvision/csrc/ops/cuda/deform_conv2d_kernel.cu

Lines changed: 9 additions & 2 deletions

@@ -70,7 +70,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/library.h>
-#include <ATen/cuda/Atomic.cuh>
+#include <ATen/native/cuda/KernelUtils.cuh>
 
 #include "cuda_helpers.h"
 
@@ -335,6 +335,8 @@ __global__ void deformable_col2im_kernel(
     index_t out_w,
     bool use_mask,
     scalar_t* grad_im) {
+  const index_t grad_im_numel = width * height * channels * batch_sz;
+
   CUDA_1D_KERNEL_LOOP_T(index, n, int64_t) {
     const index_t out_x = index % out_w;
     const index_t out_y = (index / out_w) % out_h;
@@ -381,7 +383,12 @@ __global__ void deformable_col2im_kernel(
             std::abs(y - yp) < 1 && std::abs(x - xp) < 1) {
           index_t grad_pos = ((b * channels + c) * height + yp) * width + xp;
           scalar_t weight = (1 - std::abs(y - yp)) * (1 - std::abs(x - xp));
-          gpuAtomicAdd(grad_im + grad_pos, mask_value * weight * col[index]);
+          at::native::fastAtomicAdd(
+              grad_im,
+              grad_pos,
+              grad_im_numel,
+              mask_value * weight * col[index],
+              true);
         }
       }
     }
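The deformable convolution kernel computes the destination size inside the kernel (grad_im_numel = width * height * channels * batch_sz), while the RoI kernels below receive it from the host as memory_span = grad_input.numel(). The size matters because, for at::Half, fastAtomicAdd can fold the update into a single packed 32-bit atomic covering two adjacent elements, and it has to know that the paired element is still inside the buffer. The following is a simplified standalone illustration of that idea, not PyTorch's implementation; it assumes the destination buffer is at least 4-byte aligned.

    // Simplified, standalone sketch (not PyTorch's code) of a packed
    // half-precision atomic add guarded by the destination's element count.
    #include <cuda_fp16.h>

    __device__ void packed_half_atomic_add_sketch(
        __half* tensor, // assumed at least 4-byte aligned
        long index,     // flat element index to update
        long numel,     // total number of elements in `tensor`
        __half value) {
      // Pair the target with its neighbour in the same aligned 32-bit word.
      const long low = index & ~1L;
      if (low + 1 < numel) {
        // Both lanes of the pair are in bounds: add a __half2 whose other
        // lane is zero, so only the target element changes.
        __half2* target = reinterpret_cast<__half2*>(tensor + low);
        const __half2 update = (index & 1L)
            ? __halves2half2(__float2half(0.f), value)  // target is the high lane
            : __halves2half2(value, __float2half(0.f)); // target is the low lane
        atomicAdd(target, update); // packed atomic, sm_60+
      } else {
        // Boundary case: the neighbour would be out of range, so fall back
        // to a scalar half atomic (sm_70+).
        atomicAdd(tensor + index, value);
      }
    }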

torchvision/csrc/ops/cuda/ps_roi_align_kernel.cu

Lines changed: 31 additions & 9 deletions

@@ -2,7 +2,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/library.h>
-#include <ATen/cuda/Atomic.cuh>
+#include <ATen/native/cuda/KernelUtils.cuh>
 
 #include "cuda_helpers.h"
 
@@ -212,7 +212,8 @@ __global__ void ps_roi_align_backward_kernel_impl(
     int sampling_ratio,
     int channels_out,
     T* grad_input,
-    const T* rois) {
+    const T* rois,
+    const int memory_span) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // (n, *, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
@@ -235,8 +236,6 @@ __global__ void ps_roi_align_backward_kernel_impl(
     T bin_size_w = roi_width / static_cast<T>(pooled_width);
 
     int c_in = channel_mapping[index];
-    T* grad_input_offset =
-        grad_input + (roi_batch_ind * channels + c_in) * height * width;
 
     // Do not using floor/ceil; this implementation detail is critical
     T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
@@ -252,6 +251,8 @@ __global__ void ps_roi_align_backward_kernel_impl(
         (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
     const T count = roi_bin_grid_h * roi_bin_grid_w;
 
+    const int offset = (roi_batch_ind * channels + c_in) * height * width;
+
     for (int iy = 0; iy < roi_bin_grid_h; iy++) {
       const T y = hstart +
           static_cast<T>(iy + .5f) * bin_size_h /
@@ -285,10 +286,30 @@ __global__ void ps_roi_align_backward_kernel_impl(
         T g4 = grad_output_this_bin * w4 / count;
 
         if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-          gpuAtomicAdd(grad_input_offset + y_low * width + x_low, g1);
-          gpuAtomicAdd(grad_input_offset + y_low * width + x_high, g2);
-          gpuAtomicAdd(grad_input_offset + y_high * width + x_low, g3);
-          gpuAtomicAdd(grad_input_offset + y_high * width + x_high, g4);
+          at::native::fastAtomicAdd(
+              grad_input,
+              offset + y_low * width + x_low,
+              memory_span,
+              static_cast<T>(g1),
+              true);
+          at::native::fastAtomicAdd(
+              grad_input,
+              offset + y_low * width + x_high,
+              memory_span,
+              static_cast<T>(g2),
+              true);
+          at::native::fastAtomicAdd(
+              grad_input,
+              offset + y_high * width + x_low,
+              memory_span,
+              static_cast<T>(g3),
+              true);
+          at::native::fastAtomicAdd(
+              grad_input,
+              offset + y_high * width + x_high,
+              memory_span,
+              static_cast<T>(g4),
+              true);
         } // if
       } // ix
     } // iy
@@ -430,7 +451,8 @@ at::Tensor ps_roi_align_backward_kernel(
         sampling_ratio,
         channels_out,
         grad_input.data_ptr<scalar_t>(),
-        rois_.data_ptr<scalar_t>());
+        rois_.data_ptr<scalar_t>(),
+        grad_input.numel());
   });
   AT_CUDA_CHECK(cudaGetLastError());
   return grad_input;
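In the RoI align backward kernels (this file and roi_align_kernel.cu below) the four bilinear corner contributions now repeat the same five-argument call. A hypothetical helper, not part of this commit, makes the shared pattern explicit; plane_offset and memory_span have the same meaning as `offset` and `memory_span` in the diff above.

    // Hypothetical wrapper, not part of the commit: scatter one bilinear
    // gradient contribution into grad_input via fastAtomicAdd.
    #include <ATen/native/cuda/KernelUtils.cuh>

    template <typename T>
    __device__ __forceinline__ void scatter_corner(
        T* grad_input,
        int plane_offset, // start of the (batch, channel) plane in grad_input
        int y,
        int x,
        int width,
        int memory_span,  // grad_input.numel(), as passed from the host
        T g) {
      at::native::fastAtomicAdd(
          grad_input, plane_offset + y * width + x, memory_span, g, true);
    }

    // The four calls in the kernel body would then read:
    //   scatter_corner(grad_input, offset, y_low,  x_low,  width, memory_span, static_cast<T>(g1));
    //   scatter_corner(grad_input, offset, y_low,  x_high, width, memory_span, static_cast<T>(g2));
    //   scatter_corner(grad_input, offset, y_high, x_low,  width, memory_span, static_cast<T>(g3));
    //   scatter_corner(grad_input, offset, y_high, x_high, width, memory_span, static_cast<T>(g4));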

torchvision/csrc/ops/cuda/ps_roi_pool_kernel.cu

Lines changed: 9 additions & 6 deletions

@@ -2,7 +2,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/library.h>
-#include <ATen/cuda/Atomic.cuh>
+#include <ATen/native/cuda/KernelUtils.cuh>
 
 #include "cuda_helpers.h"
 
@@ -91,7 +91,8 @@ __global__ void ps_roi_pool_backward_kernel_impl(
     int pooled_width,
     int channels_out,
     T* grad_input,
-    const T* rois) {
+    const T* rois,
+    const int memory_span) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // (n, *, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
@@ -124,14 +125,15 @@ __global__ void ps_roi_pool_backward_kernel_impl(
     bool is_empty = (hend <= hstart) || (wend <= wstart);
 
     int c_in = channel_mapping[index];
-    T* grad_input_offset =
-        grad_input + (roi_batch_ind * channels + c_in) * height * width;
     T bin_area = (hend - hstart) * (wend - wstart);
     T diff_val = is_empty ? static_cast<T>(0) : grad_output[index] / bin_area;
+
+    const int offset = (roi_batch_ind * channels + c_in) * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int grad_input_index = h * width + w;
-        gpuAtomicAdd(grad_input_offset + grad_input_index, diff_val);
+        at::native::fastAtomicAdd(
+            grad_input, offset + grad_input_index, memory_span, diff_val, true);
       }
     }
   }
@@ -269,7 +271,8 @@ at::Tensor ps_roi_pool_backward_kernel(
        pooled_width,
        channels_out,
        grad_input.data_ptr<scalar_t>(),
-        rois_.data_ptr<scalar_t>());
+        rois_.data_ptr<scalar_t>(),
+        grad_input.numel());
   });
  AT_CUDA_CHECK(cudaGetLastError());
  return grad_input;

torchvision/csrc/ops/cuda/roi_align_kernel.cu

Lines changed: 32 additions & 15 deletions

@@ -2,7 +2,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/library.h>
-#include <ATen/cuda/Atomic.cuh>
+#include <ATen/native/cuda/KernelUtils.cuh>
 
 #include "cuda_helpers.h"
 
@@ -218,7 +218,8 @@ __global__ void roi_align_backward_kernel_impl(
     int n_stride,
     int c_stride,
     int h_stride,
-    int w_stride) {
+    int w_stride,
+    const int memory_span) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
@@ -247,12 +248,9 @@ __global__ void roi_align_backward_kernel_impl(
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
-    T* offset_grad_input =
-        grad_input + ((roi_batch_ind * channels + c) * height * width);
-
     // We need to index the gradient using the tensor strides to access the
     // correct values.
-    int output_offset = n * n_stride + c * c_stride;
+    const int output_offset = n * n_stride + c * c_stride;
     const T* offset_grad_output = grad_output + output_offset;
     const T grad_output_this_bin =
         offset_grad_output[ph * h_stride + pw * w_stride];
@@ -267,6 +265,8 @@ __global__ void roi_align_backward_kernel_impl(
     // We do average (integral) pooling inside a bin
     const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
 
+    const int input_offset = (roi_batch_ind * channels + c) * height * width;
+
     for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
     {
       const T y = roi_start_h + ph * bin_size_h +
@@ -301,14 +301,30 @@ __global__ void roi_align_backward_kernel_impl(
         T g4 = grad_output_this_bin * w4 / count;
 
         if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-          gpuAtomicAdd(
-              offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
-          gpuAtomicAdd(
-              offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
-          gpuAtomicAdd(
-              offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
-          gpuAtomicAdd(
-              offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
+          at::native::fastAtomicAdd(
+              grad_input,
+              input_offset + y_low * width + x_low,
+              memory_span,
+              static_cast<T>(g1),
+              true);
+          at::native::fastAtomicAdd(
+              grad_input,
+              input_offset + y_low * width + x_high,
+              memory_span,
+              static_cast<T>(g2),
+              true);
+          at::native::fastAtomicAdd(
+              grad_input,
+              input_offset + y_high * width + x_low,
+              memory_span,
+              static_cast<T>(g3),
+              true);
+          at::native::fastAtomicAdd(
+              grad_input,
+              input_offset + y_high * width + x_high,
+              memory_span,
+              static_cast<T>(g4),
+              true);
         } // if
       } // ix
     } // iy
@@ -442,7 +458,8 @@ at::Tensor roi_align_backward_kernel(
         n_stride,
         c_stride,
         h_stride,
-        w_stride);
+        w_stride,
+        grad_input.numel());
   });
   AT_CUDA_CHECK(cudaGetLastError());
   return grad_input;

torchvision/csrc/ops/cuda/roi_pool_kernel.cu

Lines changed: 14 additions & 10 deletions

@@ -3,7 +3,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <float.h>
 #include <torch/library.h>
-#include <ATen/cuda/Atomic.cuh>
+#include <ATen/native/cuda/KernelUtils.cuh>
 
 #include "cuda_helpers.h"
 
@@ -94,7 +94,8 @@ __global__ void roi_pool_backward_kernel_impl(
     int n_stride,
     int c_stride,
     int h_stride,
-    int w_stride) {
+    int w_stride,
+    const int memory_span) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
@@ -104,19 +105,21 @@ __global__ void roi_pool_backward_kernel_impl(
 
     const T* offset_rois = rois + n * 5;
     int roi_batch_ind = offset_rois[0];
-    T* grad_input_offset =
-        grad_input + ((roi_batch_ind * channels + c) * height * width);
 
-    int output_offset = n * n_stride + c * c_stride;
+    const int output_offset = n * n_stride + c * c_stride;
     const int* argmax_data_offset =
         argmax_data + (n * channels + c) * pooled_height * pooled_width;
-    int argmax = argmax_data_offset[ph * pooled_width + pw];
+    const int argmax = argmax_data_offset[ph * pooled_width + pw];
+    const int offset = (roi_batch_ind * channels + c) * height * width;
 
     if (argmax != -1) {
-      gpuAtomicAdd(
-          grad_input_offset + argmax,
+      at::native::fastAtomicAdd(
+          grad_input,
+          offset + argmax,
+          memory_span,
           static_cast<T>(
-              grad_output[output_offset + ph * h_stride + pw * w_stride]));
+              grad_output[output_offset + ph * h_stride + pw * w_stride]),
+          true);
     }
   }
 }
@@ -253,7 +256,8 @@ at::Tensor roi_pool_backward_kernel(
         n_stride,
         c_stride,
         h_stride,
-        w_stride);
+        w_stride,
+        grad_input.numel());
   });
   AT_CUDA_CHECK(cudaGetLastError());
   return grad_input;
