PaddlePaddle
diff --git a/‎paddle/phi/api/yaml/backward.yaml‎
Lines changed: 2 additions & 2 deletions b/‎paddle/phi/api/yaml/backward.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎paddle/phi/api/yaml/op_compat.yaml‎
Lines changed: 1 addition & 1 deletion b/‎paddle/phi/api/yaml/op_compat.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎paddle/phi/api/yaml/ops.yaml‎
Lines changed: 1 addition & 1 deletion b/‎paddle/phi/api/yaml/ops.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎paddle/phi/backends/gpu/gpu_primitives.h‎
Lines changed: 176 additions & 0 deletions b/‎paddle/phi/backends/gpu/gpu_primitives.h‎
Lines changed: 176 additions & 0 deletions
diff --git a/‎paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc‎
Lines changed: 4 additions & 4 deletions b/‎paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc‎
Lines changed: 4 additions & 4 deletions
@@ -1760,8 +1760,8 @@
  optional : boxes_num
 
 - backward_op : put_along_axis_grad
- forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") -> Tensor(out)
- args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce)
+ forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) -> Tensor(out)
+ args : (Tensor arr, Tensor indices, Tensor values, Tensor out, Tensor out_grad, int axis, str reduce, bool include_self)
  output : Tensor(arr_grad), Tensor(values_grad)
  infer_meta :
  func : GeneralBinaryGradInferMeta
 
@@ -2432,7 +2432,7 @@
  outputs :
  out : Result
  attrs :
- {axis : Axis, reduce : Reduce}
+ {axis : Axis, reduce : Reduce, include_self: Include_self}
 
 - op : pylayer
  backward : pylayer_grad
 
@@ -2032,7 +2032,7 @@
  backward : psroi_pool_grad
 
 - op : put_along_axis
- args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign")
+ args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true)
  output : Tensor(out)
  infer_meta :
  func : UnchangedInferMeta
 
@@ -395,6 +395,182 @@ CUDA_ATOMIC_WRAPPER(Add, complex<double>) {
  CudaAtomicAdd(imag, val.imag));
 }
 
+// For atomicMul.
+CUDA_ATOMIC_WRAPPER(Mul, int) {
+ int res = *address, old = res; // NOLINT
+ do {
+ old = res;
+ res = atomicCAS(address, // NOLINT
+ old, // NOLINT
+ val * old); // NOLINT
+ } while (old != res);
+ return res;
+}
+
+CUDA_ATOMIC_WRAPPER(Mul, unsigned int) {
+ unsigned int res = *address, old = res; // NOLINT
+ do {
+ old = res;
+ res = atomicCAS(address, // NOLINT
+ old, // NOLINT
+ val * old); // NOLINT
+ } while (old != res);
+ return res;
+}
+// CUDA API uses unsigned long long int, we cannot use uint64_t here.
+// It because unsigned long long int is not necessarily uint64_t
+CUDA_ATOMIC_WRAPPER(Mul, unsigned long long int) { // NOLINT
+ unsigned long long int old = *address, assumed; // NOLINT
+
+ do {
+ assumed = old;
+ old = atomicCAS(address, assumed, val * assumed);
+ } while (assumed != old);
+ return old;
+}
+
+CUDA_ATOMIC_WRAPPER(Mul, int64_t) {
+ // Here, we check long long int must be int64_t.
+ static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT
+ "long long should be int64");
+ long long int res = *address, old = res; // NOLINT
+ do {
+ old = res;
+ res = (long long int)atomicCAS( // NOLINT
+ (unsigned long long int *)address, // NOLINT
+ (unsigned long long int)old, // NOLINT
+ (unsigned long long int)val * (unsigned long long int)old); // NOLINT
+ } while (old != res);
+ return res;
+}
+
+CUDA_ATOMIC_WRAPPER(Mul, float) {
+ int *const address_as_i = reinterpret_cast<int *>(address);
+ int old = *address_as_i, assumed;
+
+ do {
+ assumed = old;
+ old = atomicCAS(
+ address_as_i, assumed, __float_as_int(val * __int_as_float(assumed)));
+ } while (assumed != old);
+
+ return __int_as_float(old);
+}
+
+CUDA_ATOMIC_WRAPPER(Mul, double) {
+ unsigned long long int *const address_as_ull = // NOLINT
+ reinterpret_cast<unsigned long long int *>(address); // NOLINT
+ unsigned long long int old = *address_as_ull, assumed; // NOLINT
+
+ do {
+ assumed = old;
+
+ old = atomicCAS(address_as_ull,
+ assumed,
+ __double_as_longlong(val * __longlong_as_double(assumed)));
+ } while (assumed != old);
+
+ return __longlong_as_double(old);
+}
+
+#ifdef PADDLE_CUDA_FP16
+inline static __device__ uint32_t mul_to_low_half(uint32_t val, float x) {
+ phi::dtype::float16 low_half;
+ // The float16 in lower 16bits
+ low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+ low_half = static_cast<phi::dtype::float16>(static_cast<float>(low_half) * x);
+ return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t mul_to_high_half(uint32_t val, float x) {
+ phi::dtype::float16 high_half;
+ // The float16 in higher 16bits
+ high_half.x = static_cast<uint16_t>(val >> 16);
+ high_half =
+ static_cast<phi::dtype::float16>(static_cast<float>(high_half) * x);
+ return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::float16) {
+ if (*address >= val) {
+ return *address;
+ }
+ uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+ reinterpret_cast<char *>(address) -
+ (reinterpret_cast<uintptr_t>(address) & 0x02));
+ float val_f = static_cast<float>(val);
+ uint32_t old = *address_as_ui;
+ uint32_t assumed;
+ if (((uintptr_t)address & 0x02) == 0) {
+ // The float16 value stay at lower 16 bits of the address.
+ do {
+ assumed = old;
+ old = atomicCAS(address_as_ui, assumed, mul_to_low_half(assumed, val_f));
+ } while (old != assumed);
+ phi::dtype::float16 ret;
+ ret.x = old & 0xFFFFu;
+ return ret;
+ } else {
+ // The float16 value stay at higher 16 bits of the address.
+ do {
+ assumed = old;
+ old = atomicCAS(address_as_ui, assumed, mul_to_high_half(assumed, val_f));
+ } while (old != assumed);
+ phi::dtype::float16 ret;
+ ret.x = old >> 16;
+ return ret;
+ }
+}
+#endif
+
+inline static __device__ uint32_t bf16_mul_to_low_half(uint32_t val, float x) {
+ phi::dtype::bfloat16 low_half;
+ // The bfloat16 in lower 16bits
+ low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+ low_half =
+ static_cast<phi::dtype::bfloat16>(static_cast<float>(low_half) * x);
+ return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t bf16_mul_to_high_half(uint32_t val, float x) {
+ phi::dtype::bfloat16 high_half;
+ // The bfloat16 in higher 16bits
+ high_half.x = static_cast<uint16_t>(val >> 16);
+ high_half =
+ static_cast<phi::dtype::bfloat16>(static_cast<float>(high_half) * x);
+ return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::bfloat16) {
+ uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+ reinterpret_cast<char *>(address) -
+ (reinterpret_cast<uintptr_t>(address) & 0x02));
+ float val_f = static_cast<float>(val);
+ uint32_t old = *address_as_ui;
+ uint32_t assumed;
+ if (((uintptr_t)address & 0x02) == 0) {
+ // The bfloat16 value stay at lower 16 bits of the address.
+ do {
+ assumed = old;
+ old = atomicCAS(
+ address_as_ui, assumed, bf16_mul_to_low_half(assumed, val_f));
+ } while (old != assumed);
+ phi::dtype::bfloat16 ret;
+ ret.x = old & 0xFFFFu;
+ return ret;
+ } else {
+ // The bfloat16 value stay at higher 16 bits of the address.
+ do {
+ assumed = old;
+ old = atomicCAS(
+ address_as_ui, assumed, bf16_mul_to_high_half(assumed, val_f));
+ } while (old != assumed);
+ phi::dtype::bfloat16 ret;
+ ret.x = old >> 16;
+ return ret;
+ }
+}
+
 // For atomicMax
 USE_CUDA_ATOMIC(Max, int);
 USE_CUDA_ATOMIC(Max, unsigned int);
 
@@ -38,10 +38,10 @@ void CummaxGradKernel(const Context& dev_ctx,
  }
  if (dtype == DataType::INT32) {
  phi::funcs::cpu_scatter_add_kernel<T, int32_t>(
- *x_grad, axis, indices, out_grad, dev_ctx);
+ *x_grad, axis, indices, out_grad, true, dev_ctx);
  } else if (dtype == DataType::INT64) {
  phi::funcs::cpu_scatter_add_kernel<T, int64_t>(
- *x_grad, axis, indices, out_grad, dev_ctx);
+ *x_grad, axis, indices, out_grad, true, dev_ctx);
  }
 }
 
@@ -61,10 +61,10 @@ void CumminGradKernel(const Context& dev_ctx,
  }
  if (dtype == DataType::INT32) {
  phi::funcs::cpu_scatter_add_kernel<T, int32_t>(
- *x_grad, axis, indices, out_grad, dev_ctx);
+ *x_grad, axis, indices, out_grad, true, dev_ctx);
  } else if (dtype == DataType::INT64) {
  phi::funcs::cpu_scatter_add_kernel<T, int64_t>(
- *x_grad, axis, indices, out_grad, dev_ctx);
+ *x_grad, axis, indices, out_grad, true, dev_ctx);
  }
 }
Original file line number	Diff line number	Diff line change
`@@ -38,10 +38,10 @@ void CummaxGradKernel(const Context& dev_ctx,`
`38`	`38`	`}`
`39`	`39`	`if (dtype == DataType::INT32) {`
`40`	`40`	`phi::funcs::cpu_scatter_add_kernel<T, int32_t>(`
`41`		`- *x_grad, axis, indices, out_grad, dev_ctx);`
	`41`	`+ *x_grad, axis, indices, out_grad, true, dev_ctx);`
`42`	`42`	`} else if (dtype == DataType::INT64) {`
`43`	`43`	`phi::funcs::cpu_scatter_add_kernel<T, int64_t>(`
`44`		`- *x_grad, axis, indices, out_grad, dev_ctx);`
	`44`	`+ *x_grad, axis, indices, out_grad, true, dev_ctx);`
`45`	`45`	`}`
`46`	`46`	`}`
`47`	`47`
`@@ -61,10 +61,10 @@ void CumminGradKernel(const Context& dev_ctx,`
`61`	`61`	`}`
`62`	`62`	`if (dtype == DataType::INT32) {`
`63`	`63`	`phi::funcs::cpu_scatter_add_kernel<T, int32_t>(`
`64`		`- *x_grad, axis, indices, out_grad, dev_ctx);`
	`64`	`+ *x_grad, axis, indices, out_grad, true, dev_ctx);`
`65`	`65`	`} else if (dtype == DataType::INT64) {`
`66`	`66`	`phi::funcs::cpu_scatter_add_kernel<T, int64_t>(`
`67`		`- *x_grad, axis, indices, out_grad, dev_ctx);`
	`67`	`+ *x_grad, axis, indices, out_grad, true, dev_ctx);`
`68`	`68`	`}`
`69`	`69`	`}`
`70`	`70`