@@ -35,6 +35,106 @@ class ElementwiseMinKernel<platform::CUDADeviceContext, T>
}
};

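+// Fused min-grad functor: a single elementwise pass over (x, y, dout)
+// produces both gradients, so x and y are read only once. dx receives
+// dout where x < y; dy receives dout where x >= y.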
+template <typename InT, typename OutT>
+struct MinGradXYFunctor {
+  inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(
+      const InT& a,    // x
+      const InT& b,    // y
+      const InT& c) {  // dout
+    paddle::framework::Array<OutT, 2> outs;
+    // dx = dout * (x < y)
+    outs[0] = a < b ? c : static_cast<InT>(0);
+    // dy = dout * (x >= y)
+    outs[1] = (a > b || a == b) ? c : static_cast<InT>(0);
+    return outs;
+  }
+};
+
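+// Sums the dout-shaped gradient `src` over the broadcast dimensions
+// (derived by GetReduceDim from the operand/output dims and `axis`) and
+// writes the reduced result to `dst`, the gradient of operand `in`.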
+template <typename T>
+void ReduceWrapper(const platform::CUDADeviceContext& dev_ctx, int axis,
+                   const framework::Tensor* in, const framework::Tensor* out,
+                   framework::Tensor* src, framework::Tensor* dst) {
+  std::vector<int> reduce_dims = GetReduceDim(in->dims(), out->dims(), axis);
+  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      *src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());
+}
+
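+// Computes dx and/or dy for elementwise_min. Gradients whose shape already
+// matches dout are written directly by the elementwise kernel; broadcast
+// operands are written into a dout-shaped temporary first and then summed
+// back to the operand's shape with ReduceWrapper.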
+template <typename DeviceContext, typename T>
+void DefaultElementMinGrad(const framework::ExecutionContext& ctx,
+                           const framework::Tensor* x,
+                           const framework::Tensor* y,
+                           const framework::Tensor* out,
+                           const framework::Tensor* dout, framework::Tensor* dx,
+                           framework::Tensor* dy) {
+  int axis = ctx.Attr<int>("axis");
+  const auto& dev_ctx =
+      ctx.template device_context<platform::CUDADeviceContext>();
+  framework::Tensor tmp_dx;
+  framework::Tensor tmp_dy;
+  tmp_dx.mutable_data<T>(dout->dims(), ctx.GetPlace());
+  tmp_dy.mutable_data<T>(dout->dims(), ctx.GetPlace());
+
+  if (dx != nullptr && dy != nullptr) {
+    dx->mutable_data<T>(ctx.GetPlace());
+    dy->mutable_data<T>(ctx.GetPlace());
+    std::vector<const framework::Tensor*> ins = {x, y, dout};
+    std::vector<framework::Tensor*> outs;
+    if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) {
+      outs = {dx, dy};
+    } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) {
+      outs = {&tmp_dx, dy};
+    } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) {
+      outs = {dx, &tmp_dy};
+    } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) {
+      outs = {&tmp_dx, &tmp_dy};
+    }
+    auto functor = MinGradXYFunctor<T, T>();
+    LaunchElementwiseCudaKernel<ElementwiseType::kTernary, T, T,
+                                decltype(functor), 2>(dev_ctx, ins, &outs,
+                                                      axis, functor);
+    if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) {
+      ReduceWrapper<T>(dev_ctx, axis, x, out, &tmp_dx, dx);
+    } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) {
+      ReduceWrapper<T>(dev_ctx, axis, y, out, &tmp_dy, dy);
+    } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) {
+      ReduceWrapper<T>(dev_ctx, axis, x, out, &tmp_dx, dx);
+      ReduceWrapper<T>(dev_ctx, axis, y, out, &tmp_dy, dy);
+    }
+  } else if (dx != nullptr && dy == nullptr) {
+    dx->mutable_data<T>(ctx.GetPlace());
+    std::vector<const framework::Tensor*> ins = {x, y, dout};
+    std::vector<framework::Tensor*> outs;
+    if (dx->dims() != dout->dims()) {
+      outs = {&tmp_dx};
+    } else {
+      outs = {dx};
+    }
+
+    LaunchElementwiseCudaKernel<ElementwiseType::kTernary, T, T>(
+        dev_ctx, ins, &outs, axis, TernaryLessThanFunctor<T>());
+    if (dx->dims() != dout->dims()) {
+      ReduceWrapper<T>(dev_ctx, axis, x, out, &tmp_dx, dx);
+    }
+  } else if (dx == nullptr && dy != nullptr) {
+    dy->mutable_data<T>(ctx.GetPlace());
+    std::vector<const framework::Tensor*> ins = {x, y, dout};
+    std::vector<framework::Tensor*> outs;
+    if (dy->dims() != dout->dims()) {
+      outs = {&tmp_dy};
+    } else {
+      outs = {dy};
+    }
+
+    LaunchElementwiseCudaKernel<ElementwiseType::kTernary, T, T>(
+        dev_ctx, ins, &outs, axis, TernaryGreaterEqualThanFunctor<T>());
+    if (dy->dims() != dout->dims()) {
+      ReduceWrapper<T>(dev_ctx, axis, y, out, &tmp_dy, dy);
+    }
+  }
+}
+
137+ /*
38138template <typename DeviceContext, typename T>
39139void DefaultElementMinGrad(const framework::ExecutionContext& ctx,
40140 const framework::Tensor* x,
@@ -95,6 +195,7 @@ void DefaultElementMinGrad(const framework::ExecutionContext& ctx,
    }
  }
}
+*/

template <typename T>
class ElementwiseMinGradKernel<platform::CUDADeviceContext, T>