Skip to content

Commit c5e857d

Browse files
authored
elementwise_mul refactor (#37471)
* elementwise_mul refactor * perfect code in test * delete redundant code * fix bugs when run test_multiply * adjust the location of macro * fix bugs when run ci
1 parent 0f24de8 commit c5e857d

File tree

16 files changed

+395
-123
lines changed

16 files changed

+395
-123
lines changed

paddle/fluid/operators/elementwise/elementwise_mul_op.cu

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ limitations under the License. */
1717
#include "paddle/fluid/platform/complex.h"
1818
#include "paddle/fluid/platform/float16.h"
1919

20+
// can only include the headers in paddle/pten/include dirs
21+
#include "paddle/pten/api/lib/utils/tensor_utils.h"
22+
#include "paddle/pten/include/core.h"
23+
#include "paddle/pten/include/math.h"
2024
namespace ops = paddle::operators;
2125
namespace plat = paddle::platform;
2226

@@ -28,15 +32,39 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
2832
: public framework::OpKernel<T> {
2933
public:
3034
void Compute(const framework::ExecutionContext& ctx) const override {
31-
framework::Tensor x_for_selectedrows;
32-
std::vector<const framework::Tensor*> ins;
33-
std::vector<framework::Tensor*> outs;
35+
auto x_var = ctx.InputVar("X");
36+
PADDLE_ENFORCE_EQ(x_var != nullptr, true,
37+
platform::errors::InvalidArgument(
38+
"Cannot get input Variable X, Variable name = %s.",
39+
ctx.InputName("X")));
3440
const auto& cuda_ctx =
3541
ctx.template device_context<platform::CUDADeviceContext>();
36-
37-
int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs, &x_for_selectedrows);
38-
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
39-
cuda_ctx, ins, &outs, axis, MulFunctor<T>());
42+
if (x_var->IsType<framework::SelectedRows>()) {
43+
framework::Tensor x_for_selectedrows;
44+
std::vector<const framework::Tensor*> ins;
45+
std::vector<framework::Tensor*> outs;
46+
int axis =
47+
PackTensorsIntoVector<T>(ctx, &ins, &outs, &x_for_selectedrows);
48+
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
49+
cuda_ctx, ins, &outs, axis, MulFunctor<T>());
50+
} else if (x_var->IsType<framework::LoDTensor>()) {
51+
auto* x_lod = ctx.Input<framework::LoDTensor>("X");
52+
auto* y_lod = ctx.Input<framework::LoDTensor>("Y");
53+
auto* z_lod = ctx.Output<framework::LoDTensor>("Out");
54+
z_lod->mutable_data<T>(ctx.GetPlace());
55+
56+
int axis = ctx.Attr<int>("axis");
57+
auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
58+
auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
59+
auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
60+
pten::ElementwiseMul<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
61+
pt_z.get());
62+
} else {
63+
PADDLE_THROW(platform::errors::InvalidArgument(
64+
"X's type[%s] is not supported by elementwise_op. X's type should be "
65+
"LoDTensor or SelectedRows.",
66+
framework::ToTypeName(x_var->Type())));
67+
}
4068
}
4169
};
4270

paddle/fluid/operators/elementwise/elementwise_mul_op.h

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,16 @@ limitations under the License. */
1515
#pragma once
1616

1717
#include <string>
18+
#include "paddle/fluid/framework/pten_utils.h"
1819
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
1920
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
2021
#include "paddle/fluid/operators/math/blas.h"
2122
#include "paddle/fluid/platform/cpu_info.h"
2223

24+
// can only include the headers in paddle/pten/include dirs
25+
#include "paddle/pten/api/lib/utils/tensor_utils.h"
26+
#include "paddle/pten/include/core.h"
27+
#include "paddle/pten/include/math.h"
2328
namespace paddle {
2429
namespace operators {
2530

@@ -106,24 +111,32 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
106111
out_sele->mutable_value()->Resize(x_sele.value().dims());
107112
out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type());
108113
z = ctx.Output<framework::SelectedRows>("Out")->mutable_value();
114+
z->mutable_data<T>(ctx.GetPlace());
115+
auto dims_equal = x.dims() == y->dims();
116+
if (dims_equal) {
117+
SameDimsElemwiseMul<DeviceContext, T> same_dims_mul;
118+
same_dims_mul(ctx, &x, y, z);
119+
} else {
120+
default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
121+
}
109122
} else if (x_var->IsType<framework::LoDTensor>()) {
110-
x = x_var->Get<framework::LoDTensor>();
111-
z = ctx.Output<framework::LoDTensor>("Out");
123+
auto* x_lod = ctx.Input<framework::LoDTensor>("X");
124+
auto* z_lod = ctx.Output<framework::LoDTensor>("Out");
125+
z_lod->mutable_data<T>(ctx.GetPlace());
126+
127+
auto& dev_ctx = ctx.device_context<DeviceContext>();
128+
int axis = ctx.Attr<int>("axis");
129+
auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
130+
auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
131+
auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
132+
pten::ElementwiseMul<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
133+
pt_z.get());
112134
} else {
113135
PADDLE_THROW(platform::errors::InvalidArgument(
114136
"X's type[%s] is not supported by elementwise_op. X's type should be "
115137
"LoDTensor or SelectedRows.",
116138
framework::ToTypeName(x_var->Type())));
117139
}
118-
119-
z->mutable_data<T>(ctx.GetPlace());
120-
auto dims_equal = x.dims() == y->dims();
121-
if (dims_equal) {
122-
SameDimsElemwiseMul<DeviceContext, T> same_dims_mul;
123-
same_dims_mul(ctx, &x, y, z);
124-
} else {
125-
default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
126-
}
127140
}
128141
};
129142
template <typename T>

paddle/fluid/operators/elementwise/elementwise_op.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
160160
{"axis"}, {"Out"});
161161
}
162162
}
163+
if (Type() == "elementwise_mul") {
164+
if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
165+
return framework::KernelSignature("elementwise_mul", {"X", "Y"},
166+
{"axis"}, {"Out"});
167+
}
168+
}
163169
return framework::KernelSignature("None", {"X"}, {}, {"Out"});
164170
}
165171
};

paddle/pten/api/include/math.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,8 @@ PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y);
2828
PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y);
2929

3030
PD_DLL_DECL Tensor divide(const Tensor& x, const Tensor& y);
31+
32+
PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y);
33+
3134
} // namespace experimental
3235
} // namespace paddle

paddle/pten/api/lib/math.cc

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,41 @@ PD_DLL_DECL Tensor divide(const Tensor& x, const Tensor& y) {
172172

173173
return out;
174174
}
175+
176+
PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y) {
177+
// 1. Get kernel signature and kernel
178+
auto kernel_key_set = ParseKernelKeyByInputArgs(x);
179+
auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
180+
auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
181+
"elementwise_mul", kernel_key);
182+
183+
// 2. Get Device Context
184+
auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
185+
auto kernel_context = pten::KernelContext(dev_ctx);
186+
187+
// 3. Auto data transform
188+
auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
189+
kernel_context.EmplaceBackInput(dense_x);
190+
auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
191+
kernel_context.EmplaceBackInput(dense_y);
192+
kernel_context.EmplaceBackAttr(-1);
193+
194+
// 4. InferShape
195+
auto out_meta = ElementwiseInferShape(dense_x->meta(), dense_y->meta(), -1);
196+
197+
// 5. Prepare outputs
198+
Tensor out;
199+
const auto allocator = std::make_shared<DefaultAllocator>(
200+
pten::TransToFluidPlace(kernel_key.backend()));
201+
auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
202+
kernel_context.EmplaceBackOutput(dense_out);
203+
out.set_impl(dense_out);
204+
205+
// 6. Call kernel
206+
kernel(&kernel_context);
207+
208+
return out;
209+
}
175210
} // namespace experimental
176211
} // namespace paddle
177212

paddle/pten/api/lib/utils/tensor_utils.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,9 +234,14 @@ void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
234234
const pten::TensorArgDef& arg_def,
235235
pten::DenseTensor* dst) {
236236
auto expected_place = pten::TransToFluidPlace(arg_def.backend);
237-
238237
if (variable.IsType<framework::LoDTensor>()) {
239238
const auto& tensor = variable.Get<framework::LoDTensor>();
239+
// check input dtype before ReMakePtenDenseTensor
240+
PADDLE_ENFORCE(
241+
(arg_def.dtype == pten::TransToPtenDataType(tensor.type())),
242+
paddle::platform::errors::InvalidArgument(
243+
"The type of input data is different from the type of the "
244+
"argument's definition in kernel."));
240245
if (!platform::is_same_place(tensor.place(), expected_place)) {
241246
framework::LoDTensor tmp_tensor;
242247
framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
@@ -248,6 +253,11 @@ void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
248253
// TODO(chenweihang): now we don't deal with row and height
249254
// by xiaowei's advice
250255
const auto& tensor = variable.Get<framework::SelectedRows>();
256+
PADDLE_ENFORCE(
257+
(arg_def.dtype == pten::TransToPtenDataType(tensor.value().type())),
258+
paddle::platform::errors::InvalidArgument(
259+
"The type of input data is different from the type of the "
260+
"argument's definition in kernel."));
251261
if (!platform::is_same_place(tensor.value().place(), expected_place)) {
252262
framework::Tensor tmp_tensor;
253263
TensorCopySync(tensor.value(), expected_place, &tmp_tensor);

paddle/pten/include/math.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,4 +115,18 @@ DenseTensor Divide(const ContextT& dev_ctx,
115115
ElementwiseDiv<T>(dev_ctx, x, y, axis, &dense_out);
116116
return dense_out;
117117
}
118+
119+
template <typename T, typename ContextT>
120+
DenseTensor Multiply(const ContextT& dev_ctx,
121+
const DenseTensor& x,
122+
const DenseTensor& y,
123+
int axis) {
124+
auto out_meta = ElementwiseInferShape(x.meta(), y.meta(), axis);
125+
const auto allocator =
126+
std::make_shared<paddle::experimental::DefaultAllocator>(
127+
dev_ctx.GetPlace());
128+
pten::DenseTensor dense_out(allocator, out_meta);
129+
ElementwiseMul<T>(dev_ctx, x, y, axis, &dense_out);
130+
return dense_out;
131+
}
118132
} // namespace pten

paddle/pten/kernels/cpu/math.cc

Lines changed: 20 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -64,56 +64,6 @@ void ScaleHost(const CPUContext& dev_ctx,
6464
out);
6565
}
6666

67-
template <typename T>
68-
void ElementwiseAdd(const CPUContext& dev_ctx,
69-
const DenseTensor& x,
70-
const DenseTensor& y,
71-
int axis,
72-
DenseTensor* out) {
73-
// allocate memory for out
74-
out->mutable_data<T>();
75-
76-
if (x.dims() == y.dims()) {
77-
SameDimsElementwiseCompute<general::SameDimsAddFunctor<CPUContext, T>>()(
78-
dev_ctx, x, y, out);
79-
} else {
80-
auto x_dims = x.dims();
81-
auto y_dims = y.dims();
82-
if (x_dims.size() >= y_dims.size()) {
83-
ElementwiseCompute<general::AddFunctor<T>, T>(
84-
dev_ctx, x, y, axis, general::AddFunctor<T>(), out);
85-
} else {
86-
ElementwiseCompute<general::InverseAddFunctor<T>, T>(
87-
dev_ctx, x, y, axis, general::InverseAddFunctor<T>(), out);
88-
}
89-
}
90-
}
91-
92-
template <typename T>
93-
void ElementwiseSub(const CPUContext& dev_ctx,
94-
const DenseTensor& x,
95-
const DenseTensor& y,
96-
int axis,
97-
DenseTensor* out) {
98-
// allocate memory for out
99-
out->mutable_data<T>();
100-
101-
if (x.dims() == y.dims()) {
102-
SameDimsElementwiseCompute<general::SameDimsSubFunctor<CPUContext, T>>()(
103-
dev_ctx, x, y, out);
104-
} else {
105-
auto x_dims = x.dims();
106-
auto y_dims = y.dims();
107-
if (x_dims.size() >= y_dims.size()) {
108-
ElementwiseCompute<general::SubFunctor<T>, T>(
109-
dev_ctx, x, y, axis, general::SubFunctor<T>(), out);
110-
} else {
111-
ElementwiseCompute<general::InverseSubFunctor<T>, T>(
112-
dev_ctx, x, y, axis, general::InverseSubFunctor<T>(), out);
113-
}
114-
}
115-
}
116-
11767
template <typename T>
11868
void ElementwiseDiv(const CPUContext& dev_ctx,
11969
const DenseTensor& x,
@@ -138,6 +88,15 @@ void ElementwiseDiv(const CPUContext& dev_ctx,
13888
}
13989
}
14090

91+
// Create the definition of ElementwiseAdd
92+
DEFINE_CPU_ELEMENTWISE_OP(Add)
93+
94+
// Create the definition of ElementwiseSub
95+
DEFINE_CPU_ELEMENTWISE_OP(Sub)
96+
97+
// Create the definition of ElementwiseMul
98+
DEFINE_CPU_ELEMENTWISE_OP(Mul)
99+
141100
} // namespace pten
142101

143102
// TODO(chenweihang): replace by better impl
@@ -208,3 +167,14 @@ PT_REGISTER_KERNEL("elementwise_div",
208167
int64_t,
209168
complex64,
210169
complex128) {}
170+
PT_REGISTER_KERNEL("elementwise_mul",
171+
CPU,
172+
ANY,
173+
pten::ElementwiseMul,
174+
float,
175+
double,
176+
int,
177+
int64_t,
178+
bool,
179+
complex64,
180+
complex128) {}

paddle/pten/kernels/cpu/math.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,36 @@ void ElementwiseDiv(const CPUContext& dev_ctx,
6666
const DenseTensor& y,
6767
int axis,
6868
DenseTensor* out);
69+
70+
template <typename T>
71+
void ElementwiseMul(const CPUContext& dev_ctx,
72+
const DenseTensor& x,
73+
const DenseTensor& y,
74+
int axis,
75+
DenseTensor* out);
6976
} // namespace pten
77+
78+
#define DEFINE_CPU_ELEMENTWISE_OP(name) \
79+
template <typename T> \
80+
void Elementwise##name(const CPUContext& dev_ctx, \
81+
const DenseTensor& x, \
82+
const DenseTensor& y, \
83+
int axis, \
84+
DenseTensor* out) { \
85+
out->mutable_data<T>(); \
86+
if (x.dims() == y.dims()) { \
87+
SameDimsElementwiseCompute< \
88+
general::SameDims##name##Functor<CPUContext, T>>()( \
89+
dev_ctx, x, y, out); \
90+
} else { \
91+
auto x_dims = x.dims(); \
92+
auto y_dims = y.dims(); \
93+
if (x_dims.size() >= y_dims.size()) { \
94+
ElementwiseCompute<general::name##Functor<T>, T>( \
95+
dev_ctx, x, y, axis, general::name##Functor<T>(), out); \
96+
} else { \
97+
ElementwiseCompute<general::Inverse##name##Functor<T>, T>( \
98+
dev_ctx, x, y, axis, general::Inverse##name##Functor<T>(), out); \
99+
} \
100+
} \
101+
}

0 commit comments

Comments
 (0)