
Commit 50e664a

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into transpose_conv1d
test=develop
2 parents f4e4c4d + 6b28456

89 files changed: +6552 additions, −1586 deletions


paddle/fluid/operators/activation_op.cc

Lines changed: 4 additions & 4 deletions

@@ -781,8 +781,8 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
   }
 };

-// leaky_relu Grad: dx=dy if y>=0 else alpha * dy
-// leaky_relu GradGrad: ddy=ddx if y>=0 else alpha * ddx
+// leaky_relu Grad: dx=dy if x>=0 else alpha * dy
+// leaky_relu GradGrad: ddy=ddx if x>=0 else alpha * ddx
 template <typename T>
 class LeakyReluDoubleGradMaker
     : public ::paddle::framework::SingleGradOpMaker<T> {
@@ -792,8 +792,8 @@ class LeakyReluDoubleGradMaker
  protected:
   void Apply(GradOpPtr<T> op) const override {
     op->SetType("leaky_relu_grad_grad");
-    // input1: Out
-    op->SetInput("Out", this->Input("Out"));
+    // input1: X
+    op->SetInput("X", this->Input("X"));
     // X@GRAD@GRAD: ddx
     op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
     op->SetAttrMap(this->Attrs());
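
For reference, the rule in the updated comments, written out as a standalone scalar sketch (plain C++; the helper names are illustrative and not part of Paddle):

#include <cassert>

// Hypothetical scalar helpers mirroring the comments above:
//   leaky_relu Grad:     dx  = dy  if x >= 0 else alpha * dy
//   leaky_relu GradGrad: ddy = ddx if x >= 0 else alpha * ddx
float leaky_relu_grad(float x, float dy, float alpha) {
  return x >= 0.f ? dy : alpha * dy;
}

float leaky_relu_grad_grad(float x, float ddx, float alpha) {
  return x >= 0.f ? ddx : alpha * ddx;
}

int main() {
  const float alpha = 0.02f;
  assert(leaky_relu_grad(3.f, 1.f, alpha) == 1.f);          // positive input passes dy through
  assert(leaky_relu_grad(-3.f, 1.f, alpha) == alpha);       // negative input scales dy by alpha
  assert(leaky_relu_grad_grad(-3.f, 1.f, alpha) == alpha);  // same rule for the double grad
  return 0;
}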

paddle/fluid/operators/activation_op.h

Lines changed: 16 additions & 12 deletions

@@ -1084,7 +1084,11 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> {

   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+    if (alpha < 1.f) {
+      out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+    } else {
+      out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
+    }
   }
 };

@@ -1098,12 +1102,12 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 =
-        static_cast<T>(alpha) * (out <= static_cast<T>(0)).template cast<T>();
-    auto temp2 = (out > static_cast<T>(0)).template cast<T>();
+        static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
+    auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
     dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }

-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };

 template <typename T>
@@ -1451,18 +1455,18 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
     auto* d = dev.eigen_device();
     auto ddx = framework::EigenVector<T>::Flatten(
         GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Output", "Out", "LeakyReluGradGrad"));
+    auto x = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
     auto ddout = framework::EigenVector<T>::Flatten(
         GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
-    ddout.device(*d) = ddx *
-                       ((out > static_cast<T>(0)).template cast<T>() +
-                        static_cast<T>(alpha) *
-                            (out <= static_cast<T>(0)).template cast<T>())
-                           .template cast<T>();
+    ddout.device(*d) =
+        ddx *
+        ((x > static_cast<T>(0)).template cast<T>() +
+         static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
+            .template cast<T>();
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };

 template <typename T>
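
The forward branch above relies on the identity leaky_relu(x) = max(x, alpha * x) when alpha <= 1 and min(x, alpha * x) when alpha >= 1; the single cwiseMax form breaks once alpha may exceed 1. Likewise the backward functors now key their mask on the input x (and report kDepX) because, when alpha is allowed to be zero or negative, the sign of the output no longer determines the sign of the input. A small self-contained check of the forward identity (plain C++, illustrative only):

#include <algorithm>
#include <cassert>

// Reference definition: x for x >= 0, alpha * x otherwise.
float leaky_relu_ref(float x, float alpha) { return x >= 0.f ? x : alpha * x; }

// The branchy max/min form used by the functor above.
float leaky_relu_minmax(float x, float alpha) {
  return alpha < 1.f ? std::max(x, alpha * x) : std::min(x, alpha * x);
}

int main() {
  const float xs[] = {-2.f, -0.5f, 0.f, 0.5f, 2.f};
  const float alphas[] = {0.02f, 0.5f, 1.f, 2.f};  // including alpha > 1
  for (float a : alphas)
    for (float x : xs) assert(leaky_relu_ref(x, a) == leaky_relu_minmax(x, a));
  return 0;
}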

paddle/fluid/operators/arg_min_max_op_base.cu.h

Lines changed: 50 additions & 21 deletions

@@ -53,9 +53,9 @@ using Tensor = framework::Tensor;
   FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);

 template <typename T, typename IndType, class Reducer, size_t BlockDim>
-__global__ void ArgCUDAKernel(const IndType height,     // n * h
-                              const IndType width,      // c
-                              const IndType post_size,  // h
+__global__ void ArgCUDAKernel(const int64_t height,     // n * h
+                              const int64_t width,      // c
+                              const int64_t post_size,  // h
                               const Reducer reducer, const T init, const T* in,
                               IndType* out) {
   typedef cub::BlockReduce<KeyValuePair<int, T>, BlockDim> BlockReduce;
@@ -79,10 +79,10 @@ __global__ void ArgCUDAKernel(const IndType height,  // n * h

 template <typename T, typename IndType, class Reducer>
 void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
-                    Tensor* indices, const IndType pre, const IndType post,
-                    const IndType n) {
+                    Tensor* indices, const int64_t pre, const int64_t post,
+                    const int64_t n) {
   auto cu_stream = ctx.stream();
-  auto ComputeBlockSize = [](IndType col) {
+  auto ComputeBlockSize = [](int64_t col) {
     if (col > 512)
       return 1024;
     else if (col > 256)
@@ -101,10 +101,10 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
     return 8;
   };

-  int max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
-  int height = pre * post;
-  int width = n;
-  int grid_size = height < max_grid_dimx ? height : max_grid_dimx;
+  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
+  int64_t height = pre * post;
+  int64_t width = n;
+  int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;

   const T* in_data = input.data<T>();
   IndType* out_data = indices->mutable_data<IndType>(ctx.GetPlace());
@@ -129,31 +129,60 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
 }

 template <typename T, class Reducer>
-class ArgMinMaxOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+struct VisitDataCudaArgMinMaxFunctor {
+  const framework::ExecutionContext& ctx;
+
+  explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx)
+      : ctx(ctx) {}
+  template <typename IndType>
+  void apply() const {
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     int axis = ctx.Attr<int64_t>("axis");
-    auto in_dims = input->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+    const bool& flatten = ctx.Attr<bool>("flatten");
+
+    framework::DDim input_dims;
+    if (flatten) {
+      input_dims = framework::make_ddim({input->numel()});
+      // if flatten, the axis just as 0
+      axis = 0;
+    } else {
+      input_dims = input->dims();
+      if (axis < 0) axis += input->dims().size();
+    }

     int64_t numel = input->numel();
-    int64_t groups = numel / in_dims[axis];
+    int64_t groups = numel / input_dims[axis];
     int64_t pre = 1;
     int64_t post = 1;
-    int64_t n = in_dims[axis];
+    int64_t n = input_dims[axis];

     for (int i = 0; i < axis; i++) {
-      pre *= in_dims[i];
+      pre *= input_dims[i];
     }

-    for (int i = axis + 1; i < in_dims.size(); i++) {
-      post *= in_dims[i];
+    for (int i = axis + 1; i < input_dims.size(); i++) {
+      post *= input_dims[i];
     }

     const auto& dev_ctx = ctx.cuda_device_context();
-    ComputeFullArg<T, int64_t, Reducer>(dev_ctx, *input, output, pre, post, n);
+    ComputeFullArg<T, IndType, Reducer>(dev_ctx, *input, output, pre, post, n);
+  }
+};
+template <typename T, class Reducer>
+class ArgMinMaxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dtype = ctx.Attr<int>("dtype");
+    if (dtype < 0) {
+      framework::VisitDataType(static_cast<framework::proto::VarType::Type>(
+                                   framework::proto::VarType::INT64),
+                               VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
+      return;
+    }
+    framework::VisitDataType(
+        static_cast<framework::proto::VarType::Type>(dtype),
+        VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
   }
 };
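
ComputeFullArg treats the N-D arg reduction as height = pre * post independent segments of length n (the reduced axis), and the new dtype dispatch only selects the index type the results are written in. A minimal CPU sketch of the same pre / n / post decomposition (plain C++, illustrative only, not the CUDA kernel):

#include <cstdint>
#include <cstdio>
#include <vector>

// Arg-max over `axis` of a row-major tensor with shape `dims`,
// using the same pre / n / post split as ComputeFullArg above.
std::vector<int64_t> ArgMaxCPU(const std::vector<float>& in,
                               const std::vector<int64_t>& dims, int axis) {
  int64_t pre = 1, post = 1, n = dims[axis];
  for (int i = 0; i < axis; ++i) pre *= dims[i];
  for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i) post *= dims[i];

  std::vector<int64_t> out(pre * post);
  for (int64_t p = 0; p < pre; ++p) {
    for (int64_t q = 0; q < post; ++q) {
      int64_t best = 0;
      for (int64_t k = 1; k < n; ++k) {
        int64_t idx = (p * n + k) * post + q;           // element (p, k, q)
        int64_t best_idx = (p * n + best) * post + q;
        if (in[idx] > in[best_idx]) best = k;
      }
      out[p * post + q] = best;
    }
  }
  return out;
}

int main() {
  // Shape {2, 3}: arg-max along axis 1 of [[1, 5, 2], [7, 0, 3]] -> [1, 0].
  auto out = ArgMaxCPU({1, 5, 2, 7, 0, 3}, {2, 3}, 1);
  std::printf("%lld %lld\n", (long long)out[0], (long long)out[1]);
  return 0;
}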

paddle/fluid/operators/arg_min_max_op_base.h

Lines changed: 34 additions & 12 deletions

@@ -38,8 +38,9 @@ struct ArgMinMaxFunctor {};
 struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank,                        \
                         enum_argminmax_value> {                              \
   void operator()(const DeviceContext& ctx, const framework::LoDTensor& in,  \
-                  framework::LoDTensor* out, int64_t axis, bool keepdims) {  \
-    auto in_eigen = framework::EigenTensor<T, Rank>::From(in);               \
+                  framework::LoDTensor* out, framework::DDim x_dims,         \
+                  int64_t axis, bool keepdims) {                             \
+    auto in_eigen = framework::EigenTensor<T, Rank>::From(in, x_dims);       \
     if (keepdims) {                                                          \
       auto out_eigen = framework::EigenTensor<Tout, Rank>::From(*out);       \
       out_eigen.device(*(ctx.eigen_device())) =                              \
@@ -68,16 +69,26 @@ struct VisitDataArgMinMaxFunctor {
     out.template mutable_data<Tout>(ctx.GetPlace());
     auto axis = ctx.Attr<int64_t>("axis");
     auto keepdims = ctx.Attr<bool>("keepdims");
-    auto x_rank = x.dims().size();
-    if (axis < 0) axis += x_rank;
+    const bool& flatten = ctx.Attr<bool>("flatten");
+
+    // if flatten, will construct the new dims for the cacluate
+    framework::DDim x_dims;
+    if (flatten) {
+      x_dims = framework::make_ddim({x.numel()});
+      // if flatten, the axis just as 0
+      axis = 0;
+    } else {
+      x_dims = x.dims();
+      if (axis < 0) axis += x_dims.size();
+    }
     auto& dev_ctx = ctx.template device_context<DeviceContext>();

 #define CALL_ARG_MINMAX_FUNCTOR(rank)                                \
   ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
       functor##rank;                                                 \
-  functor##rank(dev_ctx, x, &out, axis, keepdims)
+  functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims)

-    switch (x.dims().size()) {
+    switch (x_dims.size()) {
       case 1:
         CALL_ARG_MINMAX_FUNCTOR(1);
         break;
@@ -141,6 +152,7 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
     const auto& x_dims = ctx->GetInputDim("X");
     int64_t axis = ctx->Attrs().Get<int64_t>("axis");
     bool keepdims = ctx->Attrs().Get<bool>("keepdims");
+    const bool& flatten = ctx->Attrs().Get<bool>("flatten");

     PADDLE_ENFORCE_GE(axis, -x_dims.size(),
                       platform::errors::InvalidArgument(
@@ -152,14 +164,21 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
         platform::errors::InvalidArgument(
             "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));

-    auto x_rank = x_dims.size();
-    if (axis < 0) axis += x_rank;
     std::vector<int64_t> vec;
-    for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]);
-    if (keepdims) {
-      vec.push_back(static_cast<int64_t>(1));
+    if (flatten) {
+      // if is flatten, will return the only on element
+      if (keepdims) {
+        vec.emplace_back(static_cast<int64_t>(1));
+      }
+    } else {
+      auto x_rank = x_dims.size();
+      if (axis < 0) axis += x_rank;
+      for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
+      if (keepdims) {
+        vec.emplace_back(static_cast<int64_t>(1));
+      }
+      for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]);
     }
-    for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
     ctx->SetOutputDim("Out", framework::make_ddim(vec));
   }
 };
@@ -176,6 +195,9 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
     AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false);
     AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1);
+    AddAttr<bool>("flatten",
+                  "Flatten the input value, and search the min or max indices")
+        .SetDefault(false);
     AddComment(string::Sprintf(R"DOC(
 %s Operator.
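
The new flatten attribute changes both where the reduction runs (over the whole tensor, with axis forced to 0) and the inferred output shape. A compact mirror of the InferShape output-dims rule above (plain C++, illustrative only):

#include <cstdint>
#include <vector>

// Mirrors the output-dim logic of ArgMinMaxOp::InferShape above.
std::vector<int64_t> ArgMinMaxOutDims(std::vector<int64_t> x_dims, int64_t axis,
                                      bool keepdims, bool flatten) {
  std::vector<int64_t> vec;
  if (flatten) {
    if (keepdims) vec.push_back(1);  // one element; scalar shape otherwise
  } else {
    int64_t rank = static_cast<int64_t>(x_dims.size());
    if (axis < 0) axis += rank;
    for (int64_t i = 0; i < axis; ++i) vec.push_back(x_dims[i]);
    if (keepdims) vec.push_back(1);
    for (int64_t i = axis + 1; i < rank; ++i) vec.push_back(x_dims[i]);
  }
  return vec;
}

int main() {
  // {4, 5, 6}, axis = 1: {4, 6} without keepdims, {4, 1, 6} with keepdims,
  // {} (or {1} with keepdims) when flatten is set.
  auto d = ArgMinMaxOutDims({4, 5, 6}, 1, /*keepdims=*/true, /*flatten=*/false);
  return static_cast<int>(d.size()) == 3 ? 0 : 1;
}
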
paddle/fluid/operators/bernoulli_op.cc (new file)

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/bernoulli_op.h"
+
+#include <algorithm>
+#include <string>
+
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
+namespace paddle {
+namespace operators {
+
+class BernoulliOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "A tensor with probabilities for generating the random binary "
+             "number");
+    AddOutput("Out", "A Tensor filled with random binary number");
+    AddComment(R"DOC(
+This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution.
+
+    Out ~ Bernoulli(X)
+
+)DOC");
+  }
+};
+
+class BernoulliOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    return UnaryOpUnchangedInferShape(ctx);
+  }
+};
+
+// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
+// Use std::random and thrust::random(thrust is a std library in CUDA) to
+// implement uniform random.
+template <typename T>
+class BernoulliOpKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto x = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto *in_data = x->data<T>();
+    auto *out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int64_t size = x->numel();
+    std::uniform_real_distribution<T> dist(0.0, 1.0);
+    auto gen_ptr = framework::Generator::GetInstance();
+    std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine();
+
+    for (int64_t i = 0; i < size; ++i) {
+      out_data[i] = BernoulliFunctor(in_data[i], dist(gen_engine));
+    }
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OPERATOR(
+    bernoulli, ops::BernoulliOp, ops::BernoulliOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OP_CPU_KERNEL(bernoulli,
+                       ops::BernoulliOpKernel<plat::CPUDeviceContext, float>,
+                       ops::BernoulliOpKernel<plat::CPUDeviceContext, double>);
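
BernoulliFunctor itself is declared in bernoulli_op.h, which is not shown in this view; from the call site above it presumably maps a probability p and a uniform sample u in [0, 1) to 1 when u < p and 0 otherwise. A self-contained sketch of that sampling rule (plain C++ with std::mt19937_64, illustrative only and not the Paddle implementation):

#include <cstddef>
#include <random>
#include <vector>

// Hypothetical scalar rule: emit 1 with probability p, 0 otherwise,
// given a uniform sample u drawn from [0, 1).
template <typename T>
T BernoulliSample(T p, T u) {
  return u < p ? static_cast<T>(1) : static_cast<T>(0);
}

int main() {
  std::mt19937_64 engine(42);  // fixed seed for reproducibility
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);

  std::vector<float> probs = {0.0f, 0.25f, 0.9f, 1.0f};
  std::vector<float> out(probs.size());
  for (size_t i = 0; i < probs.size(); ++i) {
    out[i] = BernoulliSample(probs[i], dist(engine));  // each out[i] is 0 or 1
  }
  return 0;
}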
