
Commit 6b48fdc

move bernoulli kernel to pten
1 parent 3e7825f commit 6b48fdc


6 files changed: +206 -112 lines changed


paddle/fluid/operators/bernoulli_op.cc

Lines changed: 0 additions & 28 deletions
@@ -49,30 +49,6 @@ class BernoulliOp : public framework::OperatorWithKernel {
   }
 };
 
-// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
-// Use std::random and thrust::random(thrust is a std library in CUDA) to
-// implement uniform random.
-template <typename T>
-class BernoulliOpKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto x = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    auto *in_data = x->data<T>();
-    auto *out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    int64_t size = x->numel();
-    std::uniform_real_distribution<T> dist(0.0, 1.0);
-    auto gen_ptr = framework::DefaultCPUGenerator();
-    auto engine = gen_ptr->GetCPUEngine();
-
-    for (int64_t i = 0; i < size; ++i) {
-      out_data[i] = BernoulliFunctor(in_data[i], dist(*engine));
-    }
-  }
-};  // namespace operators
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -82,7 +58,3 @@ REGISTER_OPERATOR(
     bernoulli, ops::BernoulliOp, ops::BernoulliOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OP_CPU_KERNEL(bernoulli,
-                       ops::BernoulliOpKernel<plat::CPUDeviceContext, float>,
-                       ops::BernoulliOpKernel<plat::CPUDeviceContext, double>);

paddle/fluid/operators/bernoulli_op.cu

Lines changed: 0 additions & 84 deletions
This file was deleted.

paddle/fluid/platform/transform.h

Lines changed: 47 additions & 0 deletions
@@ -141,6 +141,53 @@ struct Transform<platform::CUDADeviceContext> {
 #endif
   }
 };
+
+template <>
+struct Transform<pten::GPUContext> {
+  template <typename InputIter, typename OutputIter, typename UnaryOperation>
+  void operator()(const pten::GPUContext& context, InputIter first,
+                  InputIter last, OutputIter result, UnaryOperation op) {
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "The CUDA Transform must be used in GPU place."));
+#ifdef __HIPCC__
+    thrust::transform(thrust::hip::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first),
+                      details::CastToCUDATransformIterator(last),
+                      details::CastToCUDATransformIterator(result), op);
+#else
+    thrust::transform(thrust::cuda::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first),
+                      details::CastToCUDATransformIterator(last),
+                      details::CastToCUDATransformIterator(result), op);
+#endif
+  }
+
+  template <typename InputIter1, typename InputIter2, typename OutputIter,
+            typename BinaryOperation>
+  void operator()(const pten::GPUContext& context, InputIter1 first1,
+                  InputIter1 last1, InputIter2 first2, OutputIter result,
+                  BinaryOperation op) {
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "The CUDA Transform must be used in GPU place."));
+#ifdef __HIPCC__
+    thrust::transform(thrust::hip::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first1),
+                      details::CastToCUDATransformIterator(last1),
+                      details::CastToCUDATransformIterator(first2),
+                      details::CastToCUDATransformIterator(result), op);
+#else
+    thrust::transform(thrust::cuda::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first1),
+                      details::CastToCUDATransformIterator(last1),
+                      details::CastToCUDATransformIterator(first2),
+                      details::CastToCUDATransformIterator(result), op);
+#endif
+  }
+};
 #endif
 
 }  // namespace platform

paddle/pten/kernels/bernoulli_kernel.h

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/device_context.h"
+
+namespace pten {
+
+template <typename T>
+inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
+  PADDLE_ENFORCE_LE(p,
+                    1.0,
+                    pten::errors::OutOfRange(
+                        "The probability should be <= 1, but got %f", p));
+  PADDLE_ENFORCE_GE(p,
+                    0.0,
+                    pten::errors::OutOfRange(
+                        "The probability should be >= 0, but got %f", p));
+  return static_cast<T>(rand < p);
+}
+
+template <typename T, typename Context>
+void BernoulliKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out);
+
+}  // namespace pten
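
Aside (reviewer note, not part of the commit): BernoulliFunctor above maps a probability p and a uniform draw rand in [0, 1) to 1 when rand < p and to 0 otherwise, which is exactly a Bernoulli(p) sample. A minimal standalone C++ sketch of that contract (plain std::random; the seed, probability, and trial count are arbitrary and chosen only for illustration):

    // Standalone illustration of the rand < p test used by BernoulliFunctor.
    // Not Paddle code; any C++11 compiler will do.
    #include <cstdio>
    #include <random>

    int main() {
      std::mt19937_64 engine(42);                             // fixed seed, illustrative only
      std::uniform_real_distribution<double> dist(0.0, 1.0);  // rand ~ Uniform[0, 1)
      const double p = 0.3;                                   // example probability
      const int trials = 100000;
      int ones = 0;
      for (int i = 0; i < trials; ++i) {
        ones += (dist(engine) < p) ? 1 : 0;  // same comparison as BernoulliFunctor
      }
      std::printf("empirical mean = %f (expect roughly %f)\n",
                  static_cast<double>(ones) / trials, p);
      return 0;
    }

The empirical mean approaches p, which is the per-element behavior the CPU and GPU kernels below implement.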

paddle/pten/kernels/cpu/bernoulli_kernel.cc

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/bernoulli_kernel.h"
+#include <random>
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void BernoulliKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out) {
+  auto numel = x.numel();
+  auto* x_data = x.data<T>();
+  T* out_data = ctx.template Alloc<T>(out);
+
+  std::uniform_real_distribution<T> dist(0.0, 1.0);
+  auto gen_ptr = ctx.GetGenerator();
+  auto engine = gen_ptr->GetCPUEngine();
+
+  for (int64_t i = 0; i < numel; ++i) {
+    out_data[i] = BernoulliFunctor(x_data[i], dist(*engine));
+  }
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(
+    bernoulli, CPU, ALL_LAYOUT, pten::BernoulliKernel, float, double) {}

paddle/pten/kernels/gpu/bernoulli_kernel.cu

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <thrust/execution_policy.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/bernoulli_kernel.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/transform.h"
+
+namespace pten {
+
+template <typename T>
+struct BernoulliCudaFunctor {
+  unsigned int seed_;
+  unsigned int offset_;
+  __host__ __device__ BernoulliCudaFunctor(unsigned int seed,
+                                           unsigned int offset)
+      : seed_(seed), offset_(offset) {}
+
+  __host__ __device__ T operator()(const unsigned int n, const T p) const {
+    // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several
+    // lines of error messages if, and it should be refined.
+    PADDLE_ENFORCE(p >= 0.0 && p <= 1.0,
+                   "The probability should be >=0 and <= 1, but got %f",
+                   p);
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(0.0, 1.0);
+    rng.discard(n + offset_);
+    return static_cast<T>(dist(rng) < p);
+  }
+};
+
+template <typename T, typename Context>
+void BernoulliKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out) {
+  auto numel = x.numel();
+  auto* x_data = x.data<T>();
+  T* out_data = ctx.template Alloc<T>(out);
+
+  auto gen_cuda = ctx.GetGenerator();
+  auto seed_offset = gen_cuda->IncrementOffset(1);
+  int64_t gen_offset = numel * seed_offset.second;
+  paddle::platform::Transform<pten::GPUContext> trans;
+  thrust::counting_iterator<int64_t> index_sequence_begin(0);
+  trans(ctx,
+        index_sequence_begin,
+        index_sequence_begin + numel,
+        x_data,
+        out_data,
+        BernoulliCudaFunctor<T>(static_cast<int64_t>(seed_offset.first),
+                                static_cast<int64_t>(gen_offset)));
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(
+    bernoulli, GPU, ALL_LAYOUT, pten::BernoulliKernel, float, double) {}
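
Aside (reviewer note, not part of the commit): the GPU path pairs a thrust::counting_iterator with the per-element functor, so element n seeds a fresh minstd_rand, discards n + offset draws, and compares the resulting uniform number against p. Seeding once and discarding to position n keeps the result for a given (seed, offset) independent of how the work is scheduled across threads. A minimal plain-Thrust sketch of that counting-iterator pattern (not Paddle code; UniformAt, the seed 123, and the offset 0 are illustrative only, and compilation with nvcc is assumed):

    // Plain-Thrust sketch of the counting-iterator RNG pattern.
    #include <cstdio>
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/random.h>
    #include <thrust/transform.h>

    struct UniformAt {
      unsigned int seed;
      unsigned long long offset;
      __host__ __device__ float operator()(long long n) const {
        thrust::minstd_rand rng(seed);   // one lightweight engine per element
        rng.discard(n + offset);         // jump to this element's slot in the stream
        thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
        return dist(rng);
      }
    };

    int main() {
      const long long numel = 8;
      thrust::device_vector<float> u(numel);
      thrust::counting_iterator<long long> first(0);
      thrust::transform(first, first + numel, u.begin(), UniformAt{123u, 0ull});
      thrust::host_vector<float> h = u;  // copy results back to the host
      for (long long i = 0; i < numel; ++i) std::printf("%f\n", h[i]);
      return 0;
    }

BernoulliCudaFunctor does the same per element, except that it also compares the draw against x[n] and takes seed and offset from the device generator, so repeated kernel launches advance through the random stream instead of repeating it.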
