// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

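// ProcessGroupGloo implements CPU collective communication (currently
// broadcast and allreduce) on top of the Gloo library. Tensors are validated
// to live on the CPU, adapted to raw typed pointers, and handed to gloo's
// collective option structs; the collectives themselves run synchronously.
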
#include <algorithm>
#include <array>
#include <iostream>
#include <memory>

#ifdef _WIN32
#include <gloo/common/win.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif

// Gloo headers for the collectives, math kernels, and TCP transport used
// below. Some of these also arrive transitively through gloo_wrapper.h; the
// direct includes make this file's dependencies explicit.
#include <gloo/allreduce.h>
#include <gloo/broadcast.h>
#include <gloo/math.h>
#include <gloo/rendezvous/context.h>
#include <gloo/rendezvous/prefix_store.h>
#include <gloo/transport/tcp/device.h>
#include <gloo/types.h>

#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace distributed {

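// GENERATE_FUNC dispatches a runtime experimental::DataType to a template
// instantiation of `func`: it expands to a switch that calls func<T>(...)
// with the matching C++ type. Two spellings are kept: the _WIN32 branch uses
// the standard __VA_ARGS__, the other the GNU-style named variadic `args...`.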
#ifdef _WIN32
#define GENERATE_FUNC(type, func, ...)       \
  switch (type) {                            \
    case experimental::DataType::FLOAT32:    \
      func<float>(__VA_ARGS__);              \
      break;                                 \
    case experimental::DataType::FLOAT64:    \
      func<double>(__VA_ARGS__);             \
      break;                                 \
    case experimental::DataType::FLOAT16:    \
      func<gloo::float16>(__VA_ARGS__);      \
      break;                                 \
    case experimental::DataType::INT32:      \
      func<int32_t>(__VA_ARGS__);            \
      break;                                 \
    case experimental::DataType::INT64:      \
      func<int64_t>(__VA_ARGS__);            \
      break;                                 \
    default:                                 \
      VLOG(0) << "Error: Unknown DataType."; \
      exit(-1);                              \
  }

#define HOST_NAME_MAX 256

#else
#define GENERATE_FUNC(type, func, args...)   \
  switch (type) {                            \
    case experimental::DataType::FLOAT32:    \
      func<float>(args);                     \
      break;                                 \
    case experimental::DataType::FLOAT64:    \
      func<double>(args);                    \
      break;                                 \
    case experimental::DataType::FLOAT16:    \
      func<gloo::float16>(args);             \
      break;                                 \
    case experimental::DataType::INT32:      \
      func<int32_t>(args);                   \
      break;                                 \
    case experimental::DataType::INT64:      \
      func<int64_t>(args);                   \
      break;                                 \
    default:                                 \
      VLOG(0) << "Error: Unknown DataType."; \
      exit(-1);                              \
  }
#endif

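// Maps a ReduceOp onto the matching elementwise kernel from gloo (sum,
// product, min, max). ReduceOp::AVG has no gloo counterpart and is rejected
// at runtime.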
typedef void (*reduce_func)(void*, const void*, const void*, size_t);

template <typename T>
reduce_func get_function(const ReduceOp& r) {
  switch (r) {
    case ReduceOp::SUM:
      return reduce_func(&::gloo::sum<T>);
    case ReduceOp::PRODUCT:
      return reduce_func(&::gloo::product<T>);
    case ReduceOp::MIN:
      return reduce_func(&::gloo::min<T>);
    case ReduceOp::MAX:
      return reduce_func(&::gloo::max<T>);
    case ReduceOp::AVG:
      VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
      exit(-1);
  }

  VLOG(0) << "Error: Unknown ReduceOp.";
  exit(-1);
}

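// Adapters from paddle Tensors to the raw, typed buffers that gloo's option
// structs expect: setInput/setOutput take a T* plus an element count.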
bool CheckTensorsInCPUPlace(const std::vector<Tensor>& tensors) {
  return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
    return t.place() == PlaceType::kCPU;
  });
}

template <typename T>
T* get_data(const Tensor& tensor) {
  auto raw_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
  return static_cast<T*>(raw_tensor->data());
}

template <typename T>
std::vector<T*> get_multi_data(const std::vector<Tensor>& tensors) {
  std::vector<T*> ret(tensors.size());
  for (size_t i = 0; i < tensors.size(); i++) {
    ret[i] = get_data<T>(tensors[i]);
  }
  return ret;
}

template <typename T, typename P>
void set_output(P& opts, const Tensor& tensor) {  // NOLINT
  opts.setOutput(get_data<T>(tensor), tensor.numel());
}

template <typename T, typename P>
void set_input(P& opts, const Tensor& tensor) {  // NOLINT
  opts.setInput(get_data<T>(tensor), tensor.numel());
}

template <typename T, typename P>
void set_outputs(P& opts, const std::vector<Tensor>& tensors) {  // NOLINT
  opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
}

template <typename T, typename P>
void set_inputs(P& opts, const std::vector<Tensor>& tensors) {  // NOLINT
  opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
}

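// A GlooTask wraps a single collective. Gloo operates on host memory only,
// so construction enforces that every input tensor is on the CPU.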
ProcessGroupGloo::GlooTask::GlooTask(int rank,
                                     const std::vector<Tensor>& inputs,
                                     CommType comm_type)
    : ProcessGroup::Task(rank, inputs, comm_type) {
  PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true,
                    platform::errors::Fatal(
                        "Only CPU place is supported for ProcessGroupGloo."));
}

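// Rendezvous: ranks exchange connection information through the shared
// store (namespaced by a PrefixStore) and then connect a full mesh of TCP
// pairs, one per peer.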
ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr<GlooStore>& store,
                                   int rank, int world_size,
                                   const std::shared_ptr<GlooOptions> options)
    : ProcessGroup(rank, world_size), _tag(0), _store(store) {
  _context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
  auto prefix_store =
      ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
  _context->connectFullMesh(prefix_store, options->device);
}

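// Broadcast task: sends inputs[0] from the root rank to all other ranks,
// dispatching on the tensor dtype to bind the typed output buffer.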
class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
                    const std::vector<Tensor>& inputs, int rank, int root,
                    uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
        _context(context),
        _root(root),
        _inputs(inputs),
        _tag(tag) {}

  void Run() override { _do_broadcast(_inputs[0]); }

 private:
  std::shared_ptr<gloo::Context> _context;
  const int _root;
  std::vector<Tensor> _inputs{};
  const uint32_t _tag;

  void _do_broadcast(const Tensor& tensor) {
    gloo::BroadcastOptions opts(_context);
    const auto& dtype = tensor.type();
    GENERATE_FUNC(dtype, set_output, opts, tensor);
    opts.setRoot(_root);
    opts.setTag(_tag);
    gloo::broadcast(opts);
  }
};

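// Note that the collective executes synchronously: Run() completes on the
// calling thread before the task handle is returned.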
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
    std::vector<Tensor>& inputs, const BroadcastOptions& opts) {
  auto root = opts.source_rank;
  std::unique_ptr<BroadcastGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  task = std::make_unique<BroadcastGlooTask>(context, inputs, rank_, root, tag);
  task->Run();
  return task;
}

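// Allreduce task: every tensor serves as both input and output, and the
// reduction kernel is chosen from the dtype and the requested ReduceOp.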
class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  AllreduceGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
                    std::vector<Tensor>& inputs, ReduceOp reduce_op,  // NOLINT
                    uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
        _context(context),
        _inputs(inputs),
        _reduce_op(reduce_op),
        _tag(tag) {}

  void Run() override { _do_allreduce(_inputs); }

 private:
  std::shared_ptr<gloo::Context> _context;
  std::vector<Tensor> _inputs;
  const ReduceOp _reduce_op;
  uint32_t _tag;

  gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
                                             const ReduceOp op) {
    gloo::AllreduceOptions::Func fn;
    GENERATE_FUNC(type, _get_function_impl, fn, op);
    return fn;
  }

  template <typename T>
  void _get_function_impl(gloo::AllreduceOptions::Func& fn,  // NOLINT
                          const ReduceOp op) {
    fn = get_function<T>(op);
  }

  void _do_allreduce(std::vector<Tensor>& tensors) {  // NOLINT
    const auto& dtype = tensors[0].type();
    gloo::AllreduceOptions opts(_context);
    GENERATE_FUNC(dtype, set_inputs, opts, tensors);
    GENERATE_FUNC(dtype, set_outputs, opts, tensors);
    opts.setReduceFunction(_get_function(dtype, _reduce_op));
    opts.setTag(_tag);
    gloo::allreduce(opts);
  }
};

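// Like Broadcast, AllReduce runs the collective to completion before
// returning the task handle.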
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
    std::vector<Tensor>& inputs, const AllreduceOptions& opts) {
  auto tag = next_tag();
  std::shared_ptr<GlooTask> task;
  auto context = get_context();
  task = std::make_shared<AllreduceGlooTask>(rank_, context, inputs,
                                             opts.reduce_op, tag);
  task->Run();
  return task;
}

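// TCP device helpers: a gloo device can be pinned to a specific network
// interface, resolved from a hostname, or picked automatically by probing
// the local host's addresses (createDefaultDevice).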
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
  ::gloo::transport::tcp::attr attr;
  attr.iface = ifname;
  return ::gloo::transport::tcp::CreateDevice(attr);
}

std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
  ::gloo::transport::tcp::attr attr;
  attr.hostname = hostname;
  return ::gloo::transport::tcp::CreateDevice(attr);
}

std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
  std::array<char, HOST_NAME_MAX> hostname{};
  auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal(
                                "Get hostname error for createDefaultDevice."));
  // Resolve all addresses for the local hostname, then probe each one by
  // creating a socket and attempting to bind it, so we only accept an
  // address that is actually usable on this machine.
  ::addrinfo* result;
  result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
  ::addrinfo* cur;
  for (cur = result; cur != nullptr; cur = cur->ai_next) {
    SocketType socket =
        ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
    if (socket == -1) {
      continue;
    }
    ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
#ifdef _WIN32
    closesocket(socket);
#else
    close(socket);
#endif
    if (ret == -1) {
      continue;
    }
    // Found a bindable address; stop probing.
    break;
  }
  freeaddrinfo(result);
  if (cur != nullptr) {
    return createDeviceForHostname(hostname.data());
  }
  // No resolved address was bindable; fall back to the loopback address.
  return createDeviceForHostname("127.0.0.1");
}

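// Usage sketch (illustrative only -- the GlooOptions/GlooStore setup below is
// assumed, not taken from this file; see ProcessGroupGloo.h for the real
// interfaces):
//
//   auto opts = std::make_shared<GlooOptions>();  // assumed constructible
//   opts->device = ProcessGroupGloo::createDefaultDevice();
//   ProcessGroupGloo pg(store, rank, world_size, opts);
//   pg.AllReduce(tensors, AllreduceOptions{ReduceOp::SUM});  // synchronous
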
}  // namespace distributed
}  // namespace paddle