Skip to content

Commit 6fb030d

Browse files
committed
Merged develop
2 parents d82d315 + 3779e80 commit 6fb030d

File tree

360 files changed

+15630
-7103
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

360 files changed

+15630
-7103
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ tools/__pycache__
4949
# This file is automatically generated.
5050
# TODO(zhiqiang) Move this file to build directory.
5151
paddle/infrt/dialect/pd_ops.td
52+
paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td
53+
paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td
54+
tools/infrt/kernels.json
5255
paddle/infrt/dialect/pd_ops_info.h
5356
.lit_test_times.txt
5457
paddle/infrt/tests/dialect/Output

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,8 @@ option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF)
238238
option(WITH_MUSL "Compile with musl libc instead of gblic" OFF)
239239
option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF)
240240
option(WITH_STRIP "Strip so files of Whl packages" OFF)
241-
option(NEW_RELEASE_CUBIN "PaddlePaddle next-level release strategy for pypi cubin package" OFF)
241+
option(NEW_RELEASE_PYPI "PaddlePaddle next-level release strategy for pypi cubin package" OFF)
242+
option(NEW_RELEASE_ALL "PaddlePaddle next-level release strategy for all arches cubin package" OFF)
242243
option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF)
243244
option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF)
244245
option(WITH_POCKETFFT "Compile with pocketfft support" ON)

cmake/cuda.cmake

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,22 @@ if(WITH_NV_JETSON)
66
add_definitions(-DWITH_NV_JETSON)
77
set(paddle_known_gpu_archs "53 62 72")
88
set(paddle_known_gpu_archs10 "53 62 72")
9-
elseif(NEW_RELEASE_CUBIN)
9+
elseif(NEW_RELEASE_ALL)
10+
message("Using New Release Strategy - All Arches Packge")
11+
add_definitions(-DNEW_RELEASE_ALL)
12+
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
13+
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
14+
set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
15+
elseif(NEW_RELEASE_PYPI)
1016
message("Using New Release Strategy - Cubin Packge")
11-
add_definitions(-DNEW_RELEASE_CUBIN)
12-
set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86")
13-
set(paddle_known_gpu_archs10 "50 60 70 75")
14-
set(paddle_known_gpu_archs11 "60 70 75 80")
17+
add_definitions(-DNEW_RELEASE_PYPI)
18+
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
19+
set(paddle_known_gpu_archs10 "")
20+
set(paddle_known_gpu_archs11 "60 61 70 75 80")
1521
elseif(NEW_RELEASE_JIT)
1622
message("Using New Release Strategy - JIT Packge")
1723
add_definitions(-DNEW_RELEASE_JIT)
18-
set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86")
24+
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
1925
set(paddle_known_gpu_archs10 "35 50 60 70 75")
2026
set(paddle_known_gpu_archs11 "35 50 60 70 75 80")
2127
else()
@@ -148,7 +154,7 @@ function(select_nvcc_arch_flags out_variable)
148154

149155
# remove dots and convert to lists
150156
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
151-
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
157+
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}")
152158
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
153159
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
154160

paddle/fluid/distributed/collective/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
2+
if (WITH_DISTRIBUTE)
3+
cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
4+
endif()
25
cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
36

47
if(WITH_NCCL)
Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include <iostream>
16+
17+
#ifdef _WIN32
18+
#include <gloo/common/win.h>
19+
#include <winsock2.h>
20+
#include <ws2tcpip.h>
21+
#else
22+
#include <netdb.h>
23+
#include <sys/socket.h>
24+
#include <unistd.h>
25+
#endif
26+
27+
#include <gloo/broadcast.h>
28+
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
29+
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
30+
#include "paddle/fluid/platform/enforce.h"
31+
32+
namespace paddle {
33+
namespace distributed {
34+
35+
#ifdef _WIN32
// Dispatch `func<T>(...)` on a runtime experimental::DataType value.
// MSVC does not support the GNU named-variadic-argument extension, so this
// variant forwards the trailing arguments through __VA_ARGS__. An
// unrecognized dtype is fatal: it is logged through VLOG(0) and the
// process exits.
#define GENERATE_FUNC(type, func, ...)            \
  switch (type) {                                 \
    case experimental::DataType::FLOAT32:         \
      func<float>(__VA_ARGS__);                   \
      break;                                      \
    case experimental::DataType::FLOAT64:         \
      func<double>(__VA_ARGS__);                  \
      break;                                      \
    case experimental::DataType::FLOAT16:         \
      func<gloo::float16>(__VA_ARGS__);           \
      break;                                      \
    case experimental::DataType::INT32:           \
      func<int32_t>(__VA_ARGS__);                 \
      break;                                      \
    case experimental::DataType::INT64:           \
      func<int64_t>(__VA_ARGS__);                 \
      break;                                      \
    default:                                      \
      VLOG(0) << "Error: Unknown DataType.";      \
      exit(-1);                                   \
  }

// Windows does not define the POSIX HOST_NAME_MAX constant; use a buffer
// size large enough for any practical hostname.
#define HOST_NAME_MAX 256

#else
// Same dtype dispatcher as above, written with the GNU named-variadic
// form (`args...`) accepted by the gcc/clang builds.
#define GENERATE_FUNC(type, func, args...)        \
  switch (type) {                                 \
    case experimental::DataType::FLOAT32:         \
      func<float>(args);                          \
      break;                                      \
    case experimental::DataType::FLOAT64:         \
      func<double>(args);                         \
      break;                                      \
    case experimental::DataType::FLOAT16:         \
      func<gloo::float16>(args);                  \
      break;                                      \
    case experimental::DataType::INT32:           \
      func<int32_t>(args);                        \
      break;                                      \
    case experimental::DataType::INT64:           \
      func<int64_t>(args);                        \
      break;                                      \
    default:                                      \
      VLOG(0) << "Error: Unknown DataType.";      \
      exit(-1);                                   \
  }
#endif
83+
84+
typedef void (*reduce_func)(void*, const void*, const void*, size_t);
85+
86+
template <typename T>
87+
reduce_func get_function(const ReduceOp& r) {
88+
switch (r) {
89+
case ReduceOp::SUM:
90+
return reduce_func(&::gloo::sum<T>);
91+
case ReduceOp::PRODUCT:
92+
return reduce_func(&::gloo::product<T>);
93+
case ReduceOp::MIN:
94+
return reduce_func(&::gloo::min<T>);
95+
case ReduceOp::MAX:
96+
return reduce_func(&::gloo::max<T>);
97+
case ReduceOp::AVG:
98+
VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
99+
exit(-1);
100+
}
101+
102+
VLOG(0) << "Error: Unknown ReduceOp.";
103+
exit(-1);
104+
}
105+
106+
bool CheckTensorsInCPUPlace(const std::vector<Tensor>& tensors) {
107+
return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
108+
return t.place() == PlaceType::kCPU;
109+
});
110+
}
111+
112+
template <typename T>
113+
T* get_data(const Tensor& tensor) {
114+
auto raw_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
115+
return static_cast<T*>(raw_tensor->data());
116+
}
117+
118+
template <typename T>
119+
std::vector<T*> get_multi_data(const std::vector<Tensor>& tensors) {
120+
std::vector<T*> ret(tensors.size());
121+
for (size_t i = 0; i < tensors.size(); i++) {
122+
ret[i] = get_data<T>(tensors[i]);
123+
}
124+
return ret;
125+
}
126+
127+
template <typename T, typename P>
128+
void set_output(P& opts, const Tensor& tensor) { // NOLINT
129+
opts.setOutput(get_data<T>(tensor), tensor.numel());
130+
}
131+
132+
template <typename T, typename P>
133+
void set_input(P& opts, const Tensor& tensor) { // NOLINT
134+
opts.setInput(get_data<T>(tensor), tensor.numel());
135+
}
136+
137+
template <typename T, typename P>
138+
void set_outputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
139+
opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
140+
}
141+
142+
template <typename T, typename P>
143+
void set_inputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
144+
opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
145+
}
146+
147+
// Base task for all gloo collectives. Rejects any non-CPU input up front,
// since gloo has no device support.
ProcessGroupGloo::GlooTask::GlooTask(int rank,
                                     const std::vector<Tensor>& inputs,
                                     CommType comm_type)
    : ProcessGroup::Task(rank, inputs, comm_type) {
  PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true,
                    platform::errors::Fatal(
                        "Only CPU place is supported for ProcessGroupGloo."));
}

// Builds the gloo rendezvous context and connects a full mesh across all
// ranks through the shared store.
ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr<GlooStore>& store,
                                   int rank, int world_size,
                                   const std::shared_ptr<GlooOptions> options)
    : ProcessGroup(rank, world_size), _tag(0), _store(store) {
  _context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
  // All keys are namespaced under a fixed "0" prefix in the store.
  auto prefix_store =
      ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
  _context->connectFullMesh(prefix_store, options->device);
}
165+
166+
class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
167+
public:
168+
BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
169+
const std::vector<Tensor>& inputs, int rank, int root,
170+
uint32_t tag)
171+
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
172+
_context(context),
173+
_root(root),
174+
_inputs(inputs),
175+
_tag(tag) {}
176+
177+
void Run() override { _do_broadcast(_inputs[0]); }
178+
179+
private:
180+
std::shared_ptr<gloo::Context> _context;
181+
const int _root;
182+
std::vector<Tensor> _inputs{};
183+
const uint32_t _tag;
184+
185+
void _do_broadcast(const Tensor& tensor) {
186+
gloo::BroadcastOptions opts(_context);
187+
const auto& dtype = tensor.type();
188+
GENERATE_FUNC(dtype, set_output, opts, tensor);
189+
opts.setRoot(_root);
190+
opts.setTag(_tag);
191+
gloo::broadcast(opts);
192+
}
193+
};
194+
195+
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
196+
std::vector<Tensor>& inputs, const BroadcastOptions& opts) {
197+
auto root = opts.source_rank;
198+
std::unique_ptr<BroadcastGlooTask> task;
199+
auto tag = next_tag();
200+
auto context = get_context();
201+
task = std::make_unique<BroadcastGlooTask>(context, inputs, rank_, root, tag);
202+
task->Run();
203+
return task;
204+
}
205+
206+
class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
207+
public:
208+
AllreduceGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
209+
std::vector<Tensor>& inputs, ReduceOp reduce_op, // NOLINT
210+
uint32_t tag)
211+
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
212+
_context(context),
213+
_inputs(inputs),
214+
_reduce_op(reduce_op),
215+
_tag(tag) {}
216+
217+
void Run() override { _do_allreduce(_inputs); }
218+
219+
private:
220+
std::shared_ptr<gloo::Context> _context;
221+
std::vector<Tensor> _inputs;
222+
const ReduceOp _reduce_op;
223+
uint32_t _tag;
224+
225+
gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
226+
const ReduceOp op) {
227+
gloo::AllreduceOptions::Func fn;
228+
GENERATE_FUNC(type, _get_function_impl, fn, op);
229+
return fn;
230+
}
231+
232+
template <typename T>
233+
void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT
234+
const ReduceOp op) {
235+
fn = get_function<T>(op);
236+
}
237+
238+
void _do_allreduce(std::vector<Tensor>& tensors) { // NOLINT
239+
const auto& dtype = tensors[0].type();
240+
gloo::AllreduceOptions opts(_context);
241+
GENERATE_FUNC(dtype, set_inputs, opts, tensors);
242+
GENERATE_FUNC(dtype, set_outputs, opts, tensors);
243+
opts.setReduceFunction(_get_function(dtype, _reduce_op));
244+
opts.setTag(_tag);
245+
gloo::allreduce(opts);
246+
}
247+
};
248+
249+
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
250+
std::vector<Tensor>& inputs, const AllreduceOptions& opts) {
251+
auto tag = next_tag();
252+
std::shared_ptr<GlooTask> task;
253+
auto context = get_context();
254+
task = std::make_shared<AllreduceGlooTask>(rank_, context, inputs,
255+
opts.reduce_op, tag);
256+
task->Run();
257+
return task;
258+
}
259+
260+
std::shared_ptr<::gloo::transport::Device>
261+
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
262+
::gloo::transport::tcp::attr attr;
263+
attr.iface = ifname;
264+
return ::gloo::transport::tcp::CreateDevice(attr);
265+
}
266+
267+
std::shared_ptr<::gloo::transport::Device>
268+
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
269+
::gloo::transport::tcp::attr attr;
270+
attr.hostname = hostname;
271+
return ::gloo::transport::tcp::CreateDevice(attr);
272+
}
273+
274+
std::shared_ptr<::gloo::transport::Device>
275+
ProcessGroupGloo::createDefaultDevice() {
276+
std::array<char, HOST_NAME_MAX> hostname{};
277+
auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
278+
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal(
279+
"Get hostname error for createDefaultDevice."));
280+
::addrinfo* result;
281+
result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
282+
::addrinfo* cur;
283+
for (cur = result; cur != nullptr; cur = cur->ai_next) {
284+
SocketType socket =
285+
::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
286+
if (socket == -1) {
287+
continue;
288+
}
289+
ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
290+
#ifdef _WIN32
291+
closesocket(socket);
292+
#else
293+
close(socket);
294+
#endif
295+
if (ret == -1) {
296+
continue;
297+
}
298+
break;
299+
}
300+
freeaddrinfo(result);
301+
if (cur != nullptr) {
302+
return createDeviceForHostname(hostname.data());
303+
}
304+
return createDeviceForHostname("127.0.0.1");
305+
}
306+
307+
} // namespace distributed
308+
} // namespace paddle

0 commit comments

Comments
 (0)