
Commit 5cba3ec

Merge branch 'PaddlePaddle:develop' into phi/blas
2 parents: 581e91b + 4a8b97e

File tree: 227 files changed (+4413 / -1382 lines)


cmake/external/lite.cmake

Lines changed: 5 additions & 13 deletions
@@ -84,7 +84,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   if(WITH_ARM)
     set(LITE_BUILD_COMMAND ${CMAKE_COMMAND} --build . --target
                            publish_inference -j)
-    message(WARNING "BUILD_COMMAND: ${LITE_BUILD_COMMAND}")
+    message(STATUS "BUILD_COMMAND: ${LITE_BUILD_COMMAND}")
     set(LITE_OPTIONAL_ARGS
         -DWITH_MKL=OFF
         -DLITE_WITH_CUDA=OFF
@@ -120,11 +120,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
         ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake
       UPDATE_COMMAND ""
       BUILD_COMMAND ${LITE_BUILD_COMMAND}
-      INSTALL_COMMAND
-        cp ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.cc
-        ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/core/ && cp
-        ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-        ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/core/
+      INSTALL_COMMAND ""
       CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                  -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                  -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
@@ -141,6 +137,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   else()
     set(LITE_BUILD_COMMAND ${CMAKE_COMMAND} --build . --target
                            publish_inference -j)
+    message(STATUS "BUILD_COMMAND: ${LITE_BUILD_COMMAND}")
     set(LITE_OPTIONAL_ARGS
         -DWITH_MKL=ON
         -DLITE_WITH_CUDA=OFF
@@ -173,11 +170,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
        "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?"
        ${LITE_PREFIX_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py
       BUILD_COMMAND ${LITE_BUILD_COMMAND}
-      INSTALL_COMMAND
-        cp ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.cc
-        ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/core/ && cp
-        ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-        ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/core/
+      INSTALL_COMMAND ""
       CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                  -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                  -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
@@ -196,8 +189,7 @@ endif()

 message(STATUS "Paddle-lite BINARY_DIR: ${LITE_BINARY_DIR}")
 message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}")
-include_directories(${LITE_SOURCE_DIR})
-include_directories(${LITE_BINARY_DIR})
+include_directories(${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/include)
 if(LITE_WITH_XPU)
   include_directories(${LITE_BINARY_DIR}/third_party/install/xpu/xdnn/include/)
   include_directories(${LITE_BINARY_DIR}/third_party/install/xpu/xre/include/)

paddle/fluid/distributed/collective/process_group_custom.cc

Lines changed: 12 additions & 0 deletions
@@ -100,6 +100,18 @@ bool ProcessGroupCustom::CustomTask::Wait(std::chrono::milliseconds timeout) {
 // Same as Wait
 void ProcessGroupCustom::CustomTask::Synchronize() { Wait(kWaitTimeout); }

+void ProcessGroupCustom::CustomTask::UpdateWaitChain(
+    const phi::DeviceContext& ctx) {
+  PADDLE_ENFORCE_NE(
+      std::find(places_.cbegin(), places_.cend(), ctx.GetPlace()),
+      places_.cend(),
+      phi::errors::NotFound("Cannot find the device context in this task."));
+  auto index = std::find(places_.cbegin(), places_.cend(), ctx.GetPlace()) -
+               places_.cbegin();
+  control_events_[index].Record(
+      reinterpret_cast<const phi::CustomContext&>(ctx));
+}
+
 ProcessGroupCustom::ProcessGroupCustom(
     const std::shared_ptr<phi::distributed::Store>& store,
     const std::string& device_type,

paddle/fluid/distributed/collective/process_group_custom.h

Lines changed: 4 additions & 3 deletions
@@ -46,10 +46,11 @@ class ProcessGroupCustom : public ProcessGroupWithoutStream {
       CommType CommType,
       const std::vector<phi::DenseTensor>& inputs);

-  bool IsCompleted();
+  bool IsCompleted() override;
   void SynchronizeStreams();
-  bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
-  void Synchronize();
+  bool Wait(std::chrono::milliseconds timeout = kWaitTimeout) override;
+  void Synchronize() override;
+  void UpdateWaitChain(const phi::DeviceContext& ctx) override;
   void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT
   virtual ~CustomTask();

paddle/fluid/eager/amp_utils.h

Lines changed: 32 additions & 80 deletions
@@ -122,90 +122,42 @@ inline paddle::experimental::DataType GetAmpDestDtype(
     const std::string& op_name,
     const paddle::small_vector<std::vector<paddle::Tensor>,
                                kSlotSmallVectorSize>& amp_tensors_vector) {
-  auto amp_dtype =
-      egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype();
   auto amp_level = egr::Controller::Instance().GetAMPLevel();
-  VLOG(6) << "AMP GetAmpDestDtype:"
-          << " op(" << op_name << ") amp_dtype(" << amp_dtype << ") amp_level("
-          << static_cast<int>(amp_level) << ").";
-  auto return_amp_type = paddle::experimental::DataType::FLOAT16;
-
-  if (amp_dtype == "float16") {
-    if (amp_level == paddle::imperative::AmpLevel::O1) {
-      if (paddle::imperative::AmpOperators::Instance()
-              .GetMutableAllowOps()
-              ->count(op_name)) {
-        return_amp_type = paddle::experimental::DataType::FLOAT16;
-      } else if (paddle::imperative::AmpOperators::Instance()
-                     .GetMutableBlockOps()
-                     ->count(op_name) ||
-                 paddle::imperative::AmpOperators::Instance()
-                     .GetMutableUnsupportedFp16Ops()
-                     ->count(op_name)) {
-        return_amp_type = paddle::experimental::DataType::FLOAT32;
-      } else {
-        auto dst_type = GetPromoteType(op_name,
-                                       amp_tensors_vector,
-                                       paddle::experimental::DataType::FLOAT16);
-        if (dst_type == paddle::experimental::DataType::FLOAT16 &&
-            paddle::imperative::AmpOperators::Instance()
-                .GetMutableUnsupportedFp16Ops()
-                ->count(op_name)) {
-          dst_type = paddle::experimental::DataType::FLOAT32;
-        }
-        return_amp_type = dst_type;
-      }
-    } else if (amp_level == paddle::imperative::AmpLevel::O2) {
-      auto dst_type = paddle::experimental::DataType::FLOAT16;
-      if (paddle::imperative::AmpOperators::Instance()
-              .GetMutableUnsupportedFp16Ops()
-              ->count(op_name) ||
-          paddle::imperative::AmpOperators::Instance()
-              .GetMutableBlockOps()
-              ->count(op_name)) {
-        dst_type = paddle::experimental::DataType::FLOAT32;
-      }
-      return_amp_type = dst_type;
+  auto amp_setting_dtype =
+      egr::Controller::Instance().GetCurrentTracer()->GetAmpPhiDtype();
+  auto dst_type = amp_setting_dtype;
+  if (amp_level == paddle::imperative::AmpLevel::O1) {
+    if (paddle::imperative::AmpOperators::Instance()
+            .GetMutableAllowOps()
+            ->count(op_name)) {
+      dst_type = amp_setting_dtype;
+    } else if (paddle::imperative::AmpOperators::Instance()
+                   .GetMutableBlockOps()
+                   ->count(op_name)) {
+      dst_type = paddle::experimental::DataType::FLOAT32;
+    } else {
+      dst_type = GetPromoteType(op_name, amp_tensors_vector, amp_setting_dtype);
     }
-  } else if (amp_dtype == "bfloat16") {
-    if (amp_level == paddle::imperative::AmpLevel::O1) {
-      if (paddle::imperative::AmpOperators::Instance()
-              .GetMutableAllowOps()
-              ->count(op_name)) {
-        return_amp_type = paddle::experimental::DataType::BFLOAT16;
-      } else if (paddle::imperative::AmpOperators::Instance()
-                     .GetMutableBlockOps()
-                     ->count(op_name)) {
-        return_amp_type = paddle::experimental::DataType::FLOAT32;
-      } else {
-        auto dst_type =
-            GetPromoteType(op_name,
-                           amp_tensors_vector,
-                           paddle::experimental::DataType::BFLOAT16);
-        if (dst_type == paddle::experimental::DataType::BFLOAT16 &&
-            paddle::imperative::AmpOperators::Instance()
-                .GetMutableUnsupportedBf16Ops()
-                ->count(op_name)) {
-          dst_type = paddle::experimental::DataType::FLOAT32;
-        }
-        return_amp_type = dst_type;
-      }
-    } else if (amp_level == paddle::imperative::AmpLevel::O2) {
-      auto dst_type = paddle::experimental::DataType::BFLOAT16;
-      if (paddle::imperative::AmpOperators::Instance()
-              .GetMutableUnsupportedBf16Ops()
-              ->count(op_name) ||
-          paddle::imperative::AmpOperators::Instance()
-              .GetMutableBlockOps()
-              ->count(op_name)) {
-        dst_type = paddle::experimental::DataType::FLOAT32;
-      }
-      return_amp_type = dst_type;
+  } else if (amp_level == paddle::imperative::AmpLevel::O2) {
+    if (paddle::imperative::AmpOperators::Instance()
+            .GetMutableBlockOps()
+            ->count(op_name)) {
+      dst_type = paddle::experimental::DataType::FLOAT32;
     }
-  } else {
-    return_amp_type = paddle::experimental::DataType::FLOAT32;
   }
-  return GetDtypeWithPlace(op_name, amp_tensors_vector, return_amp_type);
+
+  if (dst_type == amp_setting_dtype &&
+      (paddle::imperative::AmpOperators::Instance()
+           .GetMutableUnsupportedOps(amp_setting_dtype)
+           ->count(op_name))) {
+    dst_type = paddle::experimental::DataType::FLOAT32;
+  }
+
+  dst_type = GetDtypeWithPlace(op_name, amp_tensors_vector, dst_type);
+  VLOG(6) << "AMP GetAmpDestDtype:"
+          << " op(" << op_name << ") amp_dtype(" << dst_type << ") amp_level("
+          << static_cast<int>(amp_level) << ").";
+  return dst_type;
 }

 }  // namespace egr

paddle/fluid/eager/pylayer/py_layer_node.cc

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ GradNodePyLayer::operator()(
           this->OutputMeta()[i][0].IsStopGradient(),
           true,
           paddle::platform::errors::InvalidArgument(
-              "%s's backward function should not return empyt at %d position.",
+              "%s's backward function should not return empty at %d position.",
               name(),
               i));
       grad_out.push_back({});

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ void AllReduceOpHandle::AllReduceImpl(

     if (i == 0) {
       numel = static_cast<int64_t>(lod_tensor.numel());
-      // only enforce place0, we will enforce other palce numel == place0 numel
+      // only enforce place0, we will enforce other place numel == place0 numel
       PADDLE_ENFORCE_GT(
           numel,
           0,

paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
   while (cur_count < op_deps->size()) {
     cur_count++;
     auto cur_op = ready_ops->Pop();
-    // when execption, get cur_op == nullptr
+    // when exception, get cur_op == nullptr
     if (cur_op == nullptr) {
       std::lock_guard<std::mutex> lock(mutex_);
       exec_op_count_ = op_deps->size();

paddle/fluid/framework/details/eager_deletion_op_handle.cc

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
 #endif
     PADDLE_ENFORCE_NOT_NULL(
         event_,
-        platform::errors::InvalidArgument("The cuda envet created is NULL."));
+        platform::errors::InvalidArgument("The cuda event created is NULL."));
   }
 }
 #endif

paddle/fluid/framework/details/fetch_async_op_handle.cc

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ void FetchAsyncOpHandle::FetchMergedLodTensor(
   }

   // slice and memcpy
-  // for 0D tensor, can't concat eath tensor, stack them. for 1+D tensor, concat
+  // for 0D tensor, can't concat each tensor, stack them. for 1+D tensor, concat
   // them
   int begin = 0;
   int end = 0;

paddle/fluid/framework/details/multi_devices_helper.h

Lines changed: 2 additions & 2 deletions
@@ -44,8 +44,8 @@ namespace details {
 // all variable in each devices.
 // The outside vector is the device vector. Each element of this vector is a
 // map from variable name to variables. The variables, who have the same name,
-// will have a differsent version. The offset in the
-// `std::vector<VarHandle*>` is the version of varaibles.
+// will have a different version. The offset in the
+// `std::vector<VarHandle*>` is the version of variables.
 typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
     GraphVars;
 constexpr char kGraphVars[] = "vars";
