Commit c2b638b

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into utilize-stream-safe-cuda-allocator-in-new-executor-gc

2 parents: 642b52c + 6b5e33b

60 files changed: +704 −698 lines

cmake/pten_kernel.cmake

Lines changed: 31 additions & 24 deletions
@@ -18,7 +18,7 @@ function(kernel_declare TARGET_LIST)
     file(READ ${kernel_path} kernel_impl)
     # TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL
     # NOTE(chenweihang): now we don't recommend to use digit in kernel name
-    string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z_]*," first_registry "${kernel_impl}")
+    string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
     if (NOT first_registry STREQUAL "")
       # parse the first kernel name
       string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}")
@@ -33,8 +33,6 @@ function(kernel_declare TARGET_LIST)
         file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
       elseif (${kernel_path} MATCHES "./xpu\/")
         file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
-      elseif (${kernel_path} MATCHES "./npu\/*")
-        file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, NPU, ALL_LAYOUT);\n")
       else ()
         # deal with device independent kernel, now we use CPU temporaary
         file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
@@ -48,7 +46,9 @@ function(kernel_library TARGET)
   set(cpu_srcs)
   set(gpu_srcs)
   set(xpu_srcs)
-  set(npu_srcs)
+  # parse and save the deps kerenl targets
+  set(all_srcs)
+  set(kernel_deps)
 
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
@@ -57,7 +57,6 @@ function(kernel_library TARGET)
 
   list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
   # one kernel only match one impl file in each backend
-  # TODO(chenweihang): parse compile deps by include headers
   if (${kernel_library_SRCS_len} EQUAL 0)
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
       list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
@@ -75,57 +74,68 @@ function(kernel_library TARGET)
         list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
       endif()
     endif()
-    if (WITH_ASCEND_CL)
-      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
-        list(APPEND npu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
-      endif()
-    endif()
   else()
     # TODO(chenweihang): impl compile by source later
   endif()
 
+  list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h)
+  list(APPEND all_srcs ${common_srcs})
+  list(APPEND all_srcs ${cpu_srcs})
+  list(APPEND all_srcs ${gpu_srcs})
+  list(APPEND all_srcs ${xpu_srcs})
+  foreach(src ${all_srcs})
+    file(READ ${src} target_content)
+    string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
+    foreach(include_kernel ${include_kernels})
+      string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/" "" kernel_name ${include_kernel})
+      string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
+      list(APPEND kernel_deps ${kernel_name})
+    endforeach()
+  endforeach()
+  list(REMOVE_DUPLICATES kernel_deps)
+  list(REMOVE_ITEM kernel_deps ${TARGET})
+
   list(LENGTH common_srcs common_srcs_len)
   list(LENGTH cpu_srcs cpu_srcs_len)
   list(LENGTH gpu_srcs gpu_srcs_len)
   list(LENGTH xpu_srcs xpu_srcs_len)
-  list(LENGTH npu_srcs npu_srcs_len)
 
   if (${common_srcs_len} GREATER 0)
     # If the kernel has a device independent public implementation,
     # we will use this implementation and will not adopt the implementation
     # under specific devices
     if (WITH_GPU)
-      nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
+      nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     elseif (WITH_ROCM)
-      hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
+      hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     else()
-      cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
+      cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     endif()
   else()
     # If the kernel has a header file declaration, but no corresponding
     # implementation can be found, this is not allowed
     if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND
-        ${xpu_srcs_len} EQUAL 0 AND ${npu_srcs_len} EQUAL 0)
+        ${xpu_srcs_len} EQUAL 0)
       message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
     else()
       if (WITH_GPU)
         if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-          nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
+          nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         endif()
       elseif (WITH_ROCM)
         if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-          hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
+          hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         endif()
       else()
-        if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
-          cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${npu_srcs} DEPS ${kernel_library_DEPS})
+        if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+          cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         endif()
       endif()
     endif()
   endif()
 
-  if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
-      ${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
+  if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
+      ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
     # append target into PTEN_KERNELS property
     get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
     set(pten_kernels ${pten_kernels} ${TARGET})
@@ -147,9 +157,6 @@ function(kernel_library TARGET)
   if (${xpu_srcs_len} GREATER 0)
     kernel_declare(${xpu_srcs})
   endif()
-  if (${npu_srcs_len} GREATER 0)
-    kernel_declare(${npu_srcs})
-  endif()
 endfunction()
 
 function(register_kernels)
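
Note: the core of this change is the new dependency scan in kernel_library. Instead of the removed TODO's manual approach, link deps (kernel_deps) are now derived automatically from each kernel source's "#include "paddle/pten/kernels/..._kernel.h"" lines. As a standalone illustration (not part of the commit), here is a C++ sketch of the same extraction; the input text and the "scale_kernel" target name are hypothetical:

// Sketch mirroring the new CMake logic above: scan a kernel source for
// pten kernel includes and collect the referenced kernels as deps.
#include <iostream>
#include <regex>
#include <set>
#include <string>

int main() {
  const std::string kernel_impl =
      "#include \"paddle/pten/kernels/cast_kernel.h\"\n"
      "#include \"paddle/pten/kernels/full_kernel.h\"\n";
  // Same pattern as the string(REGEX MATCHALL ...) above, with the kernel
  // name captured so the two string(REGEX REPLACE ...) steps are unneeded.
  const std::regex include_re(
      "#include \"paddle/pten/kernels/([a-z0-9_]+_kernel)\\.h\"");
  std::set<std::string> kernel_deps;  // set ~ list(REMOVE_DUPLICATES ...)
  for (std::sregex_iterator it(kernel_impl.begin(), kernel_impl.end(),
                               include_re),
       end;
       it != end; ++it) {
    kernel_deps.insert((*it)[1].str());
  }
  kernel_deps.erase("scale_kernel");  // ~ list(REMOVE_ITEM kernel_deps ${TARGET})
  for (const auto &dep : kernel_deps) {
    std::cout << dep << "\n";  // prints: cast_kernel, full_kernel
  }
}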

paddle/fluid/framework/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
@@ -91,15 +91,16 @@ endif()
 cc_test(copy_same_tensor_test SRCS copy_same_tensor_test.cc DEPS tensor)
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
+cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context)
 
 if(WITH_GPU)
-  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
 elseif(WITH_ROCM)
-  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
 else()
-  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
+  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS mixed_vector place memory device_context tensor)
 endif()
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version)
 
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)

paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc

Lines changed: 7 additions & 15 deletions
@@ -67,12 +67,6 @@ FuseBatchNormActOneDNNPass::FuseBatchNormActOneDNNPass() {
       .AddAttr("epsilon")
       .IsNumGE(0.0f)
       .IsNumLE(0.001f)
-      .End()
-      .AddAttr("trainable_statistics")
-      .IsBoolEQ(false)
-      .End()
-      .AddAttr("is_test")
-      .IsBoolEQ(true)
       .End();
 
   AddOpCompat(OpCompat("relu"))
@@ -114,21 +108,19 @@ void FuseBatchNormActOneDNNPass::FuseBatchNormAct(
   GET_IR_NODE_FROM_SUBGRAPH(act, act, bn_act_pattern);
 
   auto *bn_op = batch_norm->Op();
-  if (bn_op->HasAttr("use_mkldnn")) {
+  if (bn_op->HasAttr("trainable_statistics")) {
     PADDLE_ENFORCE(
-        BOOST_GET_CONST(bool, bn_op->GetAttr("use_mkldnn")),
+        !BOOST_GET_CONST(bool, bn_op->GetAttr("trainable_statistics")),
         platform::errors::PreconditionNotMet(
-            "The BatchNorm+Act fusion may happen only when oneDNN library "
-            "is used."));
+            "The BatchNorm+Act fusion may happen only when mean and variance "
+            "are not calculated by current batch statistics."));
   }
 
-  auto *act_op = act->Op();
-  if (act_op->HasAttr("use_mkldnn")) {
+  if (bn_op->HasAttr("is_test")) {
     PADDLE_ENFORCE(
-        BOOST_GET_CONST(bool, bn_op->GetAttr("use_mkldnn")),
+        BOOST_GET_CONST(bool, bn_op->GetAttr("is_test")),
         platform::errors::PreconditionNotMet(
-            "The BatchNorm+Act fusion may happen only when oneDNN library "
-            "is used."));
+            "The BatchNorm+Act fusion may happen only during inference."));
   }
 
   bn_op->SetAttr("use_mkldnn", true);
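
In short: the pass previously required use_mkldnn on both ops and otherwise skipped; it now hard-fails when batch_norm statistics are trainable or the op is not in inference mode, and sets use_mkldnn itself after fusing. A minimal sketch of an attribute setup the revised checks accept; the OpDesc usage is illustrative, and only the attribute names and values come from the pass above:

// Hypothetical sketch: a batch_norm op description that satisfies the
// revised preconditions. Everything beyond the attribute names is assumed.
#include "paddle/fluid/framework/op_desc.h"

void MakeFusableBatchNorm(paddle::framework::OpDesc *bn) {
  bn->SetType("batch_norm");
  bn->SetAttr("is_test", true);                // required: inference mode only
  bn->SetAttr("trainable_statistics", false);  // required: use trained
                                               // estimates, not batch stats
  // use_mkldnn is no longer a precondition; the pass sets it after fusing:
  // bn_op->SetAttr("use_mkldnn", true);
}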

paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc

Lines changed: 12 additions & 12 deletions
@@ -65,9 +65,9 @@ TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) {
   // No fusion in this attribute configuration
   constexpr int removed_nodes_count = 0;
 
-  EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
-                                     "act_y", removed_nodes_count));
-  EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
+  EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
+                                      "act_y", removed_nodes_count),
+               paddle::platform::EnforceNotMet);
 }
 
 TEST(FuseBatchNormActOneDNNPass, FuseIsTest) {
@@ -123,9 +123,9 @@ TEST(FuseBatchNormActOneDNNPass, ThrowTrainableStats) {
   // No fusion in this attribute configuration
   constexpr int removed_nodes_count = 0;
 
-  EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
-                                     "act_y", removed_nodes_count));
-  EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
+  EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
+                                      "act_y", removed_nodes_count),
+               paddle::platform::EnforceNotMet);
 }
 
 TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) {
@@ -149,9 +149,9 @@ TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) {
   // No fusion in this attribute configuration
   constexpr int removed_nodes_count = 0;
 
-  EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
-                                     "act_y", removed_nodes_count));
-  EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
+  EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
+                                      "act_y", removed_nodes_count),
+               paddle::platform::EnforceNotMet);
 }
 
 TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) {
@@ -176,9 +176,9 @@ TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) {
   // No fusion in this attribute configuration
   constexpr int removed_nodes_count = 0;
 
-  EXPECT_TRUE(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
-                                     "act_y", removed_nodes_count));
-  EXPECT_TRUE(test::AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 1}}));
+  EXPECT_THROW(test::RunPassAndAssert(&graph, "batch_norm_act_fuse_pass", "x",
+                                      "act_y", removed_nodes_count),
+               paddle::platform::EnforceNotMet);
 }
 
 TEST(FuseBatchNormActOneDNNPass, pass_op_version_check) {
paddle/fluid/framework/mixed_vector.cc

Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/mixed_vector.h"
+
+#include <algorithm>
+#include <initializer_list>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/fluid/framework/details/cow_ptr.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/utils/none.h"
+#include "paddle/utils/optional.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+void CopyToCPUHelper(std::vector<T> *cpu_, paddle::memory::AllocationPtr *gpu_,
+                     size_t *gpu_memory_size_) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // COPY GPU Data To CPU
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get((*gpu_)->place()));
+  auto stream = dev_ctx->stream();
+  void *src = (*gpu_)->ptr();
+  void *dst = cpu_->data();
+  paddle::memory::Copy(platform::CPUPlace(), dst,
+                       OptionalCUDAPlace(*gpu_).get(), src, *gpu_memory_size_,
+                       stream);
+  dev_ctx->Wait();
+#endif
+}
+
+template <typename T>
+void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
+                             paddle::memory::AllocationPtr *gpu_,
+                             size_t *gpu_memory_size_,
+                             const platform::Place &place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void *src = cpu_->data();
+  *gpu_memory_size_ = cpu_->size() * sizeof(T);  // sizeof(T)
+  (*gpu_) = memory::Alloc(place, *gpu_memory_size_);
+  void *dst = (*gpu_)->ptr();
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(place));
+  auto stream = dev_ctx->stream();
+  paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), dst,
+                       platform::CPUPlace(), src, *gpu_memory_size_, stream);
+#endif
+}
+
+#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__)                                  \
+  template <>                                                                  \
+  void Vector<__TYPE__>::VectorData::CopyToCPU() const {                       \
+    CopyToCPUHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_);                \
+  }                                                                            \
+                                                                               \
+  template <>                                                                  \
+  void Vector<__TYPE__>::VectorData::CopyCPUDataToCUDA(                        \
+      const platform::Place &place) const {                                    \
+    CopyCPUDataToCUDAHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_, place); \
+  }
+
+INSTANTIATE_VECTOR_FOR_TYPE(size_t)
+INSTANTIATE_VECTOR_FOR_TYPE(int)
+INSTANTIATE_VECTOR_FOR_TYPE(int64_t)
+
+};  // namespace framework
+}  // namespace paddle
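
Context: mixed_vector was previously header-only; this commit moves the GPU<->CPU copy paths into this new .cc, compiled into the mixed_vector target added in framework/CMakeLists.txt above, with explicit instantiations for size_t, int, and int64_t only. A minimal usage sketch, assuming the Vector<T> interface declared in mixed_vector.h (the CUDAData accessor comes from that existing header, not from this commit):

// Usage sketch under the assumptions stated above.
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/platform/place.h"

void MixedVectorDemo() {
  paddle::framework::Vector<int64_t> lod{0, 2, 5};
  int64_t first = lod[0];  // CPU reads go straight to the std::vector storage
  (void)first;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // Requesting a device pointer lazily triggers CopyCPUDataToCUDA(), i.e.
  // the CopyCPUDataToCUDAHelper defined in this file. A Vector<float> would
  // compile but fail to link, since only the three types above are
  // instantiated here.
  const int64_t *gpu_ptr = lod.CUDAData(paddle::platform::CUDAPlace(0));
  (void)gpu_ptr;
#endif
}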
