
Commit d59378e

Merge branch 'PaddlePaddle:develop' into lppool
2 parents: a3b62b1 + f1a09ac

File tree: 68 files changed (+1444, -714 lines); only a subset of the diff is shown below.


cmake/external/xpu.cmake

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ if(NOT DEFINED XPU_XDNN_BASE_DATE)
   set(XPU_XDNN_BASE_DATE "20240327")
 endif()
 if(NOT DEFINED XPU_XHPC_BASE_DATE)
-  set(XPU_XHPC_BASE_DATE "20240514")
+  set(XPU_XHPC_BASE_DATE "20240515")
 endif()
 set(XPU_XCCL_BASE_VERSION "1.2.0.5")
 if(NOT DEFINED XPU_XFT_BASE_VERSION)

paddle/cinn/backends/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
@@ -13,11 +13,12 @@ gather_srcs(
   extern_func_protos.cc
   extern_func_jit_register.cc
   modular.cc
-  compiler.cc)
+  compiler.cc
+  codegen_device_util.cc)
 
 if(WITH_CUDA)
   add_subdirectory(nvrtc)
-  list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc codegen_cuda_util.cc)
+  list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc)
 endif()
 
 if(WITH_OPENMP)

paddle/cinn/backends/codegen_cuda_generate_test.cc

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/backends/codegen_cuda_host.h"
-#include "paddle/cinn/backends/codegen_cuda_util.h"
+#include "paddle/cinn/backends/codegen_device_util.h"
 #include "paddle/cinn/backends/extern_func_jit_register.h"
 #include "paddle/cinn/backends/llvm/execution_engine.h"
 #include "paddle/cinn/backends/llvm/simple_jit.h"

paddle/cinn/backends/codegen_cuda_host.cc

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 #include <string>
 #include <unordered_map>
 
-#include "paddle/cinn/backends/codegen_cuda_util.h"
+#include "paddle/cinn/backends/codegen_device_util.h"
 #include "paddle/cinn/backends/extern_func_emitter_builtin.h"
 #include "paddle/cinn/backends/extern_func_jit_register.h"
 #include "paddle/cinn/backends/llvm/llvm_util.h"

paddle/cinn/backends/codegen_cuda_util.cc renamed to paddle/cinn/backends/codegen_device_util.cc

Lines changed: 23 additions & 6 deletions
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/cinn/backends/codegen_cuda_util.h"
+#include "paddle/cinn/backends/codegen_device_util.h"
 
 #include "paddle/cinn/backends/cuda_util.h"
 #include "paddle/cinn/common/cas.h"
@@ -22,7 +22,7 @@ PD_DECLARE_bool(cinn_bucket_compile);
 namespace cinn {
 namespace backends {
 
-std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module) {
+std::tuple<ir::Module, ir::Module> SplitDeviceAndHostModule(ir::Module module) {
   if (FLAGS_cinn_bucket_compile) {
     detail::CollectBucketStrategyHostFunctionVisitor visitor(module->name);
     Expr expr(module);
@@ -91,7 +91,16 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
   ir::Var kernel_ptr(GenDeviceKernelName(func_node->name, predicate),
                      type_of<std::string>());
 
-  Expr shared_mem_bytes = CalculateSharedMemory(func);
+  std::optional<Expr> shared_mem_bytes;
+  cinn::common::DefaultDeviceTarget().arch.Match(
+      [&](std::variant<common::UnknownArch, common::X86Arch, common::ARMArch>) {
+        CINN_NOT_IMPLEMENTED;
+      },
+      [&](common::NVGPUArch) {
+#ifdef CINN_WITH_CUDA
+        shared_mem_bytes = CalculateSharedMemory(func);
+#endif
+      });
 
   VLOG(6) << "Add a call node for func_node->name " << func_node->name << "\n"
           << "grid_dim: (" << func_node->cuda_axis_info.grid_dim(0) << ", "
@@ -100,10 +109,18 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
           << "block_dim: (" << func_node->cuda_axis_info.block_dim(0) << ", "
           << func_node->cuda_axis_info.block_dim(1) << ", "
           << func_node->cuda_axis_info.block_dim(2) << "), "
-          << "shared_mem: " << shared_mem_bytes;
+          << "shared_mem: " << shared_mem_bytes.value();
+  std::optional<const char *> call_kernel;
+  cinn::common::DefaultDeviceTarget().arch.Match(
+      [&](std::variant<common::UnknownArch, common::X86Arch, common::ARMArch>) {
+        CINN_NOT_IMPLEMENTED;
+      },
+      [&](common::NVGPUArch) {
+        call_kernel = runtime::intrinsic::call_cuda_kernel;
+      });
   ir::Expr call_extern_api =
       ir::Call::Make(Void(),
-                     runtime::intrinsic::call_cuda_kernel,
+                     call_kernel.value(),
                      {kernel_ptr,
                       kernel_args_,
                       kernel_args_num_,
@@ -113,7 +130,7 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
                      func_node->cuda_axis_info.block_dim(0),  // block_x
                      func_node->cuda_axis_info.block_dim(1),  // block_y
                      func_node->cuda_axis_info.block_dim(2),  // block_z
-                     shared_mem_bytes,  // shared_mem
+                     shared_mem_bytes.value(),  // shared_mem
                      kernel_stream_},
                     {},
                     ir::CallType::Extern,
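
For reference, the dispatch shape this file's hunks introduce, reduced to standard C++ with stand-in types. The Arch variant, the intrinsic string, and the not-implemented path below are simplified placeholders rather than the real CINN definitions; the real code goes through arch.Match (see the paddle/cinn/common/arch.h change below) instead of a raw std::visit, but the control flow is the same: only the NVGPU branch populates the optional, and .value() is read afterwards.

#include <optional>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <variant>

// Stand-in architecture tags; the real ones live in paddle/cinn/common/arch.h.
struct UnknownArch {};
struct X86Arch {};
struct ARMArch {};
struct NVGPUArch {};
using Arch = std::variant<UnknownArch, X86Arch, ARMArch, NVGPUArch>;

// Pick the runtime intrinsic used to launch a kernel on the current arch,
// mirroring the std::optional<const char *> call_kernel pattern in the hunk.
std::optional<std::string> SelectCallKernelIntrinsic(const Arch& arch) {
  std::optional<std::string> call_kernel;
  std::visit(
      [&](auto tag) {
        if constexpr (std::is_same_v<decltype(tag), NVGPUArch>) {
          // Stand-in value for runtime::intrinsic::call_cuda_kernel.
          call_kernel = "call_cuda_kernel";
        } else {
          // Analogue of CINN_NOT_IMPLEMENTED for host-only architectures.
          throw std::runtime_error("kernel launch not implemented for this arch");
        }
      },
      arch);
  return call_kernel;
}

int main() {
  // Only the NVGPU branch fills the optional; .value() would throw otherwise.
  return SelectCallKernelIntrinsic(Arch{NVGPUArch{}}).value().empty() ? 1 : 0;
}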

paddle/cinn/backends/codegen_cuda_util.h renamed to paddle/cinn/backends/codegen_device_util.h

Lines changed: 30 additions & 9 deletions
@@ -19,12 +19,14 @@
 #include <string>
 #include <tuple>
 #include <vector>
-
+#ifdef CINN_WITH_CUDA
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
+#endif
 #include "paddle/cinn/cinn.h"
 #include "paddle/cinn/ir/ir.h"
 #include "paddle/cinn/ir/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
+#include "paddle/cinn/runtime/flags.h"
 
 namespace cinn {
 namespace backends {
@@ -43,7 +45,7 @@ namespace backends {
  * - replace the original kernel function with a Call node and add it to the
  * first module, add a device kernel function to the second module.
  */
-std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module);
+std::tuple<ir::Module, ir::Module> SplitDeviceAndHostModule(ir::Module module);
 
 namespace detail {
 
@@ -52,7 +54,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
       : host_module_builder(module_name + "_host",
                             cinn::common::DefaultHostTarget()),
         device_module_builder(module_name + "_gpu_device",
-                              cinn::common::DefaultNVGPUTarget()) {}
+                              cinn::common::DefaultDeviceTarget()) {}
 
   std::tuple<ir::Module, ir::Module> operator()(Expr* expr) {
     ir::IRMutator<>::Visit(expr, expr);
@@ -109,9 +111,18 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
     // shared_mem_bytes Can be calculated after codegen_cuda_dev buffer creation
    // however, this make CodeGenCUDA_Dev before spliting the host and device
     // module Maybe we could reorder the process.
-    CodeGenCUDA_Dev codegen_dev(cinn::common::DefaultNVGPUTarget());
-    codegen_dev.Compile(ir::LoweredFunc(func));
-    Expr shared_mem_bytes = codegen_dev.GetDynSharedMemOffset();
+    std::optional<Expr> shared_mem_bytes;
+    cinn::common::DefaultDeviceTarget().arch.Match(
+        [&](std::variant<common::UnknownArch,
+                         common::X86Arch,
+                         common::ARMArch>) { CINN_NOT_IMPLEMENTED; },
+        [&](common::NVGPUArch) {
+#ifdef CINN_WITH_CUDA
+          CodeGenCUDA_Dev codegen_dev(cinn::common::DefaultNVGPUTarget());
+          codegen_dev.Compile(ir::LoweredFunc(func));
+          shared_mem_bytes = codegen_dev.GetDynSharedMemOffset();
+#endif
+        });
 
     VLOG(6) << "Add a call node for func->name " << func->name << "\n"
             << "grid_dim: (" << func->cuda_axis_info.grid_dim(0) << ", "
@@ -120,10 +131,20 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
             << "block_dim: (" << func->cuda_axis_info.block_dim(0) << ", "
             << func->cuda_axis_info.block_dim(1) << ", "
             << func->cuda_axis_info.block_dim(2) << "), "
-            << "shared_mem: " << shared_mem_bytes;
+            << "shared_mem: " << shared_mem_bytes.value();
+
+    std::optional<const char*> call_kernel;
+    cinn::common::DefaultDeviceTarget().arch.Match(
+        [&](std::variant<common::UnknownArch,
+                         common::X86Arch,
+                         common::ARMArch>) { CINN_NOT_IMPLEMENTED; },
+        [&](common::NVGPUArch) {
+          call_kernel = runtime::intrinsic::call_cuda_kernel;
+        });
+
     auto call_extern_api =
         ir::Call::Make(Void(),
-                       runtime::intrinsic::call_cuda_kernel,
+                       call_kernel.value(),
                        {kernel_ptr,
                         kernel_args,
                         kernel_args_num,
@@ -133,7 +154,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
                        func->cuda_axis_info.block_dim(0),  // block_x
                        func->cuda_axis_info.block_dim(1),  // block_y
                        func->cuda_axis_info.block_dim(2),  // block_z
-                       shared_mem_bytes,
+                       shared_mem_bytes.value(),
                        kernel_stream},
                       {},
                       ir::CallType::Extern,
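
A side note on the mechanism both visitors now depend on (plain standard-library behaviour, not CINN code): shared_mem_bytes and call_kernel are read with .value(), which throws std::bad_optional_access if no Match branch assigned them, for example when the NVGPU branch is taken but the #ifdef CINN_WITH_CUDA body was compiled out. A minimal illustration:

#include <iostream>
#include <optional>

int main() {
  std::optional<int> shared_mem_bytes;  // stand-in for std::optional<Expr>

  try {
    std::cout << shared_mem_bytes.value() << "\n";  // empty optional -> throws
  } catch (const std::bad_optional_access& e) {
    std::cout << "unset optional: " << e.what() << "\n";
  }

  shared_mem_bytes = 4096;  // what the NVGPU branch would do
  std::cout << "shared_mem: " << shared_mem_bytes.value() << "\n";  // prints 4096
  return 0;
}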

paddle/cinn/backends/compiler.cc

Lines changed: 4 additions & 3 deletions
@@ -24,7 +24,7 @@
 #ifdef CINN_WITH_CUDA
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/backends/codegen_cuda_host.h"
-#include "paddle/cinn/backends/codegen_cuda_util.h"
+#include "paddle/cinn/backends/codegen_device_util.h"
 #include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
 #include "paddle/cinn/runtime/cuda/cuda_module.h"
 #include "paddle/cinn/runtime/cuda/cuda_util.h"
@@ -246,7 +246,7 @@ std::string Compiler::GetSourceCode(const ir::Module& module) {
       [&](common::NVGPUArch) -> std::string {
 #ifdef CINN_WITH_CUDA
         auto _host_module_device_module_ =
-            SplitCudaAndHostModule(module);  // NOLINT
+            SplitDeviceAndHostModule(module);  // NOLINT
         auto& host_module = std::get<0>(_host_module_device_module_);
         auto& device_module = std::get<1>(_host_module_device_module_);
         CodeGenCUDA_Dev codegen(target_);
@@ -270,7 +270,8 @@ void Compiler::BuildDefault(const Module& module) {
 void Compiler::CompileCudaModule(const Module& module,
                                  const std::string& code) {
 #ifdef CINN_WITH_CUDA
-  auto _host_module_device_module_ = SplitCudaAndHostModule(module);  // NOLINT
+  auto _host_module_device_module_ =
+      SplitDeviceAndHostModule(module);  // NOLINT
   auto& host_module = std::get<0>(_host_module_device_module_);
   auto& device_module = std::get<1>(_host_module_device_module_);
   VLOG(3) << "[CUDA] host module:\n" << host_module;
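
A toy sketch of how callers consume the renamed entry point, following the std::get pattern visible in these hunks. The string-returning stand-in below replaces the real std::tuple<ir::Module, ir::Module> SplitDeviceAndHostModule, the module names are hypothetical, and the "_host" / "_gpu_device" suffixes follow the module builders shown in codegen_device_util.h above:

#include <iostream>
#include <string>
#include <tuple>

// Toy stand-in: the real SplitDeviceAndHostModule returns
// std::tuple<ir::Module, ir::Module> (host module first, device module second).
std::tuple<std::string, std::string> SplitDeviceAndHostModuleToy(
    const std::string& module_name) {
  return {module_name + "_host", module_name + "_gpu_device"};
}

int main() {
  auto host_and_device = SplitDeviceAndHostModuleToy("fn_add");
  auto& host_module = std::get<0>(host_and_device);  // same access pattern as compiler.cc
  auto& device_module = std::get<1>(host_and_device);
  std::cout << host_module << "\n" << device_module << "\n";

  // Structured bindings are an equally readable way to unpack the two modules.
  auto [host, device] = SplitDeviceAndHostModuleToy("fn_mul");
  std::cout << host << "\n" << device << "\n";
  return 0;
}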

paddle/cinn/common/arch.h

Lines changed: 3 additions & 0 deletions
@@ -17,6 +17,7 @@
 #include <functional>
 #include <ostream>
 #include <variant>
+#include "paddle/common/overloaded.h"
 
 namespace cinn {
 namespace common {
@@ -45,6 +46,8 @@ struct Arch final : public ArchBase {
     return static_cast<const ArchBase&>(*this);
   }
 
+  DEFINE_MATCH_METHOD();
+
   bool operator==(const auto& other) const {
     return this->index() == other.index();
   }
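
The DEFINE_MATCH_METHOD() added here comes from paddle/common/overloaded.h and is what makes the arch.Match(...) calls in the earlier hunks possible. A rough, self-contained sketch of the pattern using plain std::visit plus the usual overloaded helper; the tag types and the member body are simplified stand-ins, not the actual macro expansion:

#include <iostream>
#include <variant>

// Stand-in tags for the architectures declared in paddle/cinn/common/arch.h.
struct UnknownArch {};
struct X86Arch {};
struct ARMArch {};
struct NVGPUArch {};
using ArchBase = std::variant<UnknownArch, X86Arch, ARMArch, NVGPUArch>;

// The classic "overloaded" helper: one callable built from several lambdas.
template <typename... Ts>
struct Overloaded : Ts... {
  using Ts::operator()...;
};
template <typename... Ts>
Overloaded(Ts...) -> Overloaded<Ts...>;

struct Arch : public ArchBase {
  using ArchBase::ArchBase;

  // Rough equivalent of what a DEFINE_MATCH_METHOD()-style member provides.
  template <typename... Fs>
  decltype(auto) Match(Fs&&... fs) const {
    return std::visit(Overloaded{std::forward<Fs>(fs)...},
                      static_cast<const ArchBase&>(*this));
  }
};

int main() {
  Arch arch{NVGPUArch{}};
  arch.Match(
      // Non-GPU arches are grouped into one overload, as in the diffs above.
      [](std::variant<UnknownArch, X86Arch, ARMArch>) {
        std::cout << "host-only arch: nothing to launch\n";
      },
      [](NVGPUArch) { std::cout << "select the CUDA kernel-launch intrinsic\n"; });
  return 0;
}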

paddle/cinn/common/cuda_test_helper.cc

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@
 
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/backends/codegen_cuda_host.h"
-#include "paddle/cinn/backends/codegen_cuda_util.h"
+#include "paddle/cinn/backends/codegen_device_util.h"
 #include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
 #include "paddle/cinn/runtime/cuda/cuda_module.h"
 #include "paddle/cinn/runtime/cuda/cuda_util.h"
@@ -28,7 +28,7 @@ namespace common {
 void CudaModuleTester::Compile(const ir::Module& m,
                                const std::string& rewrite_cuda_code) {
   auto _host_module_device_module_ =
-      backends::SplitCudaAndHostModule(m);  // NOLINT
+      backends::SplitDeviceAndHostModule(m);  // NOLINT
   auto& host_module = std::get<0>(_host_module_device_module_);
   auto& device_module = std::get<1>(_host_module_device_module_);
   CHECK(!host_module.functions().empty());

paddle/cinn/common/target.cc

Lines changed: 6 additions & 0 deletions
@@ -249,6 +249,12 @@ const Target &DefaultNVGPUTarget() {
   return target;
 }
 
+const Target &DefaultDeviceTarget() {
+#ifdef CINN_WITH_CUDA
+  return DefaultNVGPUTarget();
+#endif
+}
+
 int GetMaxThreads() {
   // cudaDeviceGetAttribute ( int* value, cudaDeviceAttr attr, int device )
   int max_threads = 1;
