1919#include < string>
2020#include < tuple>
2121#include < vector>
22-
22+ # ifdef CINN_WITH_CUDA
2323#include " paddle/cinn/backends/codegen_cuda_dev.h"
24+ #endif
2425#include " paddle/cinn/cinn.h"
2526#include " paddle/cinn/ir/ir.h"
2627#include " paddle/cinn/ir/ir_mutator.h"
2728#include " paddle/cinn/ir/utils/ir_copy.h"
29+ #include " paddle/cinn/runtime/flags.h"
2830
2931namespace cinn {
3032namespace backends {
@@ -43,7 +45,7 @@ namespace backends {
4345 * - replace the original kernel function with a Call node and add it to the
4446 * first module, add a device kernel function to the second module.
4547 */
46- std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule (ir::Module module );
48+ std::tuple<ir::Module, ir::Module> SplitDeviceAndHostModule (ir::Module module );
4749
4850namespace detail {
4951
@@ -52,7 +54,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
5254 : host_module_builder(module_name + " _host" ,
5355 cinn::common::DefaultHostTarget ()),
5456 device_module_builder(module_name + " _gpu_device" ,
55- cinn::common::DefaultNVGPUTarget ()) {}
57+ cinn::common::DefaultDeviceTarget ()) {}
5658
5759 std::tuple<ir::Module, ir::Module> operator ()(Expr* expr) {
5860 ir::IRMutator<>::Visit (expr, expr);
@@ -109,9 +111,18 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
109111 // shared_mem_bytes can be calculated after codegen_cuda_dev buffer creation;
110112 // however, this makes CodeGenCUDA_Dev run before splitting the host and
111113 // device module. Maybe we could reorder the process.
112- CodeGenCUDA_Dev codegen_dev (cinn::common::DefaultNVGPUTarget ());
113- codegen_dev.Compile (ir::LoweredFunc (func));
114- Expr shared_mem_bytes = codegen_dev.GetDynSharedMemOffset ();
114+ std::optional<Expr> shared_mem_bytes;
115+ cinn::common::DefaultDeviceTarget ().arch .Match (
116+ [&](std::variant<common::UnknownArch,
117+ common::X86Arch,
118+ common::ARMArch>) { CINN_NOT_IMPLEMENTED; },
119+ [&](common::NVGPUArch) {
120+ #ifdef CINN_WITH_CUDA
121+ CodeGenCUDA_Dev codegen_dev (cinn::common::DefaultNVGPUTarget ());
122+ codegen_dev.Compile (ir::LoweredFunc (func));
123+ shared_mem_bytes = codegen_dev.GetDynSharedMemOffset ();
124+ #endif
125+ });
115126
116127 VLOG (6 ) << " Add a call node for func->name " << func->name << " \n "
117128 << " grid_dim: (" << func->cuda_axis_info .grid_dim (0 ) << " , "
@@ -120,10 +131,20 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
120131 << " block_dim: (" << func->cuda_axis_info .block_dim (0 ) << " , "
121132 << func->cuda_axis_info .block_dim (1 ) << " , "
122133 << func->cuda_axis_info .block_dim (2 ) << " ), "
123- << " shared_mem: " << shared_mem_bytes;
134+ << " shared_mem: " << shared_mem_bytes.value ();
135+
136+ std::optional<const char *> call_kernel;
137+ cinn::common::DefaultDeviceTarget ().arch .Match (
138+ [&](std::variant<common::UnknownArch,
139+ common::X86Arch,
140+ common::ARMArch>) { CINN_NOT_IMPLEMENTED; },
141+ [&](common::NVGPUArch) {
142+ call_kernel = runtime::intrinsic::call_cuda_kernel;
143+ });
144+
124145 auto call_extern_api =
125146 ir::Call::Make (Void (),
126- runtime::intrinsic::call_cuda_kernel ,
147+ call_kernel. value () ,
127148 {kernel_ptr,
128149 kernel_args,
129150 kernel_args_num,
@@ -133,7 +154,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
133154 func->cuda_axis_info .block_dim (0 ), // block_x
134155 func->cuda_axis_info .block_dim (1 ), // block_y
135156 func->cuda_axis_info .block_dim (2 ), // block_z
136- shared_mem_bytes,
157+ shared_mem_bytes. value () ,
137158 kernel_stream},
138159 {},
139160 ir::CallType::Extern,
0 commit comments