Skip to content
2 changes: 1 addition & 1 deletion cmake/inference_lib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ copy(inference_lib_dist
# the header file of pten is copied to the experimental directory,
# the include path of pten needs to be changed to adapt to inference api path
add_custom_command(TARGET inference_lib_dist POST_BUILD
COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten.cmake"
COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten_header.cmake"
COMMENT "Change pten header include path to adapt to inference api path")

# CAPI inference library for only inference
Expand Down
File renamed without changes.
175 changes: 175 additions & 0 deletions cmake/pten_kernel.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# call kernel_declare need to make sure the target of input is exists
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

后续pr 中可以删除注释中的 is

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, thx

function(kernel_declare TARGET_LIST)
foreach(kernel_path ${TARGET_LIST})
file(READ ${kernel_path} kernel_impl)
# TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL
# NOTE(chenweihang): now we don't recommend to use digit in kernel name
string(REGEX MATCH "PT_REGISTER_CTX_KERNEL\\([ \t\r\n]*[a-z_]*," first_registry "${kernel_impl}")
if (NOT first_registry STREQUAL "")
# parse the first kernel name
string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}")
string(REPLACE "," "" kernel_name "${kernel_name}")
string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
# append kernel declare into declarations.h
# TODO(chenweihang): default declare ALL_LAYOUT for each kernel
if (${kernel_path} MATCHES "./cpu\/")
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./gpu\/")
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./xpu\/")
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./npu\/*")
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, NPU, ALL_LAYOUT);\n")
else ()
# deal with device independent kernel, now we use CPU temporaary
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
endif()
endif()
endforeach()
endfunction()

function(kernel_library TARGET)
set(common_srcs)
set(cpu_srcs)
set(gpu_srcs)
set(xpu_srcs)
set(npu_srcs)

set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})

list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
# one kernel only match one impl file in each backend
# TODO(chenweihang): parse compile deps by include headers
if (${kernel_library_SRCS_len} EQUAL 0)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
if (WITH_GPU OR WITH_ROCM)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
endif()
endif()
if (WITH_XPU)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
endif()
endif()
if (WITH_ASCEND_CL)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
list(APPEND npu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
endif()
endif()
else()
# TODO(chenweihang): impl compile by source later
endif()

list(LENGTH common_srcs common_srcs_len)
list(LENGTH cpu_srcs cpu_srcs_len)
list(LENGTH gpu_srcs gpu_srcs_len)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH npu_srcs npu_srcs_len)

if (${common_srcs_len} GREATER 0)
# If the kernel has a device independent public implementation,
# we will use this implementation and will not adopt the implementation
# under specific devices
if (WITH_GPU)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
endif()
else()
# If the kernel has a header file declaration, but no corresponding
# implementation can be found, this is not allowed
if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND
${xpu_srcs_len} EQUAL 0 AND ${npu_srcs_len} EQUAL 0)
message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
else()
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
endif()
elseif (WITH_ROCM)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${npu_srcs} DEPS ${kernel_library_DEPS})
endif()
endif()
endif()
endif()

if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
# append target into PTEN_KERNELS property
get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
set(pten_kernels ${pten_kernels} ${TARGET})
set_property(GLOBAL PROPERTY PTEN_KERNELS ${pten_kernels})
endif()

# parse kernel name and auto generate kernel declaration
# here, we don't need to check WITH_XXX, because if not WITH_XXX, the
# xxx_srcs_len will be equal to 0
if (${common_srcs_len} GREATER 0)
kernel_declare(${common_srcs})
endif()
if (${cpu_srcs_len} GREATER 0)
kernel_declare(${cpu_srcs})
endif()
if (${gpu_srcs_len} GREATER 0)
kernel_declare(${gpu_srcs})
endif()
if (${xpu_srcs_len} GREATER 0)
kernel_declare(${xpu_srcs})
endif()
if (${npu_srcs_len} GREATER 0)
kernel_declare(${npu_srcs})
endif()
endfunction()

function(register_kernels)
set(options "")
set(oneValueArgs "")
set(multiValueArgs EXCLUDES DEPS)
cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})

file(GLOB KERNELS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_kernel.h")
string(REPLACE ".h" "" KERNELS "${KERNELS}")
list(LENGTH register_kernels_DEPS register_kernels_DEPS_len)

foreach(target ${KERNELS})
list(FIND register_kernels_EXCLUDES ${target} _index)
if (${_index} EQUAL -1)
if (${register_kernels_DEPS_len} GREATER 0)
kernel_library(${target} DEPS ${register_kernels_DEPS})
else()
kernel_library(${target})
endif()
endif()
endforeach()
endfunction()
6 changes: 4 additions & 2 deletions paddle/pten/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ add_subdirectory(tests)

# make an unity target for compile deps
set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context)
set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu conj_kernel_cpu scale_kernel_cpu full_kernel_cpu flatten)
get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels})
set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu conj_kernel_cpu)
set(PTEN_DEPS ${PTEN_DEPS} nary unary binary)
if(WITH_GPU OR WITH_ROCM)
set(PTEN_DEPS ${PTEN_DEPS} math_gpu linalg_gpu manipulation_gpu conj_kernel_gpu scale_kernel_gpu full_kernel_gpu)
set(PTEN_DEPS ${PTEN_DEPS} math_gpu linalg_gpu manipulation_gpu conj_kernel_gpu)
endif()
if(WITH_XPU)
set(PTEN_DEPS ${PTEN_DEPS} manipulation_xpu)
Expand Down
7 changes: 0 additions & 7 deletions paddle/pten/api/lib/kernel_declare.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,18 @@ limitations under the License. */
// the kernel declare statement is automatically generated according to the
// file name of the kernel, and this header file will be removed

PT_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(dot, CPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(flatten, CPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(sign, CPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(conj, CPU, ALL_LAYOUT);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_KERNEL(full_like, GPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(dot, GPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(cast, GPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(flatten, GPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(sign, GPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(scale, GPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(conj, GPU, ALL_LAYOUT);
#endif

#ifdef PADDLE_WITH_XPU
PT_DECLARE_KERNEL(flatten, XPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(reshape, XPU, ALL_LAYOUT);
#endif
29 changes: 21 additions & 8 deletions paddle/pten/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
include(pten_kernel)

set(kernel_declare_file ${PADDLE_BINARY_DIR}/paddle/pten/kernels/declarations.h.tmp CACHE INTERNAL "declarations.h file")
set(kernel_declare_file_final ${PADDLE_BINARY_DIR}/paddle/pten/kernels/declarations.h)
file(WRITE ${kernel_declare_file} "// Generated by the paddle/pten/kernels/CMakeLists.txt. DO NOT EDIT!\n\n#pragma once\n\n")

# kernel primitive api
add_subdirectory(primitive)
# pten hybird functors and functions called by kernels
Expand All @@ -18,17 +24,24 @@ if(WITH_XPU)
add_subdirectory(xpu)
endif()

set(FLATTEN_DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
# pten depends all pten kernel targets
set_property(GLOBAL PROPERTY PTEN_KERNELS "")

set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function)

# auto build kernel targets by cmake
register_kernels(EXCLUDES flatten_kernel DEPS ${COMMON_KERNEL_DEPS})
# TODO(chenweihang): auto parse compile deps by include headers later
set(FLATTEN_DEPS ${COMMON_KERNEL_DEPS} utils_cpu unary)
if(WITH_GPU OR WITH_ROCM)
set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_gpu)
elseif(WITH_XPU)
set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_xpu)
endif()
kernel_library(flatten_kernel DEPS ${FLATTEN_DEPS})

if(WITH_GPU)
nv_library(flatten SRCS flatten_kernel.cc DEPS ${FLATTEN_DEPS})
elseif(WITH_ROCM)
hip_library(flatten SRCS flatten_kernel.cc DEPS ${FLATTEN_DEPS})
else()
cc_library(flatten SRCS flatten_kernel.cc DEPS ${FLATTEN_DEPS})
endif()
get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
message(STATUS "PTEN_KERNELS: ${pten_kernels}")

copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
2 changes: 0 additions & 2 deletions paddle/pten/kernels/cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@ cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory
cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
cc_library(scale_kernel_cpu SRCS scale_kernel.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
cc_library(full_kernel_cpu SRCS full_kernel.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
cc_library(conj_kernel_cpu SRCS conj_kernel.cc DEPS dense_tensor kernel_context kernel_factory)
1 change: 0 additions & 1 deletion paddle/pten/kernels/flatten_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
// limitations under the License.

#include "paddle/pten/kernels/flatten_kernel.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/backends/all_context.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/infermeta/unary.h"
Expand Down
4 changes: 0 additions & 4 deletions paddle/pten/kernels/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,11 @@ if(WITH_GPU)
nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
nv_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
nv_library(scale_kernel_gpu SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
nv_library(full_kernel_gpu SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
nv_library(conj_kernel_gpu SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory)
elseif(WITH_ROCM)
hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
hip_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
hip_library(scale_kernel_gpu SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
hip_library(full_kernel_gpu SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
hip_library(conj_kernel_gpu SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory)
endif()
1 change: 1 addition & 0 deletions python/paddle/utils/code_gen/api_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ def source_include(header_file_path):
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/include/core.h"
#include "paddle/pten/include/infermeta.h"
#include "paddle/pten/kernels/declarations.h"
"""


Expand Down