Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/eager/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
set(generated_deps dygraph_function dygraph_node)

if(NOT DEFINED ON_INFER)
if(NOT ON_INFER)
message("Performing Eager Dygraph Auto Code Generation")
add_subdirectory(auto_code_generator)
endif()
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/eager/api/generated/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
add_subdirectory(eager_generated)

if(NOT DEFINED ON_INFER)
if(NOT ON_INFER)
add_subdirectory(fluid_generated)
endif()
33 changes: 31 additions & 2 deletions paddle/fluid/eager/auto_code_generator/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,38 @@ execute_process(
)

if(WIN32)
set(EAGER_CODEGEN_DEPS eager_generator)
# Locate the directory that will contain eager_generator.exe.
# Ninja is a single-config generator and puts binaries directly in the
# current binary dir; multi-config generators (Visual Studio) append a
# per-configuration subdirectory (e.g. Release/).
if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}")
else()
# NOTE(review): CMAKE_BUILD_TYPE is typically empty under multi-config
# generators — assumes this project sets it explicitly; verify.
set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
endif()

# Stage the BLAS runtime DLL next to eager_generator.exe so the code
# generator can be executed on Windows, and register the copied DLL as a
# dependency of the codegen step.
#
# NOTE(review): the original wrote "else(${CBLAS_PROVIDER} STREQUAL
# EXTERN_OPENBLAS)". CMake *ignores* arguments to else(), so that
# condition was never evaluated — the branch runs for ANY non-MKLML
# provider. Behavior is preserved here with a bare else(); if an
# elseif() was actually intended, that must be an explicit change.
if("${CBLAS_PROVIDER}" STREQUAL "MKLML")
  message("Copied libiomp5md.dll for Eager AutoCodeGen")
  # Intel OpenMP runtime shipped with MKLML is required at run time.
  add_custom_command(OUTPUT ${eager_generator_path}/libiomp5md.dll
    COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${eager_generator_path}
    DEPENDS mklml
    VERBATIM)
  list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll)
else()
  message("Copied openblas.dll for Eager AutoCodeGen")
  add_custom_command(OUTPUT ${eager_generator_path}/openblas.dll
    COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${eager_generator_path}
    DEPENDS extern_openblas
    VERBATIM)
  list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/openblas.dll)
endif()

# When oneDNN (MKL-DNN) is enabled, stage mkldnn.dll next to
# eager_generator.exe and make the codegen step depend on the copy.
if(WITH_MKLDNN)
  message("Copied mkldnn.dll for Eager AutoCodeGen")
  # lowercase command + VERBATIM for portable, correctly-escaped args
  add_custom_command(OUTPUT ${eager_generator_path}/mkldnn.dll
    COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${eager_generator_path}
    DEPENDS mkldnn
    VERBATIM)
  list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll)
endif()

add_custom_target(eager_codegen
COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated"
DEPENDS eager_generator
COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated"
DEPENDS ${EAGER_CODEGEN_DEPS}
VERBATIM)
else()
add_custom_target(eager_codegen
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/eager/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
# Test suites for the eager-mode (dygraph) module.
add_subdirectory(data_structure_tests)
add_subdirectory(task_tests)

# Performance benchmarks depend on the generated dygraph API, which is
# only produced when not building for inference (see the NOT ON_INFER
# guard around auto_code_generator).
if(NOT ON_INFER)
add_subdirectory(performance_tests)
endif()
7 changes: 7 additions & 0 deletions paddle/fluid/eager/tests/performance_tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Shared helpers for the benchmark binaries below; links both eager-mode
# and fluid (imperative) dependency sets so either path can be timed.
cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op)

# CPU benchmarks: eager mode vs. fluid imperative mode.
cc_test(test_egr_performance_benchmark_eager_cpu SRCS benchmark_eager_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
cc_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})

# CUDA benchmarks: the sources guard their bodies with
# PADDLE_WITH_CUDA/PADDLE_WITH_HIP, so these compile on CPU-only builds.
cc_test(test_egr_performance_benchmark_eager_cuda SRCS benchmark_eager_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
cc_test(test_egr_performance_benchmark_fluid_cuda SRCS benchmark_fluid_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
20 changes: 10 additions & 10 deletions paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

#include "paddle/fluid/imperative/tracer.h"

#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"

#ifdef WITH_GPERFTOOLS
Expand All @@ -42,11 +42,11 @@ TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }

TEST(Benchmark, EagerScaleCPU) {
// Prepare Device Contexts
egr::InitEnv(paddle::platform::CPUPlace());
eager_test::InitEnv(paddle::platform::CPUPlace());

for (const std::string& mode : {"Accuracy", "Performance"}) {
paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4});
egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue(
egr::EagerTensor tensor = CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 5.0, true);
RetainGradForTensor(tensor);
Expand Down Expand Up @@ -78,20 +78,20 @@ TEST(Benchmark, EagerScaleCPU) {

TEST(Benchmark, EagerIntermediateMatmulCPU) {
// Prepare Device Contexts
InitEnv(paddle::platform::CPUPlace());
eager_test::InitEnv(paddle::platform::CPUPlace());

auto tracer = std::make_shared<paddle::imperative::Tracer>();
paddle::imperative::SetCurrentTracer(tracer);

for (const std::string& mode : {"Accuracy", "Performance"}) {
paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2});
egr::EagerTensor X = EagerUtils::CreateTensorWithValue(
egr::EagerTensor X = CreateTensorWithValue(
ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 1.0, true);
RetainGradForTensor(X);

paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2});
egr::EagerTensor Y = EagerUtils::CreateTensorWithValue(
egr::EagerTensor Y = CreateTensorWithValue(
ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 2.0, true);
RetainGradForTensor(Y);
Expand Down Expand Up @@ -122,15 +122,15 @@ TEST(Benchmark, EagerIntermediateMatmulCPU) {

TEST(Benchmark, EagerIntermediateMLPCPU) {
// Prepare Device Contexts
InitEnv(paddle::platform::CPUPlace());
eager_test::InitEnv(paddle::platform::CPUPlace());

auto tracer = std::make_shared<paddle::imperative::Tracer>();
paddle::imperative::SetCurrentTracer(tracer);

for (const std::string& mode : {"Accuracy", "Performance"}) {
paddle::framework::DDim ddimX =
paddle::framework::make_ddim({MLP_M, MLP_N});
egr::EagerTensor X = EagerUtils::CreateTensorWithValue(
egr::EagerTensor X = CreateTensorWithValue(
ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_X_VAL, true);
RetainGradForTensor(X);
Expand All @@ -140,13 +140,13 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
paddle::framework::DDim ddimW =
paddle::framework::make_ddim({MLP_N, MLP_K});
egr::EagerTensor W = EagerUtils::CreateTensorWithValue(
egr::EagerTensor W = CreateTensorWithValue(
ddimW, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_W_VAL, true);
RetainGradForTensor(W);

paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K});
egr::EagerTensor B = EagerUtils::CreateTensorWithValue(
egr::EagerTensor B = CreateTensorWithValue(
ddimB, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_B_VAL, true);
RetainGradForTensor(B);
Expand Down
24 changes: 14 additions & 10 deletions paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

#include "paddle/fluid/imperative/tracer.h"

#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"

#ifdef WITH_GPERFTOOLS
Expand All @@ -38,12 +38,14 @@ DECLARE_bool(run_pten_kernel);

TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

TEST(Benchmark, EagerScaleCUDA) {
egr::InitEnv(paddle::platform::CUDAPlace());
eager_test::InitEnv(paddle::platform::CUDAPlace());

for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4});
egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue(
egr::EagerTensor tensor = CreateTensorWithValue(
ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
RetainGradForTensor(tensor);
Expand Down Expand Up @@ -77,21 +79,21 @@ TEST(Benchmark, EagerScaleCUDA) {

TEST(Benchmark, EagerIntermediateMatmulCUDA) {
paddle::platform::CUDAPlace place;
egr::InitEnv(place);
eager_test::InitEnv(place);

auto tracer = std::make_shared<paddle::imperative::Tracer>();
tracer->SetExpectedPlace(place);
paddle::imperative::SetCurrentTracer(tracer);

for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2});
egr::EagerTensor X = EagerUtils::CreateTensorWithValue(
egr::EagerTensor X = CreateTensorWithValue(
ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 1.0, true);
RetainGradForTensor(X);

paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2});
egr::EagerTensor Y = EagerUtils::CreateTensorWithValue(
egr::EagerTensor Y = CreateTensorWithValue(
ddimY, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 2.0, true);
RetainGradForTensor(Y);
Expand Down Expand Up @@ -125,7 +127,7 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) {

TEST(Benchmark, EagerIntermediateMLPCUDA) {
paddle::platform::CUDAPlace place;
egr::InitEnv(place);
eager_test::InitEnv(place);

auto tracer = std::make_shared<paddle::imperative::Tracer>();
tracer->SetExpectedPlace(place);
Expand All @@ -134,7 +136,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
paddle::framework::DDim ddimX =
paddle::framework::make_ddim({MLP_M, MLP_N});
egr::EagerTensor X = EagerUtils::CreateTensorWithValue(
egr::EagerTensor X = CreateTensorWithValue(
ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_X_VAL, true);
RetainGradForTensor(X);
Expand All @@ -144,13 +146,13 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
paddle::framework::DDim ddimW =
paddle::framework::make_ddim({MLP_N, MLP_K});
egr::EagerTensor W = EagerUtils::CreateTensorWithValue(
egr::EagerTensor W = CreateTensorWithValue(
ddimW, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_W_VAL, true);
RetainGradForTensor(W);

paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K});
egr::EagerTensor B = EagerUtils::CreateTensorWithValue(
egr::EagerTensor B = CreateTensorWithValue(
ddimB, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_B_VAL, true);
RetainGradForTensor(B);
Expand Down Expand Up @@ -185,3 +187,5 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
}
}
}

#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include "glog/logging.h"
#include "gtest/gtest.h"

#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/tracer.h"
Expand All @@ -45,7 +45,7 @@ namespace imperative {
TEST(Benchmark, FluidScaleCPU) {
// Prepare Device Contexts
platform::CPUPlace place;
egr::InitEnv(place);
eager_test::InitEnv(place);

for (const std::string& mode : {"Accuracy", "Performance"}) {
std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
Expand Down Expand Up @@ -88,7 +88,7 @@ TEST(Benchmark, FluidScaleCPU) {
TEST(Benchmark, FluidMatmulCPU) {
// Prepare Device Contexts
platform::CPUPlace place;
egr::InitEnv(place);
eager_test::InitEnv(place);

for (const std::string& mode : {"Accuracy", "Performance"}) {
std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
Expand Down Expand Up @@ -141,7 +141,7 @@ TEST(Benchmark, FluidMatmulCPU) {
TEST(Benchmark, FluidMLPCPU) {
// Prepare Device Contexts
platform::CPUPlace place;
egr::InitEnv(place);
eager_test::InitEnv(place);

for (const std::string& mode : {"Accuracy", "Performance"}) {
std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include "glog/logging.h"
#include "gtest/gtest.h"

#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/tracer.h"
Expand All @@ -39,13 +39,15 @@ DECLARE_bool(run_pten_kernel);

TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

namespace paddle {
namespace imperative {

TEST(Benchmark, FluidScaleCUDA) {
// Prepare Device Contexts
platform::CUDAPlace place;
egr::InitEnv(place);
eager_test::InitEnv(place);

for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
Expand Down Expand Up @@ -98,7 +100,7 @@ TEST(Benchmark, FluidScaleCUDA) {
TEST(Benchmark, FluidMatmulCUDA) {
// Prepare Device Contexts
platform::CUDAPlace place;
egr::InitEnv(place);
eager_test::InitEnv(place);

for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
Expand Down Expand Up @@ -161,7 +163,7 @@ TEST(Benchmark, FluidMatmulCUDA) {
TEST(Benchmark, FluidMLPCUDA) {
// Prepare Device Contexts
platform::CUDAPlace place;
egr::InitEnv(place);
eager_test::InitEnv(place);

for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
paddle::platform::DeviceContextPool& pool =
Expand Down Expand Up @@ -252,3 +254,5 @@ USE_OP(scale);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);

#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
Loading