
Commit 22908aa

deal with conflict
Merge commit 22908aa (2 parents: 5e2f2a7 + 870402f)

101 files changed: +5329 additions, −2223 deletions

paddle/fluid/eager/auto_code_generator/eager_generator.cc

Lines changed: 2 additions & 1 deletion
@@ -2653,7 +2653,8 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
       "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
       "#include \"paddle/fluid/eager/amp_utils.h\"\n"
       "#include \"paddle/fluid/eager/amp_auto_cast.h\"\n"
-      "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n";
+      "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n"
+      "#pragma GCC diagnostic ignored \"-Wunused-variable\"\n\n";
   std::string forward_cc_include_str =
       paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
   std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
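
The fix is a one-literal change because the generator assembles the whole preamble from adjacent string literals, which the compiler joins into a single constant before streaming it to the output file. A minimal sketch of that mechanism, with a hypothetical header path rather than the generator's real template:

#include <fstream>
#include <string>

void WriteGeneratedPreamble(const std::string& forward_cc_path) {
  // Adjacent string literals concatenate at compile time, so adding one
  // more generated line is just one more literal in the chain.
  const char* preamble =
      "#include \"some/generated/header.h\"\n"
      "#pragma GCC diagnostic ignored \"-Wunused-variable\"\n\n";
  std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
  forward_cc_stream << preamble;
}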

paddle/fluid/framework/new_executor/data_transfer.cc

Lines changed: 12 additions & 5 deletions
@@ -319,6 +319,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
     }
   }

+  bool transfered = false;
   DataTranferHelper data_transfer_helper(place, var_scope);
   for (auto& var_name_item : *ins_map_temp) {
     bool should_skip_input =
@@ -334,6 +335,9 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
       if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
         tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
       } else if (var->IsType<LoDTensorArray>()) {
+        if (var->Get<LoDTensorArray>().size() == 0) {
+          continue;
+        }
         tensor_in =
             static_cast<const Tensor*>(&(var->Get<LoDTensorArray>()[0]));
       } else {
@@ -389,6 +393,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
       }

       if (is_transferred) {
+        transfered = true;
         // update RuntimeContext.inputs and original op_func_node inputs
         op_func_node->input_index[var_name_item.first][i] =
             var_scope->VarId(new_var_name);
@@ -426,11 +431,13 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
     }
   }

-  // NOTE(zhiqiu): UPDATE the corresponding OperatorBase to make it consistent
-  // with instruction. (hot fix, it is not good design here)
-  op_func_node->operator_base_ =
-      std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
-          op_base->Type(), new_ins, new_outs, op_base->Attrs()));
+  if (transfered) {
+    // NOTE(zhiqiu): UPDATE the corresponding OperatorBase to make it
+    // consistent with instruction. (hot fix, it is not good design here)
+    op_func_node->operator_base_ =
+        std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
+            op_base->Type(), new_ins, new_outs, op_base->Attrs()));
+  }
   op_func_node->no_data_transform_index = std::move(no_data_transform_index);
 }
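
Two defensive patterns are at work in this diff: an empty LoDTensorArray is skipped before the code takes the address of its first element (which would be undefined behavior), and the OperatorBase is rebuilt only when at least one input was actually transferred. A distilled sketch of that control flow, using simplified stand-in types rather than Paddle's:

#include <vector>

struct Tensor {};                         // stand-in for Paddle's Tensor
using TensorArray = std::vector<Tensor>;  // stand-in for LoDTensorArray

// Returns true if any input needed (and received) a transform, so the
// caller can skip the expensive operator rebuild when nothing changed.
bool TransformInputs(std::vector<TensorArray*>& inputs) {
  bool transferred = false;
  for (TensorArray* arr : inputs) {
    if (arr->empty()) {
      continue;  // guard: &(*arr)[0] on an empty vector is undefined behavior
    }
    const Tensor* tensor_in = &(*arr)[0];
    (void)tensor_in;
    // ... suppose the layout/dtype/place check decided a transfer was needed:
    transferred = true;
  }
  return transferred;  // rebuild the operator only if this is true
}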

paddle/fluid/framework/new_executor/interpretercore.cc

Lines changed: 11 additions & 4 deletions
@@ -300,8 +300,16 @@ void InterpreterCore::Convert(
     gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(),
                            platform::GenerateDeviceEventFlag());
   }
+  bool inplaced = false;
+  for (auto inst : vec_instruction_) {
+    if (inst.OpBase()->Type() == "share_buffer" ||
+        inst.OpBase()->Type() == "share_data") {
+      VLOG(4) << "Already inplaced, skip inplace now.";
+      inplaced = true;
+    }
+  }

-  if (FLAGS_new_executor_use_inplace) {
+  if (FLAGS_new_executor_use_inplace && !inplaced) {
     BuildInplace();
   }

@@ -565,12 +573,11 @@ void InterpreterCore::RunNextInstructions(
     const Instruction& instr, std::queue<size_t>* reserved_next_ops,
     std::vector<std::atomic<size_t>>* atomic_deps,
     std::vector<std::atomic<size_t>>* atomic_var_ref) {
-  VLOG(4) << "atomic 1:" << atomic_deps;
   auto& next_instr = instr.NextInstructions();

   auto IsReady = [atomic_deps](size_t next_id) {
-    VLOG(4) << "atomic:" << atomic_deps << " " << &(*atomic_deps)[next_id]
-            << " " << next_id;
+    VLOG(4) << "atomic:" << atomic_deps << " op_id: " << next_id
+            << ", remain deps: " << (*atomic_deps)[next_id];
     return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
   };
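
The IsReady lambda implements a standard atomic dependency countdown: each instruction carries a counter of unfinished upstream ops, every finished predecessor decrements it, and the caller that performs the final decrement (fetch_sub returning 1) is the one that schedules the instruction, so no op is enqueued twice. A self-contained sketch of the scheme; the names are illustrative, not Paddle's API:

#include <atomic>
#include <cstddef>
#include <queue>
#include <vector>

// deps[i] holds the number of unfinished upstream ops of instruction i.
// fetch_sub returns the value *before* the decrement, so exactly one
// caller observes 1 and wins the right to schedule the instruction.
void OnInstructionFinished(const std::vector<size_t>& downstream,
                           std::vector<std::atomic<size_t>>* atomic_deps,
                           std::queue<size_t>* ready_ops) {
  for (size_t next_id : downstream) {
    if ((*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1) {
      ready_ops->push(next_id);
    }
  }
}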

paddle/fluid/framework/new_executor/interpretercore_util.cc

Lines changed: 146 additions & 5 deletions
@@ -428,19 +428,19 @@ void build_op_func_list(const platform::Place& place,
       op_func_node.dev_ctx_ = dev_ctx;
       VLOG(3) << op_with_kernel->Type()
               << " : expected_kernel_key : " << expected_kernel_key;
-      auto exec_ctx =
-          ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context);

       // see OperatorWithKernel::RunImpl in operator.cc for why
       if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
             op->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
         InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
         // TODO(Aurelius84): In case of control flow ops, they are NOT
-        // inherited
-        // from OperatorWithKernel.
+        // inherited from OperatorWithKernel.
         op_with_kernel->Info().infer_shape_(&infer_shape_ctx);
       }

+      auto exec_ctx =
+          ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context);
+
       auto run_phi_kernel = false;
       if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(
               op_with_kernel->Type())) {
@@ -476,7 +476,6 @@ void build_op_func_list(const platform::Place& place,
           op_with_kernel->BuildPhiKernelContext(runtime_context, dev_ctx,
                                                 &pt_kernel_context);
           op_func_node.pt_kernel_ = op_with_kernel->PhiKernel();
-
           (*op_func_node.pt_kernel_)(&pt_kernel_context);
         } else {
           auto kernels_iter = all_op_kernels.find(op->Type());
@@ -711,6 +710,7 @@ std::map<int, std::list<int>> build_op_downstream_map(
   const std::set<std::string> random_op_set = {
       "bernoulli", "poisson", "multinomial", "gaussian_random",
       "uniform_random", "randint", "randperm", "exponential"};
+
   int dependence_op_idx = -1;
   for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) {
     if (random_op_set.count(vec_instruction[op_idx].OpBase()->Type())) {
@@ -721,6 +721,147 @@ std::map<int, std::list<int>> build_op_downstream_map(
     }
   }

+  // add dependency for communication op
+  const std::string communication_op_prefix = "c_";
+  dependence_op_idx = -1;
+  for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) {
+    if (vec_instruction[op_idx].OpBase()->Type().find(
+            communication_op_prefix) != std::string::npos) {
+      if (dependence_op_idx != -1) {
+        op2dependences[op_idx].insert(dependence_op_idx);
+      }
+      dependence_op_idx = op_idx;
+    }
+  }
+
+  // TODO(zhiqiu): there are still some cases not handled
+  // add dependency for c_sync_comm_stream
+
+  // in a program, we can add only one c_sync_comm_stream to sync all
+  // communication ops:
+  // c_allreduce_sum(a)
+  // c_allreduce_sum(b)
+  // c_allreduce_sum(c)
+  // c_sync_comm_stream(a)
+  const std::string kSyncComm = "c_sync_comm_stream";
+  dependence_op_idx = -1;
+  for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) {
+    if (vec_instruction[op_idx].OpBase()->Type() == kSyncComm) {
+      dependence_op_idx = op_idx;
+    } else {
+      if (dependence_op_idx != -1) {
+        VLOG(4) << "Add depend from "
+                << vec_instruction[dependence_op_idx].OpBase()->Type() << " to "
+                << vec_instruction[op_idx].OpBase()->Type();
+        op2dependences[op_idx].insert(dependence_op_idx);
+      }
+    }
+  }
+
+  // add dependency for coalesce_tensor
+  const std::string kCoalesceTensor = "coalesce_tensor";
+  for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) {
+    if (vec_instruction[op_idx].OpBase()->Type() == kCoalesceTensor) {
+      VLOG(4) << "Add depend for " << kCoalesceTensor << " " << op_idx;
+      auto fused_out = vec_instruction[op_idx].Outputs().at("FusedOutput")[0];
+      auto outputs = vec_instruction[op_idx].Outputs().at("Output");
+
+      auto is_read = [](const Instruction& inst, int var_id) -> bool {
+        for (auto pair : inst.Inputs()) {
+          for (auto item : pair.second) {
+            if (item == var_id) {
+              return true;
+            }
+          }
+        }
+        return false;
+      };
+
+      auto is_write = [](const Instruction& inst, int var_id) -> bool {
+        for (auto pair : inst.Outputs()) {
+          for (auto item : pair.second) {
+            if (item == var_id) {
+              return true;
+            }
+          }
+        }
+        return false;
+      };
+
+      // find first op that reads fused_out
+      auto first_read_fused_out_op = -1;
+      for (auto j = op_idx + 1; j < vec_instruction.size(); ++j) {
+        if (is_read(vec_instruction[j], fused_out)) {
+          first_read_fused_out_op = j;
+          break;
+        }
+      }
+
+      if (UNLIKELY(first_read_fused_out_op == -1)) {
+        VLOG(4) << "No op reads FusedOutput";
+        continue;
+      }
+
+      // find ops that write 'outputs' between (op_idx,
+      // first_read_fused_out_op)
+      // add depend: them -> first_read_fused_out_op
+      for (auto j = op_idx + 1;
+           j < static_cast<size_t>(first_read_fused_out_op); ++j) {
+        for (auto var_id : outputs) {
+          if (is_write(vec_instruction[j], var_id)) {
+            op2dependences[first_read_fused_out_op].insert(j);
+            VLOG(4) << j << " -> " << first_read_fused_out_op;
+            VLOG(4)
+                << "Add depend from " << vec_instruction[j].OpBase()->Type()
+                << " to "
+                << vec_instruction[first_read_fused_out_op].OpBase()->Type();
+          }
+        }
+      }
+
+      // find first op that reads 'outputs' between (first_read_fused_out_op,
+      // end)
+      // add depend: first_read_fused_out_op -> first op that reads 'outputs'
+
+      // special case for consecutive communication ops, for example,
+      // FusedOutput = c_sync_calc_stream(FusedOutput)
+      // FusedOutput = c_allreduce_sum(FusedOutput)
+      // FusedOutput = c_sync_comm_stream(FusedOutput)
+      // we should take the last one to add depend instead of
+      // 'first_read_fused_out_op'
+      size_t target = first_read_fused_out_op;
+      for (size_t j = first_read_fused_out_op + 1; j < vec_instruction.size();
+           ++j) {
+        if (j == target + 1 &&
+            vec_instruction[target].OpBase()->Type().find(
                communication_op_prefix) != std::string::npos &&
+            vec_instruction[j].OpBase()->Type().find(communication_op_prefix) !=
+                std::string::npos) {
+          VLOG(4) << "Found consecutive communication ops, "
+                  << vec_instruction[target].OpBase()->Type() << " -> "
+                  << vec_instruction[j].OpBase()->Type();
+          target = j;
+          continue;
+        }
+
+        for (auto var_id : outputs) {
+          if (is_read(vec_instruction[j], var_id)) {
+            op2dependences[j].insert(target);
+            VLOG(4) << target << " -> " << j;
+            VLOG(4) << "Add depend from "
+                    << vec_instruction[target].OpBase()->Type() << " to "
+                    << vec_instruction[j].OpBase()->Type();
+          }
+        }
+      }
+    }
+  }
+  for (auto pair : op2dependences) {
+    VLOG(10) << pair.first << " Depends on " << pair.second.size();
+    std::ostringstream oss;
+    std::copy(pair.second.begin(), pair.second.end(),
+              std::ostream_iterator<int>(oss, " "));
+    VLOG(10) << oss.str();
+  }
   return std::move(get_downstream_map(op2dependences));
 }
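
The first added pass is a simple chaining rule: every op whose type carries the c_ prefix depends on the previous such op, which serializes all communication ops in program order. A stripped-down sketch of that pass over plain op-type strings (simplified types, not Paddle's Instruction):

#include <map>
#include <set>
#include <string>
#include <vector>

// For each op whose type contains `prefix`, record a dependency on the
// previous matching op, so all matching ops execute in program order.
std::map<int, std::set<int>> ChainOpsWithPrefix(
    const std::vector<std::string>& op_types, const std::string& prefix) {
  std::map<int, std::set<int>> op2dependences;
  int dependence_op_idx = -1;
  for (int op_idx = 0; op_idx < static_cast<int>(op_types.size()); ++op_idx) {
    if (op_types[op_idx].find(prefix) != std::string::npos) {
      if (dependence_op_idx != -1) {
        op2dependences[op_idx].insert(dependence_op_idx);
      }
      dependence_op_idx = op_idx;
    }
  }
  return op2dependences;
}

// e.g. ChainOpsWithPrefix({"matmul", "c_allreduce_sum", "relu",
//                          "c_allreduce_sum"}, "c_") yields {3: {1}}:
// the second allreduce waits for the first.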

paddle/fluid/imperative/basic_engine.cc

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,7 @@
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/autotune/switch_autotune.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

 DECLARE_bool(sort_sum_gradient);
@@ -645,6 +646,8 @@ void BasicEngine::Execute() {
   Clear();

   VLOG(1) << "Backward op number: " << op_num;
+
+  phi::autotune::AutoTuneStatus::Instance().Update();
 }

 void BasicEngine::Clear() {
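
The new call ticks the kernel auto-tuning state machine once per backward pass. The underlying pattern is a singleton step counter that decides when to keep collecting timings and when to freeze kernel choices; a minimal sketch of that pattern (hypothetical class and window length, not phi's actual implementation):

#include <cstdint>

// One global tuning clock, advanced once per iteration (e.g. at the end
// of BasicEngine::Execute). While inside the window, kernels are timed;
// afterwards the cached winners are reused.
class AutoTuneStatusSketch {
 public:
  static AutoTuneStatusSketch& Instance() {
    static AutoTuneStatusSketch instance;  // thread-safe init since C++11
    return instance;
  }
  void Update() { ++step_; }
  bool UseAutoTune() const { return step_ < kTuneWindow; }

 private:
  static constexpr int64_t kTuneWindow = 10;  // assumed window length
  int64_t step_ = 0;
};

// usage, mirroring the diff:
//   AutoTuneStatusSketch::Instance().Update();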

paddle/fluid/memory/allocation/allocator_facade.cc

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
 // NOTE(Ruibiao): This FLAGS is just to be compatible with
 // the old single-stream CUDA allocator. It will be removed
 // after StreamSafeCudaAllocator has been fully tested.
-PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false,
+PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true,
                             "Enable StreamSafeCUDAAllocator");

 PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory, false,

paddle/fluid/operators/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -102,10 +102,11 @@ endif()

 set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel)

-register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op
+register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
     recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})

 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
+op_library(quantize_linear_op DEPS cast_kernel)
 op_library(save_combine_op DEPS string_array)
 op_library(load_combine_op DEPS string_array)

paddle/fluid/operators/detection/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ detection_library(yolo_box_op SRCS yolo_box_op.cc)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu)
 detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
+detection_library(nms_op SRCS nms_op.cc nms_op.cu)

 if(WITH_GPU OR WITH_ROCM)
   set(TMPDEPS memory)
