Skip to content

多线程时,同一输入做两个embedding程序会崩溃 #9200

@jshower

Description

@jshower

在多线程时,使用如下网络结构进行训练时,在多线程时会出现训练崩溃的问题(单线程不会),在末尾处贴了错误的日志。

import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def ner_net(word_dict_len, label_dict_len, parallel):
    """Build a BiGRU-CRF named-entity-recognition network.

    Args:
        word_dict_len (int): vocabulary size of the word embedding table.
        label_dict_len (int): number of NER labels (CRF emission width).
        parallel (bool): if True, wrap graph construction in a ParallelDo
            block so each place gets its own copy of the subgraph.

    Returns:
        tuple: (avg_cost, emission, word, mention, target) — the mean CRF
        cost, the emission layer output, and the three input data layers.
    """
    IS_SPARSE = True
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36

    def _l2_attr():
        # Fresh ParamAttr with L2 weight decay; one instance per layer,
        # mirroring the original per-call construction.
        return fluid.ParamAttr(
            regularizer=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=1e-4))

    # BUG FIX: the second parameter was named `mark` while the body read the
    # enclosing-scope `mention` variable. Inside ParallelDo the per-place
    # tensor `mention_` was therefore passed in but silently ignored, and the
    # un-split outer `mention` leaked into every place's subgraph. Renaming
    # the parameter to `mention` shadows the outer name so the value actually
    # passed by the caller is used.
    def _net_conf(word, mention, target):
        # Two independent embedding pairs: one feeds the forward GRU path,
        # the `_r` pair feeds the reverse GRU path.
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        word_embedding_r = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        mention_embedding_r = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))

        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)
        word_mention_vector_r = fluid.layers.concat(
            input=[word_embedding_r, mention_embedding_r], axis=1)

        # Forward GRU path: fc projects to 3*hidden (gate/reset/candidate).
        pre_gru = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=_l2_attr())
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden,
            param_attr=_l2_attr())

        # Reverse GRU path over the second embedding pair.
        pre_gru_r = fluid.layers.fc(
            input=word_mention_vector_r,
            size=grnn_hidden * 3,
            param_attr=_l2_attr())
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=_l2_attr())

        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)
        emission = fluid.layers.fc(
            size=label_dict_len,
            input=gru_merged,
            param_attr=_l2_attr())
        # CRF transition weights are shared by name ('crfw'); the original
        # deliberately left its regularizer commented out, so none is set.
        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(name='crfw', learning_rate=0.2))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(
        name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(
        name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)

    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            # Split each input across places; the split tensors now reach
            # _net_conf through its (fixed) parameters.
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)

    return avg_cost, emission, word, mention, target

将上述程序稍作调整,去掉两个不同的embedding,则不会出错。想请教一下是什么原因,感觉这是一个对用户不友好的地方,因为对同一个输入做多个不同的embedding实际中也是存在的。

import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def ner_net(word_dict_len, label_dict_len, parallel):
    """Build the single-embedding-pair variant of the BiGRU-CRF NER network.

    Unlike the first variant, both the forward and reverse GRU paths read
    the same concatenated embedding vector, and the embedding tables carry
    explicit parameter names ("word_emb_f", "men_emb_r").

    Args:
        word_dict_len (int): vocabulary size of the word embedding table.
        label_dict_len (int): number of NER labels (CRF emission width).
        parallel (bool): if True, wrap graph construction in a ParallelDo
            block so each place gets its own copy of the subgraph.

    Returns:
        tuple: (avg_cost, emission, word, mention, target) — the mean CRF
        cost, the emission layer output, and the three input data layers.
    """
    IS_SPARSE = True
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36

    def _l2_attr():
        # Fresh ParamAttr with L2 weight decay; one instance per layer,
        # mirroring the original per-call construction.
        return fluid.ParamAttr(
            regularizer=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=1e-4))

    # BUG FIX: the second parameter was named `mark` while the body read the
    # enclosing-scope `mention` variable. Inside ParallelDo the per-place
    # tensor `mention_` was therefore passed in but silently ignored, and the
    # un-split outer `mention` leaked into every place's subgraph. Renaming
    # the parameter to `mention` shadows the outer name so the value actually
    # passed by the caller is used. The large triple-quoted blocks of dead
    # (commented-out) second-embedding code were no-op string statements and
    # have been removed.
    def _net_conf(word, mention, target):
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="word_emb_f", learning_rate=5))
        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="men_emb_r", learning_rate=5))

        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)

        # Forward GRU path: fc projects to 3*hidden (gate/reset/candidate).
        pre_gru = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=_l2_attr())
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden,
            param_attr=_l2_attr())

        # Reverse GRU path reads the SAME vector (this is the variant that
        # avoids the duplicate-embedding crash).
        pre_gru_r = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=_l2_attr())
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=_l2_attr())

        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)
        emission = fluid.layers.fc(
            size=label_dict_len,
            input=gru_merged,
            param_attr=_l2_attr())
        # CRF transition weights are shared by name ('crfw'); the original
        # deliberately left its regularizer commented out, so none is set.
        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(name='crfw', learning_rate=0.2))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(
        name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(
        name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)

    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            # Split each input across places; the split tensors now reach
            # _net_conf through its (fixed) parameters.
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)

    return avg_cost, emission, word, mention, target
F0319 08:04:48.310140 3321 threadpool.h:96] The exception is thrown inside the thread pool. You should use RunAndGetException to handle the exception. The default exception handler is LOG(FATAL).enforce dtype != -1 failed, -1 == -1 Sum operator should have at least one tensor at [/paddle_gpu/Paddle/paddle/fluid/operators/sum_op.cc:73] PaddlePaddle Call Stacks: 0 0x7f2c50a1c48cp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572 1 0x7f2c514f6138p paddle::operators::SumOp::GetExpectedKernelType(paddle::framework::ExecutionContext const&) const + 1912 2 0x7f2c5161682dp paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 381 3 0x7f2c50acd4a5p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool) + 1781 4 0x7f2c50acea5fp paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 63 5 0x7f2c513f38b3p std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > >, 
std::__future_base::_Result_base::_Deleter>, std::_Bind_simple<std::reference_wrapper<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}> ()>, std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > >::_M_invoke(std::_Any_data const&) + 99 6 0x7f2c513f058ep 
std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) + 46 7 0x7f2c8a954a99p 8 0x7f2c513f0bd2p std::__future_base::_State_baseV2::_M_set_result(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>, bool) + 146 9 0x7f2c513f0d46p std::__future_base::_Task_state<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<std::unique_ptr> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, 
boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}, std::allocator<int>, std::default_delete<std::unique_ptr> ()>::_M_run() + 86 10 0x7f2c516315d4p paddle::framework::ThreadPool::TaskLoop() + 1012 11 0x7f2c7eab8c80p 12 0x7f2c8a94d6bap 13 0x7f2c8a68341dp clone + 109 *** Check failure stack trace: *** @ 0x7f2c517decad google::LogMessage::Fail() @ 0x7f2c517e0ff8 google::LogMessage::SendToLog() @ 0x7f2c517de7bb google::LogMessage::Flush() @ 0x7f2c517e1ece google::LogMessageFatal::~LogMessageFatal() @ 0x7f2c513f1847 std::_Function_handler<>::_M_invoke() @ 0x7f2c513f058e std::__future_base::_State_baseV2::_M_do_set() @ 0x7f2c8a954a99 __pthread_once_slow @ 0x7f2c513f0bd2 std::__future_base::_State_baseV2::_M_set_result() @ 0x7f2c513f0c91 std::__future_base::_Deferred_state<>::_M_complete_async() @ 0x7f2c513fa32a paddle::operators::ParallelDoGradOp::RunImpl() @ 0x7f2c50acd4a5 paddle::framework::Executor::RunPreparedContext() @ 0x7f2c50acea5f paddle::framework::Executor::Run() @ 0x7f2c50a38fc3 _ZZN8pybind1112cpp_function10initializeIZNS0_C4IvN6paddle9framework8ExecutorEIRKNS4_11ProgramDescEPNS4_5ScopeEibbEINS_4nameENS_9is_methodENS_7siblingEEEEMT0_FT_DpT1_EDpRKT2_EUlPS5_S8_SA_ibbE_vISO_S8_SA_ibbEISB_SC_SD_EEEvOSF_PFSE_SH_ESN_ENUlRNS_6detail13function_callEE1_4_FUNESV_ @ 0x7f2c50a36d04 pybind11::cpp_function::dispatcher() @ 0x4c37ed PyEval_EvalFrameEx @ 0x4b9ab6 PyEval_EvalCodeEx @ 0x4c16e7 PyEval_EvalFrameEx @ 0x4b9ab6 PyEval_EvalCodeEx @ 0x4c1e6f PyEval_EvalFrameEx @ 0x4b9ab6 PyEval_EvalCodeEx @ 0x4eb30f (unknown) @ 0x4e5422 PyRun_FileExFlags @ 0x4e3cd6 PyRun_SimpleFileExFlags @ 0x493ae2 Py_Main @ 0x7f2c8a59c830 __libc_start_main @ 0x4933e9 _start @ (nil) (unknown) Aborted 

如果需要完整的环境来复现问题,可以在Hi上联系我(jiaozhenyu)。

Metadata

Metadata

Labels

User用于标记用户问题

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions