Commit 092839d

[psgpu]add checknan print and fix trainer device (#38131)
* trainer_device fix and checknan tool for psgpu;test=develop
* disable show_one_table;test=develop
1 parent 25c1b62 commit 092839d

5 files changed: 77 additions, 25 deletions

paddle/fluid/framework/fleet/ps_gpu_wrapper.cc

Lines changed: 3 additions & 3 deletions
@@ -454,9 +454,9 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
     this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(),
                              gpu_task->device_values_[i].data(),
                              feature_keys_count[i], 500000, 2);
-    if (feature_keys_count[i] > 0) {
-      HeterPs_->show_one_table(i);
-    }
+    // if (feature_keys_count[i] > 0) {
+    //   HeterPs_->show_one_table(i);
+    // }
   };
   for (size_t i = 0; i < threads.size(); i++) {
     threads[i] = std::thread(build_func, i);

paddle/fluid/framework/ps_gpu_trainer.cc

Lines changed: 8 additions & 3 deletions
@@ -75,6 +75,8 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
     workers_[i]->SetDumpParamVector(dump_param_);
     workers_[i]->InitRandomDumpConfig(trainer_desc);
     workers_[i]->SetDataFeed(readers[i]);
+    workers_[i]->SetPlace(places_[i]);
+    workers_[i]->SetReaderPlace(places_[i]);
     workers_[i]->Initialize(trainer_desc);
     workers_[i]->SetWorkerNum(place_num);
   }
@@ -102,8 +104,6 @@ void PSGPUTrainer::RegisterHeterCallback() {
 void PSGPUTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                   const platform::Place& place) {
   for (size_t i = 0; i < places_.size(); ++i) {
-    workers_[i]->SetPlace(places_[i]);
-    workers_[i]->SetReaderPlace(places_[i]);
     workers_[i]->SetRootScope(root_scope_);
     workers_[i]->CreateDeviceResource(main_program);  // Program
     workers_[i]->BindingDataFeedMemory();
@@ -216,7 +216,9 @@ void PSGPUTrainer::Finalize() {
       continue;
     }
     LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
-
+    if (root_tensor == nullptr || !root_tensor->IsInitialized()) {
+      continue;
+    }
     for (size_t j = 0; j < places_.size(); j++) {
       Scope* cur_thread_scope = workers_[j]->GetThreadScope();
       Variable* thread_var =
@@ -225,6 +227,9 @@ void PSGPUTrainer::Finalize() {
         continue;
       }
       LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>();
+      if (thread_tensor == nullptr || !thread_tensor->IsInitialized()) {
+        continue;
+      }
 #define MergeCallback(cpp_type, proto_type)            \
   do {                                                 \
     if (root_tensor->type() == proto_type) {           \

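The trainer-device half of this commit is an ordering fix: SetPlace and SetReaderPlace move out of InitTrainerEnv and into Initialize, so each worker is bound to its device before Initialize(trainer_desc) runs. A minimal self-contained C++ sketch of why the order matters; Place and Worker are illustrative stand-ins, not Paddle types:

#include <iostream>
#include <vector>

// Place and Worker are illustrative stand-ins for platform::Place and the
// PSGPU device worker; this is a sketch, not Paddle code.
struct Place {
  int device_id = -1;
};

struct Worker {
  Place place;
  void SetPlace(const Place& p) { place = p; }
  void Initialize() {
    // Anything here that consults the place now sees a valid device id;
    // with the old order, Initialize() ran while the place was still unset.
    std::cout << "initialized on device " << place.device_id << std::endl;
  }
};

int main() {
  std::vector<Place> places = {{0}, {1}};
  for (const Place& p : places) {
    Worker w;
    w.SetPlace(p);   // bind the device first (the commit's new order)
    w.Initialize();  // then initialize
  }
  return 0;
}

The Finalize hunks are a separate hardening: root and thread tensors that are null or uninitialized are now skipped before the merge loop touches them.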
paddle/fluid/framework/ps_gpu_worker.cc

Lines changed: 33 additions & 0 deletions
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/lodtensor_printer.h"
 #include "paddle/fluid/string/string_helper.h"

 #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
@@ -149,6 +150,38 @@ void PSGPUWorker::TrainFiles() {
       DumpParam(*thread_scope_, batch_cnt);
     }

+    for (std::string& var_name : check_nan_var_names_) {
+      Variable* var = thread_scope_->FindVar(var_name);
+      if (var == nullptr) {
+        continue;
+      }
+      LoDTensor* tensor = var->GetMutable<LoDTensor>();
+      if (tensor == nullptr || !tensor->IsInitialized()) {
+        continue;
+      }
+      if (framework::TensorContainsInf(*tensor) ||
+          framework::TensorContainsNAN(*tensor)) {
+        static std::mutex mutex;
+        {
+          std::lock_guard<std::mutex> lock(mutex);
+          VLOG(0) << "worker " << thread_id_ << ": " << var_name
+                  << " contains inf or nan";
+          auto all_vars = thread_scope_->LocalVarNames();
+          std::stringstream ss;
+          ss << "====== worker " << thread_id_ << " ======\n";
+          for (auto& local_var : all_vars) {
+            platform::PrintVar(thread_scope_, local_var, local_var, &ss);
+            ss << "\n";
+          }
+          std::cout << ss.str() << std::endl;
+          VLOG(0) << "worker " << thread_id_ << " print nan var done";
+        }
+        sleep(600);
+        exit(-1);
+      }
+    }
+
+    dev_ctx_->Wait();
     PrintFetchVars();
     thread_scope_->DropKids();
     ++batch_cnt;

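The new block scans each variable listed in check_nan_var_names_ once per batch; on the first NaN/Inf hit it takes a function-local mutex so only one worker dumps its scope at a time, prints every local variable via platform::PrintVar, then sleeps (presumably to leave the failing process inspectable) and exits. A self-contained sketch of the same guard pattern, with std::isnan/std::isinf standing in for framework::TensorContainsNAN/TensorContainsInf (ContainsNanOrInf and CheckVar are illustrative names, not part of the commit):

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <string>
#include <vector>

// Stand-in for framework::TensorContainsNAN / TensorContainsInf,
// operating on a flat float buffer instead of a LoDTensor.
bool ContainsNanOrInf(const std::vector<float>& data) {
  for (float v : data) {
    if (std::isnan(v) || std::isinf(v)) return true;
  }
  return false;
}

// Mirrors the guard added to PSGPUWorker::TrainFiles(): on the first bad
// value, serialize the report through a function-local mutex so concurrent
// workers cannot interleave their dumps, then abort.
void CheckVar(int worker_id, const std::string& var_name,
              const std::vector<float>& data) {
  if (!ContainsNanOrInf(data)) return;
  static std::mutex mutex;
  {
    std::lock_guard<std::mutex> lock(mutex);
    std::cerr << "worker " << worker_id << ": " << var_name
              << " contains inf or nan" << std::endl;
  }
  std::exit(-1);
}

int main() {
  CheckVar(0, "loss", {1.0f, 2.0f});  // clean: returns silently
  CheckVar(0, "grad", {1.0f, NAN});   // reports the variable and exits
  return 0;
}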
paddle/fluid/operators/tensor_formatter.h

Lines changed: 4 additions & 4 deletions
@@ -35,6 +35,10 @@ class TensorFormatter {
              const std::string& tensor_name = "",
              const std::string& message = "");

+  template <typename T>
+  void FormatData(const framework::LoDTensor& print_tensor,
+                  std::stringstream& log_stream);
+
   void Print(const framework::LoDTensor& print_tensor,
              const std::string& tensor_name = "",
              const std::string& message = "");
@@ -46,10 +50,6 @@ class TensorFormatter {
   void SetSummarize(int64_t summarize);

  private:
-  template <typename T>
-  void FormatData(const framework::LoDTensor& print_tensor,
-                  std::stringstream& log_stream);
-
   int64_t summarize_ = -1;
   bool print_tensor_type_ = true;
   bool print_tensor_shape_ = true;

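tensor_formatter.h only moves the templated FormatData declaration from the private section to the public one, presumably so call sites outside the class can format a tensor's payload directly. A tiny self-contained analog of that visibility change (Formatter here is illustrative, not Paddle's TensorFormatter):

#include <iostream>
#include <sstream>
#include <vector>

// Analog of the header change: the templated formatter is declared in the
// public section, so code outside the class can call it.
class Formatter {
 public:
  template <typename T>
  void FormatData(const std::vector<T>& data, std::stringstream& out) {
    for (size_t i = 0; i < data.size(); ++i) {
      out << (i ? " " : "") << data[i];
    }
  }
};

int main() {
  Formatter formatter;
  std::stringstream ss;
  formatter.FormatData<float>({1.5f, 2.0f}, ss);  // legal now that it is public
  std::cout << ss.str() << std::endl;             // prints: 1.5 2
  return 0;
}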
paddle/fluid/platform/lodtensor_printer.cc

Lines changed: 29 additions & 15 deletions
@@ -39,23 +39,37 @@ void PrintVar(framework::Scope* scope, const std::string& var_name,
             << " does not exist in your scope";
     return;
   }
+  if (!tensor->IsInitialized()) {
+    VLOG(0) << "tensor of variable " << var_name
+            << " is not initialized in your scope";
+    return;
+  }

-  *sstream << print_info << ": ";
+  *sstream << print_info;

-#define PrintTensorCallback(cpp_type, proto_type)   \
-  do {                                              \
-    if (tensor->type() == proto_type) {             \
-      *sstream << "[";                              \
-      auto* data = tensor->data<cpp_type>();        \
-      auto element_num = tensor->numel();           \
-      if (element_num > 0) {                        \
-        *sstream << data[0];                        \
-        for (int j = 1; j < element_num; ++j) {     \
-          *sstream << " " << data[j];               \
-        }                                           \
-      }                                             \
-      *sstream << "]";                              \
-    }                                               \
+#define PrintTensorCallback(cpp_type, proto_type)     \
+  do {                                                \
+    if (tensor->type() == proto_type) {               \
+      *sstream << "[";                                \
+      const cpp_type* data = nullptr;                 \
+      framework::LoDTensor cpu_tensor;                \
+      if (is_cpu_place(tensor->place())) {            \
+        data = tensor->data<cpp_type>();              \
+      } else {                                        \
+        platform::CPUPlace cpu_place;                 \
+        TensorCopy(*tensor, cpu_place, &cpu_tensor);  \
+        data = cpu_tensor.data<cpp_type>();           \
+      }                                               \
+      auto element_num = tensor->numel();             \
+      *sstream << element_num << "]:[";               \
+      if (element_num > 0) {                          \
+        *sstream << data[0];                          \
+        for (int j = 1; j < element_num; ++j) {       \
+          *sstream << " " << data[j];                 \
+        }                                             \
+      }                                               \
+      *sstream << "]";                                \
+    }                                                 \
   } while (0)

   _ForEachDataType_(PrintTensorCallback);

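The printer rewrite fixes two things: it refuses to print uninitialized tensors, and it stages GPU-resident data through a CPU copy (TensorCopy to a CPUPlace) instead of dereferencing device memory directly, while the output format gains a leading element count ("[n]:[...]"). A runnable host-only sketch of the new flow; PrintData and CopyToHost are illustrative stand-ins for the macro's logic:

#include <iostream>
#include <vector>

// Stand-in for TensorCopy(*tensor, cpu_place, &cpu_tensor); in this
// host-only sketch it is just a buffer copy so the example runs anywhere.
std::vector<float> CopyToHost(const float* data, int n) {
  return std::vector<float>(data, data + n);
}

// Mirrors the new PrintTensorCallback flow: never dereference device
// memory directly; stage it on the host first, then print "[n]:[...]".
void PrintData(const float* data, int n, bool on_device) {
  std::vector<float> host_buf;
  if (on_device) {
    host_buf = CopyToHost(data, n);
    data = host_buf.data();
  }
  std::cout << "[" << n << "]:[";
  for (int j = 0; j < n; ++j) {
    std::cout << (j ? " " : "") << data[j];
  }
  std::cout << "]" << std::endl;
}

int main() {
  float values[] = {1.5f, 2.0f, 3.25f};
  PrintData(values, 3, /*on_device=*/false);  // prints: [3]:[1.5 2 3.25]
  return 0;
}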