Skip to content

Commit 0790f94

Browse files
liutiexing and liutiexing authored
Add cuda tracer (#39488)
* add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173. * add log for Executor * Add CudaTracer to trace CUDA events Co-authored-by: liutiexing <liutiexing@google.com>
1 parent 765a2ad commit 0790f94

File tree

12 files changed

+744
-93
lines changed

12 files changed

+744
-93
lines changed

paddle/fluid/platform/profiler/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
cc_library(host_tracer SRCS host_tracer.cc DEPS enforce)
2-
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer)
2+
cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog)
3+
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer)
34
cc_library(event_node SRCS event_node.cc DEPS enforce)
45
cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node)
56
cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger)
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/fluid/platform/profiler/cuda_tracer.h"
16+
#include <string>
17+
#include <unordered_map>
18+
#include "glog/logging.h"
19+
#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
20+
#include "paddle/fluid/platform/os_info.h"
21+
#include "paddle/fluid/platform/profiler/cupti_data_process.h"
22+
23+
// Evaluates the CUPTI API expression `call` exactly once. When the returned
// status is not CUPTI_SUCCESS, the failing expression text and a readable
// error description are written to LOG(ERROR) and the process is terminated
// with exit(-1).
//
// The description starts out as nullptr and is replaced by a fixed fallback
// text whenever cuptiGetResultString itself fails or yields no string, so the
// log statement never streams an indeterminate or null pointer.
#define CUPTI_CALL(call)                                                     \
  do {                                                                       \
    CUptiResult _status = (call);                                            \
    if (_status != CUPTI_SUCCESS) {                                          \
      const char* errstr = nullptr;                                          \
      if (dynload::cuptiGetResultString(_status, &errstr) != CUPTI_SUCCESS || \
          errstr == nullptr) {                                               \
        errstr = "<unknown CUPTI error>";                                    \
      }                                                                      \
      LOG(ERROR) << "Function " << #call << " failed with error " << errstr; \
      exit(-1);                                                              \
    }                                                                        \
  } while (0)
33+
34+
namespace paddle {
35+
namespace platform {
36+
37+
namespace details {
38+
std::unordered_map<uint32_t, uint64_t> CreateThreadIdMapping() {
39+
std::unordered_map<uint32_t, uint64_t> mapping;
40+
std::unordered_map<uint64_t, ThreadId> ids = GetAllThreadIds();
41+
for (const auto& id : ids) {
42+
mapping[id.second.cupti_tid] = id.second.sys_tid;
43+
}
44+
return mapping;
45+
}
46+
} // namespace details
47+
48+
// Nothing to set up here: every data member carries an in-class initializer
// or is default-constructed.
CudaTracer::CudaTracer() = default;
49+
50+
// Turns on CUPTI activity collection and moves the tracer to READY.
//
// Permitted only while the tracer is UNINITED or STOPED; in any other state
// the PADDLE_ENFORCE_EQ check fails with a PreconditionNotMet error. The
// error text now names both accepted states, matching the condition.
void CudaTracer::PrepareTracing() {
  PADDLE_ENFORCE_EQ(
      state_ == TracerState::UNINITED || state_ == TracerState::STOPED, true,
      platform::errors::PreconditionNotMet(
          "Tracer must be UNINITED or STOPED"));
  EnableCuptiActivity();
  state_ = TracerState::READY;
}
57+
58+
void CudaTracer::StartTracing() {
59+
PADDLE_ENFORCE_EQ(
60+
state_ == TracerState::READY, true,
61+
platform::errors::PreconditionNotMet("Tracer must be READY or STOPPED"));
62+
ConsumeBuffers();
63+
tracing_start_ns_ = PosixInNsec();
64+
state_ = TracerState::STARTED;
65+
}
66+
67+
// STARTED -> STOPED transition: checks that the tracer is currently STARTED
// (PreconditionNotMet otherwise), switches the CUPTI activity kinds off and
// records the new state.
void CudaTracer::StopTracing() {
  PADDLE_ENFORCE_EQ(
      state_, TracerState::STARTED,
      platform::errors::PreconditionNotMet("Tracer must be STARTED"));

  DisableCuptiActivity();
  state_ = TracerState::STOPED;
}
74+
75+
// Feeds the activity records gathered so far into `collector`.
// Only valid once the tracer is STOPED (PreconditionNotMet otherwise). The
// record count reported by ProcessCuptiActivity is intentionally not used.
void CudaTracer::CollectTraceData(TraceEventCollector* collector) {
  PADDLE_ENFORCE_EQ(
      state_, TracerState::STOPED,
      platform::errors::PreconditionNotMet("Tracer must be STOPED"));

  static_cast<void>(ProcessCuptiActivity(collector));
}
81+
82+
// Drains every queued activity buffer and hands each contained record to
// details::ProcessCuptiActivityRecord together with the tracing start time,
// the thread-id mapping and `collector`.
//
// Returns the number of records processed. Without PADDLE_WITH_CUPTI the
// function does nothing and returns 0.
int CudaTracer::ProcessCuptiActivity(TraceEventCollector* collector) {
  int record_cnt = 0;
#ifdef PADDLE_WITH_CUPTI
  // Ask CUPTI to flush (forced) before the queue is drained.
  CUPTI_CALL(dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
  auto mapping = details::CreateThreadIdMapping();
  std::vector<ActivityBuffer> buffers = ConsumeBuffers();
  for (auto& buffer : buffers) {
    if (buffer.addr == nullptr) {
      continue;
    }

    // A buffer without valid bytes has no records to walk, but it still
    // owns its allocation: fall through to ReleaseBuffer instead of
    // skipping it, otherwise the whole buffer is leaked.
    if (buffer.valid_size != 0) {
      // Must start as nullptr for each buffer so iteration begins at its
      // first record.
      CUpti_Activity* record = nullptr;
      while (true) {
        CUptiResult status = dynload::cuptiActivityGetNextRecord(
            buffer.addr, buffer.valid_size, &record);
        if (status == CUPTI_SUCCESS) {
          details::ProcessCuptiActivityRecord(record, tracing_start_ns_,
                                              mapping, collector);
          ++record_cnt;
        } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
          // Treated as "no more records in this buffer".
          break;
        } else {
          // Any other status is fatal: CUPTI_CALL logs it and exits.
          CUPTI_CALL(status);
        }
      }
    }

    ReleaseBuffer(buffer.addr);
  }
#endif
  return record_cnt;
}
113+
114+
// Registers the buffer request/completion callbacks with CUPTI and then
// enables the activity kinds this tracer records: MEMCPY, CONCURRENT_KERNEL,
// DRIVER, RUNTIME and MEMSET. Any failing CUPTI call is fatal (CUPTI_CALL).
// Compiles to an empty function without PADDLE_WITH_CUPTI.
void CudaTracer::EnableCuptiActivity() {
#ifdef PADDLE_WITH_CUPTI
  // Callbacks first, so CUPTI has a buffer source before anything is enabled.
  CUPTI_CALL(dynload::cuptiActivityRegisterCallbacks(BufferRequestedCallback,
                                                     BufferCompletedCallback));

  // Activity kinds, one call per kind.
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
  CUPTI_CALL(
      dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));

  VLOG(3) << "enable cupti activity";
#endif
}
128+
129+
// Counterpart of EnableCuptiActivity: disables the same five activity kinds
// (MEMCPY, CONCURRENT_KERNEL, DRIVER, RUNTIME, MEMSET). Any failing CUPTI
// call is fatal (CUPTI_CALL). Compiles to an empty function without
// PADDLE_WITH_CUPTI.
void CudaTracer::DisableCuptiActivity() {
#ifdef PADDLE_WITH_CUPTI
  // Activity kinds, one call per kind.
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
  CUPTI_CALL(
      dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));

  VLOG(3) << "disable cupti activity";
#endif
}
140+
141+
#ifdef PADDLE_WITH_CUPTI
142+
// CUPTI callback invoked when it needs an empty activity buffer.
// Delegates the allocation to the singleton tracer, which fills in `*buffer`
// and `*size`, and reports 0 through `*max_num_records`.
// NOTE(review): per the CUPTI Activity API, 0 is understood to mean "no cap
// on the record count" - confirm against the CUPTI version in use.
void CUPTIAPI CudaTracer::BufferRequestedCallback(uint8_t** buffer,
                                                  size_t* size,
                                                  size_t* max_num_records) {
  CudaTracer& tracer = GetInstance();
  tracer.AllocateBuffer(buffer, size);
  *max_num_records = 0;
}
148+
149+
// CUPTI callback invoked when it returns a buffer to the client.
// The buffer (with its number of valid bytes) is queued on the singleton
// tracer for later processing; afterwards the number of records CUPTI
// dropped for (`ctx`, `stream_id`) is queried and a warning is logged when
// it is non-zero. The total buffer capacity `size` is not used.
void CUPTIAPI CudaTracer::BufferCompletedCallback(CUcontext ctx,
                                                  uint32_t stream_id,
                                                  uint8_t* buffer, size_t size,
                                                  size_t valid_size) {
  CudaTracer& tracer = GetInstance();
  tracer.ProduceBuffer(buffer, valid_size);

  size_t dropped = 0;
  CUPTI_CALL(
      dynload::cuptiActivityGetNumDroppedRecords(ctx, stream_id, &dropped));
  if (dropped == 0) {
    return;
  }
  LOG(WARNING) << "Stream " << stream_id << " Dropped " << dropped
               << " activity records";
}
162+
#endif
163+
164+
// Allocates one activity buffer of fixed size (8 MB, aligned to 8 bytes)
// through paddle::framework::AlignedMalloc.
//   buffer - receives the address of the new allocation
//   size   - receives the allocation size in bytes
void CudaTracer::AllocateBuffer(uint8_t** buffer, size_t* size) {
  constexpr size_t kBufSize = 1 << 23;  // 8 MB
  constexpr size_t kBufAlign = 8;       // 8 B

  void* raw = paddle::framework::AlignedMalloc(kBufSize, kBufAlign);
  *buffer = static_cast<uint8_t*>(raw);
  *size = kBufSize;
}
171+
172+
void CudaTracer::ProduceBuffer(uint8_t* buffer, size_t valid_size) {
173+
std::lock_guard<std::mutex> guard(activity_buffer_lock_);
174+
activity_buffers_.emplace_back(buffer, valid_size);
175+
}
176+
177+
std::vector<CudaTracer::ActivityBuffer> CudaTracer::ConsumeBuffers() {
178+
std::vector<ActivityBuffer> buffers;
179+
{
180+
std::lock_guard<std::mutex> guard(activity_buffer_lock_);
181+
buffers.swap(activity_buffers_);
182+
}
183+
return buffers;
184+
}
185+
186+
// Returns a buffer's memory via paddle::framework::AlignedFree, the
// counterpart of the AlignedMalloc call made in AllocateBuffer.
void CudaTracer::ReleaseBuffer(uint8_t* buffer) {
  paddle::framework::AlignedFree(buffer);
}
189+
190+
} // namespace platform
191+
} // namespace paddle
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
17+
#include <cstdint>
18+
#include <mutex>
19+
#include <vector>
20+
#include "paddle/fluid/platform/dynload/cupti.h"
21+
#include "paddle/fluid/platform/macros.h"
22+
#include "paddle/fluid/platform/profiler/tracer_base.h"
23+
24+
namespace paddle {
25+
namespace platform {
26+
27+
// Based on CUDA CUPTI
28+
class CudaTracer : public TracerBase {
29+
public:
30+
// Singleton. CUPTI imposes this restriction.
31+
static CudaTracer& GetInstance() {
32+
static CudaTracer instance;
33+
return instance;
34+
}
35+
36+
void PrepareTracing() override;
37+
38+
void StartTracing() override;
39+
40+
void StopTracing() override;
41+
42+
void CollectTraceData(TraceEventCollector* collector) override;
43+
44+
private:
45+
struct ActivityBuffer {
46+
ActivityBuffer(uint8_t* addr, size_t size) : addr(addr), valid_size(size) {}
47+
uint8_t* addr;
48+
size_t valid_size;
49+
};
50+
51+
CudaTracer();
52+
53+
DISABLE_COPY_AND_ASSIGN(CudaTracer);
54+
55+
void EnableCuptiActivity();
56+
57+
void DisableCuptiActivity();
58+
59+
int ProcessCuptiActivity(TraceEventCollector* collector);
60+
61+
#ifdef PADDLE_WITH_CUPTI
62+
// Used by CUPTI Activity API to request buffer
63+
static void CUPTIAPI BufferRequestedCallback(uint8_t** buffer, size_t* size,
64+
size_t* max_num_records);
65+
66+
// Used by CUPTI Activity API to commit a completed buffer
67+
static void CUPTIAPI BufferCompletedCallback(CUcontext ctx,
68+
uint32_t stream_id,
69+
uint8_t* buffer, size_t size,
70+
size_t valid_size);
71+
#endif
72+
73+
void AllocateBuffer(uint8_t** buffer, size_t* size);
74+
75+
void ProduceBuffer(uint8_t* buffer, size_t valid_size);
76+
77+
std::vector<ActivityBuffer> ConsumeBuffers();
78+
79+
void ReleaseBuffer(uint8_t* buffer);
80+
81+
uint64_t tracing_start_ns_ = UINT64_MAX;
82+
std::mutex activity_buffer_lock_;
83+
std::vector<ActivityBuffer> activity_buffers_;
84+
};
85+
86+
} // namespace platform
87+
} // namespace paddle

0 commit comments

Comments
 (0)