Skip to content

Commit 9a5ec77

Browse files
committed
[REFACTOR] Cleanup Metrics
This PR run another round of cleanup of metrics. - Remove less useful ones - Reorganize by labels in prometheus style
1 parent 9631cc3 commit 9a5ec77

File tree

12 files changed

+231
-249
lines changed

12 files changed

+231
-249
lines changed

cpp/serve/engine_actions/action_commons.cc

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,6 @@ void ProcessFinishedRequestStateEntries(std::vector<RequestStateEntry> finished_
9494
static_cast<double>((trequest_finish - rstate->tprefill_finish).count()) / 1e9;
9595
estate->metrics.sum_num_input_tokens += rsentry->request->num_input_tokens;
9696
estate->metrics.sum_num_prefill_tokens += root_rsentry->num_prefill_tokens;
97-
estate->metrics.sum_request_prefill_time += prefill_elapsed_time;
98-
estate->metrics.sum_request_decode_time += decode_elapsed_time;
9997
int64_t num_output_tokens = 0;
10098
for (const RequestStateEntry& entry : rstate->entries) {
10199
num_output_tokens += entry->mstates[0]->committed_tokens.size();
@@ -105,11 +103,11 @@ void ProcessFinishedRequestStateEntries(std::vector<RequestStateEntry> finished_
105103
num_output_tokens -= rsentry->request->generation_cfg->n;
106104
estate->metrics.sum_num_output_tokens += num_output_tokens;
107105

108-
estate->metrics.num_last_finished_req_input_tokens.Set(rsentry->request->num_input_tokens);
109-
estate->metrics.num_last_finished_req_prefill_tokens.Set(root_rsentry->num_prefill_tokens);
110-
estate->metrics.num_last_finished_req_output_tokens.Set(num_output_tokens);
111-
estate->metrics.last_finished_req_prefill_time.Set(prefill_elapsed_time);
112-
estate->metrics.last_finished_req_decode_time.Set(decode_elapsed_time);
106+
estate->metrics.last_finished_req_num_input_tokens = rsentry->request->num_input_tokens;
107+
estate->metrics.last_finished_req_num_prefill_tokens = root_rsentry->num_prefill_tokens;
108+
estate->metrics.last_finished_req_num_output_tokens = num_output_tokens;
109+
estate->metrics.last_finished_req_prefill_time = prefill_elapsed_time;
110+
estate->metrics.last_finished_req_decode_time = decode_elapsed_time;
113111
}
114112
}
115113
}

cpp/serve/engine_actions/batch_decode.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ class BatchDecodeActionObj : public EngineActionObj {
130130
auto tend = std::chrono::high_resolution_clock::now();
131131
double elapsed_time = static_cast<double>((tend - tstart).count()) / 1e9;
132132
estate->metrics.sum_engine_decode_time += elapsed_time;
133-
estate->metrics.UpdateBatchDecodeTime(num_rsentries, elapsed_time);
133+
estate->metrics.UpdateDecodeTimeByBatchSize(num_rsentries, elapsed_time);
134134

135135
return estate->running_queue;
136136
}

cpp/serve/engine_actions/batch_draft.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,11 +134,10 @@ class BatchDraftActionObj : public EngineActionObj {
134134
&model_workspaces_[0].draft_probs_storage);
135135
for (int i = 0; i < num_rsentries; ++i) {
136136
mstates[i]->AddDraftToken(sample_results[i], draft_token_slots_[i]);
137-
estate->metrics.sum_num_draft_tokens += 1;
138137
}
139138

140139
auto tdraft_end = std::chrono::high_resolution_clock::now();
141-
estate->metrics.UpdateBatchDraftTime(
140+
estate->metrics.UpdateDraftTimeByBatchSize(
142141
num_rsentries, static_cast<double>((tdraft_end - tdraft_start).count()) / 1e9);
143142
}
144143
}

cpp/serve/engine_actions/batch_verify.cc

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,9 +147,8 @@ class BatchVerifyActionObj : public EngineActionObj {
147147
rsentries[i]->mstates[verify_model_id_]->CommitToken(sample_result);
148148
rsentries[i]->mstates[draft_model_id_]->CommitToken(sample_result);
149149
}
150-
estate->metrics.sum_num_accepted_tokens += accept_length;
151-
estate->metrics.UpdateSpecDecodingStats(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
152-
accept_length);
150+
estate->metrics.spec_decode.Update(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
151+
accept_length);
153152
int rollback_length =
154153
std::max(cum_verify_lengths[i + 1] - cum_verify_lengths[i] - accept_length, 0);
155154
// rollback kv cache
@@ -210,7 +209,7 @@ class BatchVerifyActionObj : public EngineActionObj {
210209
auto tend = std::chrono::high_resolution_clock::now();
211210
double elapsed_time = static_cast<double>((tend - tstart).count()) / 1e9;
212211
estate->metrics.sum_engine_decode_time += elapsed_time;
213-
estate->metrics.UpdateBatchVerificationTime(total_verify_length, elapsed_time);
212+
estate->metrics.UpdateVerifyTimeByBatchSize(total_verify_length, elapsed_time);
214213

215214
return estate->running_queue;
216215
}

cpp/serve/engine_actions/eagle_batch_draft.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,11 +151,10 @@ class EagleBatchDraftActionObj : public EngineActionObj {
151151
// No need to save hidden states as they are not used by subsequent engine actions
152152
for (int i = 0; i < num_rsentries; ++i) {
153153
mstates[i]->AddDraftToken(sample_results[i], draft_token_slots_[i]);
154-
estate->metrics.sum_num_draft_tokens += 1;
155154
}
156155

157156
auto tdraft_end = std::chrono::high_resolution_clock::now();
158-
estate->metrics.UpdateBatchDraftTime(
157+
estate->metrics.UpdateDraftTimeByBatchSize(
159158
num_rsentries, static_cast<double>((tdraft_end - tdraft_start).count()) / 1e9);
160159
}
161160
}

cpp/serve/engine_actions/eagle_batch_verify.cc

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,8 @@ class EagleBatchVerifyActionObj : public EngineActionObj {
153153
rsentries[i]->mstates[verify_model_id_]->CommitToken(sample_result);
154154
rsentries[i]->mstates[draft_model_id_]->CommitToken(sample_result);
155155
}
156-
estate->metrics.UpdateSpecDecodingStats(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
157-
accept_length);
158-
estate->metrics.sum_num_accepted_tokens += accept_length - 1;
156+
estate->metrics.spec_decode.Update(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
157+
accept_length);
159158
// - Minus one because the last draft token has no kv cache entry
160159
// - Take max with 0 in case of all accepted.
161160
int rollback_length =
@@ -304,7 +303,7 @@ class EagleBatchVerifyActionObj : public EngineActionObj {
304303
auto tend = std::chrono::high_resolution_clock::now();
305304
double elapsed_time = static_cast<double>((tend - tstart).count()) / 1e9;
306305
estate->metrics.sum_engine_decode_time += elapsed_time;
307-
estate->metrics.UpdateBatchVerificationTime(cum_verify_lengths.back(), elapsed_time);
306+
estate->metrics.UpdateVerifyTimeByBatchSize(cum_verify_lengths.back(), elapsed_time);
308307

309308
return estate->running_queue;
310309
}
@@ -380,7 +379,6 @@ class EagleBatchVerifyActionObj : public EngineActionObj {
380379
}
381380
for (int i = 0; i < static_cast<int>(mstates.size()); ++i) {
382381
mstates[i]->AddDraftToken(sample_results[i], draft_token_slots_[i]);
383-
estate->metrics.sum_num_draft_tokens += 1;
384382
}
385383
}
386384
/*!

cpp/serve/engine_actions/eagle_new_request_prefill.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,6 @@ class EagleNewRequestPrefillActionObj : public BatchPrefillBaseActionObj {
340340
for (int i = 0; i < static_cast<int>(rsentries_for_sample.size()); ++i) {
341341
rsentries_for_sample[i]->mstates[model_id]->AddDraftToken(sample_results[i],
342342
draft_token_slots_[i]);
343-
estate->metrics.sum_num_draft_tokens += 1;
344343
}
345344
}
346345

cpp/serve/metrics.cc

Lines changed: 106 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -5,134 +5,134 @@
55
*/
66
#include "metrics.h"
77

8+
#include <tvm/runtime/logging.h>
9+
10+
#include <sstream>
11+
812
namespace mlc {
913
namespace llm {
1014
namespace serve {
1115

12-
picojson::value EngineMetrics::AsJSON() const {
13-
picojson::object metrics;
14-
metrics["sum_request_prefill_time"] = sum_request_prefill_time.AsJSON();
15-
metrics["sum_request_decode_time"] = sum_request_decode_time.AsJSON();
16-
metrics["sum_engine_prefill_time"] = sum_engine_prefill_time.AsJSON();
17-
metrics["sum_engine_decode_time"] = sum_engine_decode_time.AsJSON();
18-
metrics["sum_num_input_tokens"] = sum_num_input_tokens.AsJSON();
19-
metrics["sum_num_prefill_tokens"] = sum_num_prefill_tokens.AsJSON();
20-
metrics["sum_num_output_tokens"] = sum_num_output_tokens.AsJSON();
21-
metrics["sum_num_accepted_tokens"] = sum_num_accepted_tokens.AsJSON();
22-
metrics["sum_num_draft_tokens"] = sum_num_draft_tokens.AsJSON();
23-
24-
metrics["last_finished_req_prefill_time"] = last_finished_req_prefill_time.AsJSON();
25-
metrics["last_finished_req_decode_time"] = last_finished_req_decode_time.AsJSON();
26-
metrics["num_last_finished_req_input_tokens"] = num_last_finished_req_input_tokens.AsJSON();
27-
metrics["num_last_finished_req_prefill_tokens"] = num_last_finished_req_prefill_tokens.AsJSON();
28-
metrics["num_last_finished_req_output_tokens"] = num_last_finished_req_output_tokens.AsJSON();
29-
30-
picojson::array batch_decode_time_obj;
31-
picojson::array batch_draft_time_obj;
32-
picojson::array batch_verification_time_obj;
33-
batch_decode_time_obj.reserve(batch_decode_time_list.size());
34-
batch_draft_time_obj.reserve(batch_draft_time_list.size());
35-
batch_verification_time_obj.reserve(batch_verification_time_list.size());
36-
for (const Metric& batch_decode_time : batch_decode_time_list) {
37-
if (batch_decode_time.label.empty()) {
38-
continue;
39-
}
40-
batch_decode_time_obj.push_back(batch_decode_time.AsJSON());
16+
picojson::value TimeCost::AsJSON() const {
17+
picojson::object config;
18+
config["count"] = picojson::value(count);
19+
if (count != 0) {
20+
config["mean"] = picojson::value(sum / count);
4121
}
42-
for (const Metric& batch_draft_time : batch_draft_time_list) {
43-
if (batch_draft_time.label.empty()) {
44-
continue;
45-
}
46-
batch_draft_time_obj.push_back(batch_draft_time.AsJSON());
47-
}
48-
for (const Metric& batch_verification_time : batch_verification_time_list) {
49-
if (batch_verification_time.label.empty()) {
50-
continue;
51-
}
52-
batch_verification_time_obj.push_back(batch_verification_time.AsJSON());
53-
}
54-
metrics["batch_decode_time_per_batch_size"] = picojson::value(batch_decode_time_obj);
55-
metrics["batch_draft_time_per_batch_size"] = picojson::value(batch_draft_time_obj);
56-
metrics["batch_verification_time_per_batch_size"] = picojson::value(batch_verification_time_obj);
22+
return picojson::value(config);
23+
}
5724

25+
picojson::value SpecDecodeMetrics::AsJSON() const {
26+
picojson::object metrics;
5827
auto f_vector_to_array = [](const std::vector<int64_t>& vec) {
5928
picojson::array arr;
6029
for (int64_t v : vec) {
6130
arr.push_back(picojson::value(v));
6231
}
6332
return picojson::value(arr);
6433
};
65-
metrics["accept_count"] = f_vector_to_array(accept_count);
6634
metrics["draft_count"] = f_vector_to_array(draft_count);
67-
return picojson::value(metrics);
68-
}
35+
metrics["accept_count"] = f_vector_to_array(accept_count);
6936

70-
void EngineMetrics::Reset() {
71-
sum_request_prefill_time.Reset(/*warmed_up=*/true);
72-
sum_request_decode_time.Reset(/*warmed_up=*/true);
73-
sum_engine_prefill_time.Reset(/*warmed_up=*/true);
74-
sum_engine_decode_time.Reset(/*warmed_up=*/true);
75-
sum_num_input_tokens.Reset(/*warmed_up=*/true);
76-
sum_num_prefill_tokens.Reset(/*warmed_up=*/true);
77-
sum_num_output_tokens.Reset(/*warmed_up=*/true);
78-
sum_num_accepted_tokens.Reset(/*warmed_up=*/true);
79-
sum_num_draft_tokens.Reset(/*warmed_up=*/true);
80-
last_finished_req_prefill_time.Reset(/*warmed_up=*/true);
81-
last_finished_req_decode_time.Reset(/*warmed_up=*/true);
82-
num_last_finished_req_input_tokens.Reset(/*warmed_up=*/true);
83-
num_last_finished_req_prefill_tokens.Reset(/*warmed_up=*/true);
84-
num_last_finished_req_output_tokens.Reset(/*warmed_up=*/true);
85-
batch_decode_time_list.clear();
86-
batch_draft_time_list.clear();
87-
batch_verification_time_list.clear();
88-
batch_decode_time_list.resize(kMaxEffectiveBatchSize);
89-
batch_draft_time_list.resize(kMaxEffectiveBatchSize);
90-
batch_verification_time_list.resize(kMaxEffectiveBatchSize);
91-
accept_count.clear();
92-
draft_count.clear();
93-
}
37+
ICHECK_EQ(draft_count.size(), accept_count.size());
38+
// NOTE: label follows prometheus with full context
39+
// so it can be flattened and used in metrics reoorting end point
40+
picojson::object accept_prob_metrics;
41+
picojson::object accept_rate_metrics;
42+
picojson::object accept_len_metrics;
9443

95-
void EngineMetrics::UpdateBatchDecodeTime(int batch_size, double time) {
96-
if (batch_size > kMaxEffectiveBatchSize) {
97-
return;
98-
}
99-
if (batch_decode_time_list[batch_size].label.empty()) {
100-
batch_decode_time_list[batch_size].label = std::to_string(batch_size);
101-
}
102-
batch_decode_time_list[batch_size].Update(time);
103-
}
44+
double accept_len_value = 0;
10445

105-
void EngineMetrics::UpdateBatchDraftTime(int batch_size, double time) {
106-
if (batch_size > kMaxEffectiveBatchSize) {
107-
return;
108-
}
109-
if (batch_draft_time_list[batch_size].label.empty()) {
110-
batch_draft_time_list[batch_size].label = std::to_string(batch_size);
111-
}
112-
batch_draft_time_list[batch_size].Update(time);
113-
}
46+
for (size_t i = 0; draft_count.size(); ++i) {
47+
std::ostringstream accept_prob_label;
48+
accept_prob_label << "accept_prob{step=" << i << "}";
49+
double accept_prob_value =
50+
(static_cast<double>(accept_count[i]) / static_cast<double>(draft_count[i]));
51+
accept_prob_metrics[accept_prob_label.str()] = picojson::value(accept_prob_value);
52+
accept_len_value += accept_prob_value;
11453

115-
void EngineMetrics::UpdateBatchVerificationTime(int batch_size, double time) {
116-
if (batch_size > kMaxEffectiveBatchSize) {
117-
return;
118-
}
119-
if (batch_verification_time_list[batch_size].label.empty()) {
120-
batch_verification_time_list[batch_size].label = std::to_string(batch_size);
54+
std::ostringstream accept_len_label;
55+
accept_len_label << "accept_len{step=" << i << "}";
56+
accept_len_metrics[accept_len_label.str()] = picojson::value(accept_len_value);
57+
58+
if (i != 0) {
59+
std::ostringstream accept_rate_label;
60+
accept_rate_label << "accept_rate{step=" << i << "}";
61+
double accept_rate_value =
62+
(static_cast<double>(accept_count[i]) / static_cast<double>(accept_count[i - 1]));
63+
accept_rate_metrics[accept_rate_label.str()] = picojson::value(accept_rate_value);
64+
}
12165
}
122-
batch_verification_time_list[batch_size].Update(time);
66+
metrics["accept_prob"] = picojson::value(accept_prob_metrics);
67+
metrics["accept_rate"] = picojson::value(accept_rate_metrics);
68+
metrics["accept_len"] = picojson::value(accept_len_metrics);
69+
70+
return picojson::value(metrics);
12371
}
12472

125-
void EngineMetrics::UpdateSpecDecodingStats(int draft_length, int accept_length) {
126-
if (accept_count.size() < draft_length) {
127-
this->accept_count.resize(draft_length, 0);
128-
this->draft_count.resize(draft_length, 0);
129-
}
130-
for (int j = 0; j < draft_length; ++j) {
131-
if (j < accept_length) {
132-
this->accept_count[j]++;
73+
picojson::value EngineMetrics::AsJSON() const {
74+
picojson::object metrics;
75+
metrics["sum_engine_prefill_time"] = picojson::value(sum_engine_prefill_time);
76+
metrics["sum_engine_decode_time"] = picojson::value(sum_engine_decode_time);
77+
metrics["sum_num_input_tokens"] = picojson::value(sum_num_input_tokens);
78+
metrics["sum_num_prefill_tokens"] = picojson::value(sum_num_prefill_tokens);
79+
metrics["sum_num_output_tokens"] = picojson::value(sum_num_output_tokens);
80+
81+
metrics["last_finished_req_prefill_time"] = picojson::value(last_finished_req_prefill_time);
82+
metrics["last_finished_req_decode_time"] = picojson::value(last_finished_req_decode_time);
83+
metrics["last_finished_req_num_input_tokens"] =
84+
picojson::value(last_finished_req_num_input_tokens);
85+
metrics["last_finished_req_num_prefill_tokens"] =
86+
picojson::value(last_finished_req_num_prefill_tokens);
87+
metrics["last_finished_req_num_output_tokens"] =
88+
picojson::value(last_finished_req_num_output_tokens);
89+
90+
metrics["spec_decode"] = spec_decode.AsJSON();
91+
92+
auto f_create_time_list = [](const std::string& label_name,
93+
const std::vector<TimeCost>& time_list) {
94+
picojson::object result;
95+
for (size_t i = 1; i < time_list.size(); ++i) {
96+
const TimeCost& item = time_list[i];
97+
if (item.count == 0) continue;
98+
std::ostringstream label_mean;
99+
label_mean << "mean_" << label_name << "{batch_size=" << i << "}";
100+
double mean = item.sum / item.count;
101+
result[label_mean.str()] = picojson::value(mean);
102+
std::ostringstream label_count;
103+
label_count << "count_" << label_name << "{batch_size=" << i << "}";
104+
result[label_count.str()] = picojson::value(item.count);
133105
}
134-
this->draft_count[j]++;
135-
}
106+
return picojson::value(result);
107+
};
108+
109+
metrics["decode_time_by_batch_size"] =
110+
f_create_time_list("decode_time", decode_time_by_batch_size);
111+
metrics["draft_time_by_batch_size"] = f_create_time_list("draft_time", draft_time_by_batch_size);
112+
metrics["verify_time_by_batch_size"] =
113+
f_create_time_list("verify_time", verify_time_by_batch_size);
114+
115+
return picojson::value(metrics);
116+
}
117+
118+
void EngineMetrics::Reset() {
119+
sum_engine_prefill_time = 0.0;
120+
sum_engine_decode_time = 0.0;
121+
sum_num_input_tokens = 0;
122+
sum_num_prefill_tokens = 0;
123+
sum_num_output_tokens = 0;
124+
last_finished_req_prefill_time = 0.0;
125+
last_finished_req_decode_time = 0.0;
126+
last_finished_req_num_input_tokens = 0.0;
127+
last_finished_req_num_prefill_tokens = 0.0;
128+
last_finished_req_num_output_tokens = 0.0;
129+
spec_decode.Reset();
130+
decode_time_by_batch_size.clear();
131+
draft_time_by_batch_size.clear();
132+
verify_time_by_batch_size.clear();
133+
decode_time_by_batch_size.resize(kEndFineGrainedTrackingBatchSize);
134+
draft_time_by_batch_size.resize(kEndFineGrainedTrackingBatchSize);
135+
verify_time_by_batch_size.resize(kEndFineGrainedTrackingBatchSize);
136136
}
137137

138138
} // namespace serve

0 commit comments

Comments
 (0)