|
5 | 5 | */ |
6 | 6 | #include "metrics.h" |
7 | 7 |
|
| 8 | +#include <tvm/runtime/logging.h> |
| 9 | + |
| 10 | +#include <sstream> |
| 11 | + |
8 | 12 | namespace mlc { |
9 | 13 | namespace llm { |
10 | 14 | namespace serve { |
11 | 15 |
|
12 | | -picojson::value EngineMetrics::AsJSON() const { |
13 | | - picojson::object metrics; |
14 | | - metrics["sum_request_prefill_time"] = sum_request_prefill_time.AsJSON(); |
15 | | - metrics["sum_request_decode_time"] = sum_request_decode_time.AsJSON(); |
16 | | - metrics["sum_engine_prefill_time"] = sum_engine_prefill_time.AsJSON(); |
17 | | - metrics["sum_engine_decode_time"] = sum_engine_decode_time.AsJSON(); |
18 | | - metrics["sum_num_input_tokens"] = sum_num_input_tokens.AsJSON(); |
19 | | - metrics["sum_num_prefill_tokens"] = sum_num_prefill_tokens.AsJSON(); |
20 | | - metrics["sum_num_output_tokens"] = sum_num_output_tokens.AsJSON(); |
21 | | - metrics["sum_num_accepted_tokens"] = sum_num_accepted_tokens.AsJSON(); |
22 | | - metrics["sum_num_draft_tokens"] = sum_num_draft_tokens.AsJSON(); |
23 | | - |
24 | | - metrics["last_finished_req_prefill_time"] = last_finished_req_prefill_time.AsJSON(); |
25 | | - metrics["last_finished_req_decode_time"] = last_finished_req_decode_time.AsJSON(); |
26 | | - metrics["num_last_finished_req_input_tokens"] = num_last_finished_req_input_tokens.AsJSON(); |
27 | | - metrics["num_last_finished_req_prefill_tokens"] = num_last_finished_req_prefill_tokens.AsJSON(); |
28 | | - metrics["num_last_finished_req_output_tokens"] = num_last_finished_req_output_tokens.AsJSON(); |
29 | | - |
30 | | - picojson::array batch_decode_time_obj; |
31 | | - picojson::array batch_draft_time_obj; |
32 | | - picojson::array batch_verification_time_obj; |
33 | | - batch_decode_time_obj.reserve(batch_decode_time_list.size()); |
34 | | - batch_draft_time_obj.reserve(batch_draft_time_list.size()); |
35 | | - batch_verification_time_obj.reserve(batch_verification_time_list.size()); |
36 | | - for (const Metric& batch_decode_time : batch_decode_time_list) { |
37 | | - if (batch_decode_time.label.empty()) { |
38 | | - continue; |
39 | | - } |
40 | | - batch_decode_time_obj.push_back(batch_decode_time.AsJSON()); |
| 16 | +picojson::value TimeCost::AsJSON() const { |
| 17 | + picojson::object config; |
| 18 | + config["count"] = picojson::value(count); |
| 19 | + if (count != 0) { |
| 20 | + config["mean"] = picojson::value(sum / count); |
41 | 21 | } |
42 | | - for (const Metric& batch_draft_time : batch_draft_time_list) { |
43 | | - if (batch_draft_time.label.empty()) { |
44 | | - continue; |
45 | | - } |
46 | | - batch_draft_time_obj.push_back(batch_draft_time.AsJSON()); |
47 | | - } |
48 | | - for (const Metric& batch_verification_time : batch_verification_time_list) { |
49 | | - if (batch_verification_time.label.empty()) { |
50 | | - continue; |
51 | | - } |
52 | | - batch_verification_time_obj.push_back(batch_verification_time.AsJSON()); |
53 | | - } |
54 | | - metrics["batch_decode_time_per_batch_size"] = picojson::value(batch_decode_time_obj); |
55 | | - metrics["batch_draft_time_per_batch_size"] = picojson::value(batch_draft_time_obj); |
56 | | - metrics["batch_verification_time_per_batch_size"] = picojson::value(batch_verification_time_obj); |
| 22 | + return picojson::value(config); |
| 23 | +} |
57 | 24 |
|
| 25 | +picojson::value SpecDecodeMetrics::AsJSON() const { |
| 26 | + picojson::object metrics; |
58 | 27 | auto f_vector_to_array = [](const std::vector<int64_t>& vec) { |
59 | 28 | picojson::array arr; |
60 | 29 | for (int64_t v : vec) { |
61 | 30 | arr.push_back(picojson::value(v)); |
62 | 31 | } |
63 | 32 | return picojson::value(arr); |
64 | 33 | }; |
65 | | - metrics["accept_count"] = f_vector_to_array(accept_count); |
66 | 34 | metrics["draft_count"] = f_vector_to_array(draft_count); |
67 | | - return picojson::value(metrics); |
68 | | -} |
| 35 | + metrics["accept_count"] = f_vector_to_array(accept_count); |
69 | 36 |
|
70 | | -void EngineMetrics::Reset() { |
71 | | - sum_request_prefill_time.Reset(/*warmed_up=*/true); |
72 | | - sum_request_decode_time.Reset(/*warmed_up=*/true); |
73 | | - sum_engine_prefill_time.Reset(/*warmed_up=*/true); |
74 | | - sum_engine_decode_time.Reset(/*warmed_up=*/true); |
75 | | - sum_num_input_tokens.Reset(/*warmed_up=*/true); |
76 | | - sum_num_prefill_tokens.Reset(/*warmed_up=*/true); |
77 | | - sum_num_output_tokens.Reset(/*warmed_up=*/true); |
78 | | - sum_num_accepted_tokens.Reset(/*warmed_up=*/true); |
79 | | - sum_num_draft_tokens.Reset(/*warmed_up=*/true); |
80 | | - last_finished_req_prefill_time.Reset(/*warmed_up=*/true); |
81 | | - last_finished_req_decode_time.Reset(/*warmed_up=*/true); |
82 | | - num_last_finished_req_input_tokens.Reset(/*warmed_up=*/true); |
83 | | - num_last_finished_req_prefill_tokens.Reset(/*warmed_up=*/true); |
84 | | - num_last_finished_req_output_tokens.Reset(/*warmed_up=*/true); |
85 | | - batch_decode_time_list.clear(); |
86 | | - batch_draft_time_list.clear(); |
87 | | - batch_verification_time_list.clear(); |
88 | | - batch_decode_time_list.resize(kMaxEffectiveBatchSize); |
89 | | - batch_draft_time_list.resize(kMaxEffectiveBatchSize); |
90 | | - batch_verification_time_list.resize(kMaxEffectiveBatchSize); |
91 | | - accept_count.clear(); |
92 | | - draft_count.clear(); |
93 | | -} |
| 37 | + ICHECK_EQ(draft_count.size(), accept_count.size()); |
| 38 | + // NOTE: label follows prometheus with full context |
| 39 | + // so it can be flattened and used in metrics reoorting end point |
| 40 | + picojson::object accept_prob_metrics; |
| 41 | + picojson::object accept_rate_metrics; |
| 42 | + picojson::object accept_len_metrics; |
94 | 43 |
|
95 | | -void EngineMetrics::UpdateBatchDecodeTime(int batch_size, double time) { |
96 | | - if (batch_size > kMaxEffectiveBatchSize) { |
97 | | - return; |
98 | | - } |
99 | | - if (batch_decode_time_list[batch_size].label.empty()) { |
100 | | - batch_decode_time_list[batch_size].label = std::to_string(batch_size); |
101 | | - } |
102 | | - batch_decode_time_list[batch_size].Update(time); |
103 | | -} |
| 44 | + double accept_len_value = 0; |
104 | 45 |
|
105 | | -void EngineMetrics::UpdateBatchDraftTime(int batch_size, double time) { |
106 | | - if (batch_size > kMaxEffectiveBatchSize) { |
107 | | - return; |
108 | | - } |
109 | | - if (batch_draft_time_list[batch_size].label.empty()) { |
110 | | - batch_draft_time_list[batch_size].label = std::to_string(batch_size); |
111 | | - } |
112 | | - batch_draft_time_list[batch_size].Update(time); |
113 | | -} |
| 46 | + for (size_t i = 0; draft_count.size(); ++i) { |
| 47 | + std::ostringstream accept_prob_label; |
| 48 | + accept_prob_label << "accept_prob{step=" << i << "}"; |
| 49 | + double accept_prob_value = |
| 50 | + (static_cast<double>(accept_count[i]) / static_cast<double>(draft_count[i])); |
| 51 | + accept_prob_metrics[accept_prob_label.str()] = picojson::value(accept_prob_value); |
| 52 | + accept_len_value += accept_prob_value; |
114 | 53 |
|
115 | | -void EngineMetrics::UpdateBatchVerificationTime(int batch_size, double time) { |
116 | | - if (batch_size > kMaxEffectiveBatchSize) { |
117 | | - return; |
118 | | - } |
119 | | - if (batch_verification_time_list[batch_size].label.empty()) { |
120 | | - batch_verification_time_list[batch_size].label = std::to_string(batch_size); |
| 54 | + std::ostringstream accept_len_label; |
| 55 | + accept_len_label << "accept_len{step=" << i << "}"; |
| 56 | + accept_len_metrics[accept_len_label.str()] = picojson::value(accept_len_value); |
| 57 | + |
| 58 | + if (i != 0) { |
| 59 | + std::ostringstream accept_rate_label; |
| 60 | + accept_rate_label << "accept_rate{step=" << i << "}"; |
| 61 | + double accept_rate_value = |
| 62 | + (static_cast<double>(accept_count[i]) / static_cast<double>(accept_count[i - 1])); |
| 63 | + accept_rate_metrics[accept_rate_label.str()] = picojson::value(accept_rate_value); |
| 64 | + } |
121 | 65 | } |
122 | | - batch_verification_time_list[batch_size].Update(time); |
| 66 | + metrics["accept_prob"] = picojson::value(accept_prob_metrics); |
| 67 | + metrics["accept_rate"] = picojson::value(accept_rate_metrics); |
| 68 | + metrics["accept_len"] = picojson::value(accept_len_metrics); |
| 69 | + |
| 70 | + return picojson::value(metrics); |
123 | 71 | } |
124 | 72 |
|
125 | | -void EngineMetrics::UpdateSpecDecodingStats(int draft_length, int accept_length) { |
126 | | - if (accept_count.size() < draft_length) { |
127 | | - this->accept_count.resize(draft_length, 0); |
128 | | - this->draft_count.resize(draft_length, 0); |
129 | | - } |
130 | | - for (int j = 0; j < draft_length; ++j) { |
131 | | - if (j < accept_length) { |
132 | | - this->accept_count[j]++; |
| 73 | +picojson::value EngineMetrics::AsJSON() const { |
| 74 | + picojson::object metrics; |
| 75 | + metrics["sum_engine_prefill_time"] = picojson::value(sum_engine_prefill_time); |
| 76 | + metrics["sum_engine_decode_time"] = picojson::value(sum_engine_decode_time); |
| 77 | + metrics["sum_num_input_tokens"] = picojson::value(sum_num_input_tokens); |
| 78 | + metrics["sum_num_prefill_tokens"] = picojson::value(sum_num_prefill_tokens); |
| 79 | + metrics["sum_num_output_tokens"] = picojson::value(sum_num_output_tokens); |
| 80 | + |
| 81 | + metrics["last_finished_req_prefill_time"] = picojson::value(last_finished_req_prefill_time); |
| 82 | + metrics["last_finished_req_decode_time"] = picojson::value(last_finished_req_decode_time); |
| 83 | + metrics["last_finished_req_num_input_tokens"] = |
| 84 | + picojson::value(last_finished_req_num_input_tokens); |
| 85 | + metrics["last_finished_req_num_prefill_tokens"] = |
| 86 | + picojson::value(last_finished_req_num_prefill_tokens); |
| 87 | + metrics["last_finished_req_num_output_tokens"] = |
| 88 | + picojson::value(last_finished_req_num_output_tokens); |
| 89 | + |
| 90 | + metrics["spec_decode"] = spec_decode.AsJSON(); |
| 91 | + |
| 92 | + auto f_create_time_list = [](const std::string& label_name, |
| 93 | + const std::vector<TimeCost>& time_list) { |
| 94 | + picojson::object result; |
| 95 | + for (size_t i = 1; i < time_list.size(); ++i) { |
| 96 | + const TimeCost& item = time_list[i]; |
| 97 | + if (item.count == 0) continue; |
| 98 | + std::ostringstream label_mean; |
| 99 | + label_mean << "mean_" << label_name << "{batch_size=" << i << "}"; |
| 100 | + double mean = item.sum / item.count; |
| 101 | + result[label_mean.str()] = picojson::value(mean); |
| 102 | + std::ostringstream label_count; |
| 103 | + label_count << "count_" << label_name << "{batch_size=" << i << "}"; |
| 104 | + result[label_count.str()] = picojson::value(item.count); |
133 | 105 | } |
134 | | - this->draft_count[j]++; |
135 | | - } |
| 106 | + return picojson::value(result); |
| 107 | + }; |
| 108 | + |
| 109 | + metrics["decode_time_by_batch_size"] = |
| 110 | + f_create_time_list("decode_time", decode_time_by_batch_size); |
| 111 | + metrics["draft_time_by_batch_size"] = f_create_time_list("draft_time", draft_time_by_batch_size); |
| 112 | + metrics["verify_time_by_batch_size"] = |
| 113 | + f_create_time_list("verify_time", verify_time_by_batch_size); |
| 114 | + |
| 115 | + return picojson::value(metrics); |
| 116 | +} |
| 117 | + |
| 118 | +void EngineMetrics::Reset() { |
| 119 | + sum_engine_prefill_time = 0.0; |
| 120 | + sum_engine_decode_time = 0.0; |
| 121 | + sum_num_input_tokens = 0; |
| 122 | + sum_num_prefill_tokens = 0; |
| 123 | + sum_num_output_tokens = 0; |
| 124 | + last_finished_req_prefill_time = 0.0; |
| 125 | + last_finished_req_decode_time = 0.0; |
| 126 | + last_finished_req_num_input_tokens = 0.0; |
| 127 | + last_finished_req_num_prefill_tokens = 0.0; |
| 128 | + last_finished_req_num_output_tokens = 0.0; |
| 129 | + spec_decode.Reset(); |
| 130 | + decode_time_by_batch_size.clear(); |
| 131 | + draft_time_by_batch_size.clear(); |
| 132 | + verify_time_by_batch_size.clear(); |
| 133 | + decode_time_by_batch_size.resize(kEndFineGrainedTrackingBatchSize); |
| 134 | + draft_time_by_batch_size.resize(kEndFineGrainedTrackingBatchSize); |
| 135 | + verify_time_by_batch_size.resize(kEndFineGrainedTrackingBatchSize); |
136 | 136 | } |
137 | 137 |
|
138 | 138 | } // namespace serve |
|
0 commit comments