12 changes: 5 additions & 7 deletions cpp/serve/engine_actions/action_commons.cc
@@ -94,8 +94,6 @@ void ProcessFinishedRequestStateEntries(std::vector<RequestStateEntry> finished_
static_cast<double>((trequest_finish - rstate->tprefill_finish).count()) / 1e9;
estate->metrics.sum_num_input_tokens += rsentry->request->num_input_tokens;
estate->metrics.sum_num_prefill_tokens += root_rsentry->num_prefill_tokens;
estate->metrics.sum_request_prefill_time += prefill_elapsed_time;
estate->metrics.sum_request_decode_time += decode_elapsed_time;
int64_t num_output_tokens = 0;
for (const RequestStateEntry& entry : rstate->entries) {
num_output_tokens += entry->mstates[0]->committed_tokens.size();
@@ -105,11 +103,11 @@ void ProcessFinishedRequestStateEntries(std::vector<RequestStateEntry> finished_
num_output_tokens -= rsentry->request->generation_cfg->n;
estate->metrics.sum_num_output_tokens += num_output_tokens;

estate->metrics.num_last_finished_req_input_tokens.Set(rsentry->request->num_input_tokens);
estate->metrics.num_last_finished_req_prefill_tokens.Set(root_rsentry->num_prefill_tokens);
estate->metrics.num_last_finished_req_output_tokens.Set(num_output_tokens);
estate->metrics.last_finished_req_prefill_time.Set(prefill_elapsed_time);
estate->metrics.last_finished_req_decode_time.Set(decode_elapsed_time);
estate->metrics.last_finished_req_num_input_tokens = rsentry->request->num_input_tokens;
estate->metrics.last_finished_req_num_prefill_tokens = root_rsentry->num_prefill_tokens;
estate->metrics.last_finished_req_num_output_tokens = num_output_tokens;
estate->metrics.last_finished_req_prefill_time = prefill_elapsed_time;
estate->metrics.last_finished_req_decode_time = decode_elapsed_time;
}
}
}
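The .Set() gauge calls above become plain assignments, which suggests the corresponding members of EngineMetrics are now plain scalars. A minimal sketch of the assumed declarations (metrics.h is not part of this diff, so the types below are inferred from the usage here and in metrics.cc further down):

    // Hypothetical excerpt of EngineMetrics in cpp/serve/metrics.h (assumed, not shown in this diff).
    double last_finished_req_prefill_time = 0.0;   // prefill seconds of the last finished request
    double last_finished_req_decode_time = 0.0;    // decode seconds of the last finished request
    int64_t last_finished_req_num_input_tokens = 0;
    int64_t last_finished_req_num_prefill_tokens = 0;
    int64_t last_finished_req_num_output_tokens = 0;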
2 changes: 1 addition & 1 deletion cpp/serve/engine_actions/batch_decode.cc
@@ -130,7 +130,7 @@ class BatchDecodeActionObj : public EngineActionObj {
auto tend = std::chrono::high_resolution_clock::now();
double elapsed_time = static_cast<double>((tend - tstart).count()) / 1e9;
estate->metrics.sum_engine_decode_time += elapsed_time;
estate->metrics.UpdateBatchDecodeTime(num_rsentries, elapsed_time);
estate->metrics.UpdateDecodeTimeByBatchSize(num_rsentries, elapsed_time);

return estate->running_queue;
}
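The body of the renamed helper is not shown in this diff. A plausible sketch, assuming it buckets into the decode_time_by_batch_size vector of TimeCost seen in metrics.cc below and mirrors the removed UpdateBatchDecodeTime logic:

    // Hypothetical implementation, inferred from the removed UpdateBatchDecodeTime and the new Reset().
    void EngineMetrics::UpdateDecodeTimeByBatchSize(int batch_size, double time) {
      // Only batch sizes below kEndFineGrainedTrackingBatchSize are tracked at fine granularity.
      if (batch_size >= static_cast<int>(decode_time_by_batch_size.size())) {
        return;
      }
      decode_time_by_batch_size[batch_size].sum += time;
      decode_time_by_batch_size[batch_size].count += 1;
    }

UpdateDraftTimeByBatchSize and UpdateVerifyTimeByBatchSize, used by the other actions in this PR, presumably follow the same pattern over their respective vectors.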
3 changes: 1 addition & 2 deletions cpp/serve/engine_actions/batch_draft.cc
@@ -134,11 +134,10 @@ class BatchDraftActionObj : public EngineActionObj {
&model_workspaces_[0].draft_probs_storage);
for (int i = 0; i < num_rsentries; ++i) {
mstates[i]->AddDraftToken(sample_results[i], draft_token_slots_[i]);
estate->metrics.sum_num_draft_tokens += 1;
}

auto tdraft_end = std::chrono::high_resolution_clock::now();
estate->metrics.UpdateBatchDraftTime(
estate->metrics.UpdateDraftTimeByBatchSize(
num_rsentries, static_cast<double>((tdraft_end - tdraft_start).count()) / 1e9);
}
}
7 changes: 3 additions & 4 deletions cpp/serve/engine_actions/batch_verify.cc
@@ -147,9 +147,8 @@ class BatchVerifyActionObj : public EngineActionObj {
rsentries[i]->mstates[verify_model_id_]->CommitToken(sample_result);
rsentries[i]->mstates[draft_model_id_]->CommitToken(sample_result);
}
estate->metrics.sum_num_accepted_tokens += accept_length;
estate->metrics.UpdateSpecDecodingStats(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
accept_length);
estate->metrics.spec_decode.Update(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
accept_length);
int rollback_length =
std::max(cum_verify_lengths[i + 1] - cum_verify_lengths[i] - accept_length, 0);
// rollback kv cache
@@ -210,7 +209,7 @@ class BatchVerifyActionObj : public EngineActionObj {
auto tend = std::chrono::high_resolution_clock::now();
double elapsed_time = static_cast<double>((tend - tstart).count()) / 1e9;
estate->metrics.sum_engine_decode_time += elapsed_time;
estate->metrics.UpdateBatchVerificationTime(total_verify_length, elapsed_time);
estate->metrics.UpdateVerifyTimeByBatchSize(total_verify_length, elapsed_time);

return estate->running_queue;
}
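spec_decode.Update now carries the per-position bookkeeping that the removed sum_num_accepted_tokens and UpdateSpecDecodingStats calls used to do. Its body is not in this diff; a sketch that mirrors the removed EngineMetrics::UpdateSpecDecodingStats shown further down in metrics.cc:

    // Hypothetical SpecDecodeMetrics::Update, mirroring the removed UpdateSpecDecodingStats.
    void SpecDecodeMetrics::Update(int draft_length, int accept_length) {
      if (static_cast<int>(accept_count.size()) < draft_length) {
        accept_count.resize(draft_length, 0);
        draft_count.resize(draft_length, 0);
      }
      for (int j = 0; j < draft_length; ++j) {
        if (j < accept_length) {
          ++accept_count[j];
        }
        ++draft_count[j];
      }
    }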
3 changes: 1 addition & 2 deletions cpp/serve/engine_actions/eagle_batch_draft.cc
@@ -151,11 +151,10 @@ class EagleBatchDraftActionObj : public EngineActionObj {
// No need to save hidden states as they are not used by subsequent engine actions
for (int i = 0; i < num_rsentries; ++i) {
mstates[i]->AddDraftToken(sample_results[i], draft_token_slots_[i]);
estate->metrics.sum_num_draft_tokens += 1;
}

auto tdraft_end = std::chrono::high_resolution_clock::now();
estate->metrics.UpdateBatchDraftTime(
estate->metrics.UpdateDraftTimeByBatchSize(
num_rsentries, static_cast<double>((tdraft_end - tdraft_start).count()) / 1e9);
}
}
8 changes: 3 additions & 5 deletions cpp/serve/engine_actions/eagle_batch_verify.cc
@@ -153,9 +153,8 @@ class EagleBatchVerifyActionObj : public EngineActionObj {
rsentries[i]->mstates[verify_model_id_]->CommitToken(sample_result);
rsentries[i]->mstates[draft_model_id_]->CommitToken(sample_result);
}
estate->metrics.UpdateSpecDecodingStats(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
accept_length);
estate->metrics.sum_num_accepted_tokens += accept_length - 1;
estate->metrics.spec_decode.Update(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
accept_length);
// - Minus one because the last draft token has no kv cache entry
// - Take max with 0 in case of all accepted.
int rollback_length =
@@ -304,7 +303,7 @@ class EagleBatchVerifyActionObj : public EngineActionObj {
auto tend = std::chrono::high_resolution_clock::now();
double elapsed_time = static_cast<double>((tend - tstart).count()) / 1e9;
estate->metrics.sum_engine_decode_time += elapsed_time;
estate->metrics.UpdateBatchVerificationTime(cum_verify_lengths.back(), elapsed_time);
estate->metrics.UpdateVerifyTimeByBatchSize(cum_verify_lengths.back(), elapsed_time);

return estate->running_queue;
}
@@ -380,7 +379,6 @@ class EagleBatchVerifyActionObj : public EngineActionObj {
}
for (int i = 0; i < static_cast<int>(mstates.size()); ++i) {
mstates[i]->AddDraftToken(sample_results[i], draft_token_slots_[i]);
estate->metrics.sum_num_draft_tokens += 1;
}
}
/*!
1 change: 0 additions & 1 deletion cpp/serve/engine_actions/eagle_new_request_prefill.cc
@@ -340,7 +340,6 @@ class EagleNewRequestPrefillActionObj : public BatchPrefillBaseActionObj {
for (int i = 0; i < static_cast<int>(rsentries_for_sample.size()); ++i) {
rsentries_for_sample[i]->mstates[model_id]->AddDraftToken(sample_results[i],
draft_token_slots_[i]);
estate->metrics.sum_num_draft_tokens += 1;
}
}

212 changes: 106 additions & 106 deletions cpp/serve/metrics.cc
@@ -5,134 +5,134 @@
*/
#include "metrics.h"

#include <tvm/runtime/logging.h>

#include <sstream>

namespace mlc {
namespace llm {
namespace serve {

picojson::value EngineMetrics::AsJSON() const {
picojson::object metrics;
metrics["sum_request_prefill_time"] = sum_request_prefill_time.AsJSON();
metrics["sum_request_decode_time"] = sum_request_decode_time.AsJSON();
metrics["sum_engine_prefill_time"] = sum_engine_prefill_time.AsJSON();
metrics["sum_engine_decode_time"] = sum_engine_decode_time.AsJSON();
metrics["sum_num_input_tokens"] = sum_num_input_tokens.AsJSON();
metrics["sum_num_prefill_tokens"] = sum_num_prefill_tokens.AsJSON();
metrics["sum_num_output_tokens"] = sum_num_output_tokens.AsJSON();
metrics["sum_num_accepted_tokens"] = sum_num_accepted_tokens.AsJSON();
metrics["sum_num_draft_tokens"] = sum_num_draft_tokens.AsJSON();

metrics["last_finished_req_prefill_time"] = last_finished_req_prefill_time.AsJSON();
metrics["last_finished_req_decode_time"] = last_finished_req_decode_time.AsJSON();
metrics["num_last_finished_req_input_tokens"] = num_last_finished_req_input_tokens.AsJSON();
metrics["num_last_finished_req_prefill_tokens"] = num_last_finished_req_prefill_tokens.AsJSON();
metrics["num_last_finished_req_output_tokens"] = num_last_finished_req_output_tokens.AsJSON();

picojson::array batch_decode_time_obj;
picojson::array batch_draft_time_obj;
picojson::array batch_verification_time_obj;
batch_decode_time_obj.reserve(batch_decode_time_list.size());
batch_draft_time_obj.reserve(batch_draft_time_list.size());
batch_verification_time_obj.reserve(batch_verification_time_list.size());
for (const Metric& batch_decode_time : batch_decode_time_list) {
if (batch_decode_time.label.empty()) {
continue;
}
batch_decode_time_obj.push_back(batch_decode_time.AsJSON());
picojson::value TimeCost::AsJSON() const {
picojson::object config;
config["count"] = picojson::value(count);
if (count != 0) {
config["mean"] = picojson::value(sum / count);
}
for (const Metric& batch_draft_time : batch_draft_time_list) {
if (batch_draft_time.label.empty()) {
continue;
}
batch_draft_time_obj.push_back(batch_draft_time.AsJSON());
}
for (const Metric& batch_verification_time : batch_verification_time_list) {
if (batch_verification_time.label.empty()) {
continue;
}
batch_verification_time_obj.push_back(batch_verification_time.AsJSON());
}
metrics["batch_decode_time_per_batch_size"] = picojson::value(batch_decode_time_obj);
metrics["batch_draft_time_per_batch_size"] = picojson::value(batch_draft_time_obj);
metrics["batch_verification_time_per_batch_size"] = picojson::value(batch_verification_time_obj);
return picojson::value(config);
}

picojson::value SpecDecodeMetrics::AsJSON() const {
picojson::object metrics;
auto f_vector_to_array = [](const std::vector<int64_t>& vec) {
picojson::array arr;
for (int64_t v : vec) {
arr.push_back(picojson::value(v));
}
return picojson::value(arr);
};
metrics["accept_count"] = f_vector_to_array(accept_count);
metrics["draft_count"] = f_vector_to_array(draft_count);
return picojson::value(metrics);
}
metrics["accept_count"] = f_vector_to_array(accept_count);

void EngineMetrics::Reset() {
sum_request_prefill_time.Reset(/*warmed_up=*/true);
sum_request_decode_time.Reset(/*warmed_up=*/true);
sum_engine_prefill_time.Reset(/*warmed_up=*/true);
sum_engine_decode_time.Reset(/*warmed_up=*/true);
sum_num_input_tokens.Reset(/*warmed_up=*/true);
sum_num_prefill_tokens.Reset(/*warmed_up=*/true);
sum_num_output_tokens.Reset(/*warmed_up=*/true);
sum_num_accepted_tokens.Reset(/*warmed_up=*/true);
sum_num_draft_tokens.Reset(/*warmed_up=*/true);
last_finished_req_prefill_time.Reset(/*warmed_up=*/true);
last_finished_req_decode_time.Reset(/*warmed_up=*/true);
num_last_finished_req_input_tokens.Reset(/*warmed_up=*/true);
num_last_finished_req_prefill_tokens.Reset(/*warmed_up=*/true);
num_last_finished_req_output_tokens.Reset(/*warmed_up=*/true);
batch_decode_time_list.clear();
batch_draft_time_list.clear();
batch_verification_time_list.clear();
batch_decode_time_list.resize(kMaxEffectiveBatchSize);
batch_draft_time_list.resize(kMaxEffectiveBatchSize);
batch_verification_time_list.resize(kMaxEffectiveBatchSize);
accept_count.clear();
draft_count.clear();
}
ICHECK_EQ(draft_count.size(), accept_count.size());
// NOTE: label follows prometheus with full context
// so it can be flattened and used in the metrics reporting endpoint
picojson::object accept_prob_metrics;
picojson::object accept_rate_metrics;
picojson::object accept_len_metrics;

void EngineMetrics::UpdateBatchDecodeTime(int batch_size, double time) {
if (batch_size > kMaxEffectiveBatchSize) {
return;
}
if (batch_decode_time_list[batch_size].label.empty()) {
batch_decode_time_list[batch_size].label = std::to_string(batch_size);
}
batch_decode_time_list[batch_size].Update(time);
}
double accept_len_value = 0;

void EngineMetrics::UpdateBatchDraftTime(int batch_size, double time) {
if (batch_size > kMaxEffectiveBatchSize) {
return;
}
if (batch_draft_time_list[batch_size].label.empty()) {
batch_draft_time_list[batch_size].label = std::to_string(batch_size);
}
batch_draft_time_list[batch_size].Update(time);
}
for (size_t i = 0; i < draft_count.size(); ++i) {
std::ostringstream accept_prob_label;
accept_prob_label << "accept_prob{step=" << i << "}";
double accept_prob_value =
(static_cast<double>(accept_count[i]) / static_cast<double>(draft_count[i]));
accept_prob_metrics[accept_prob_label.str()] = picojson::value(accept_prob_value);
accept_len_value += accept_prob_value;

void EngineMetrics::UpdateBatchVerificationTime(int batch_size, double time) {
if (batch_size > kMaxEffectiveBatchSize) {
return;
}
if (batch_verification_time_list[batch_size].label.empty()) {
batch_verification_time_list[batch_size].label = std::to_string(batch_size);
std::ostringstream accept_len_label;
accept_len_label << "accept_len{step=" << i << "}";
accept_len_metrics[accept_len_label.str()] = picojson::value(accept_len_value);

if (i != 0) {
std::ostringstream accept_rate_label;
accept_rate_label << "accept_rate{step=" << i << "}";
double accept_rate_value =
(static_cast<double>(accept_count[i]) / static_cast<double>(accept_count[i - 1]));
accept_rate_metrics[accept_rate_label.str()] = picojson::value(accept_rate_value);
}
}
batch_verification_time_list[batch_size].Update(time);
metrics["accept_prob"] = picojson::value(accept_prob_metrics);
metrics["accept_rate"] = picojson::value(accept_rate_metrics);
metrics["accept_len"] = picojson::value(accept_len_metrics);

return picojson::value(metrics);
}

void EngineMetrics::UpdateSpecDecodingStats(int draft_length, int accept_length) {
if (accept_count.size() < draft_length) {
this->accept_count.resize(draft_length, 0);
this->draft_count.resize(draft_length, 0);
}
for (int j = 0; j < draft_length; ++j) {
if (j < accept_length) {
this->accept_count[j]++;
picojson::value EngineMetrics::AsJSON() const {
picojson::object metrics;
metrics["sum_engine_prefill_time"] = picojson::value(sum_engine_prefill_time);
metrics["sum_engine_decode_time"] = picojson::value(sum_engine_decode_time);
metrics["sum_num_input_tokens"] = picojson::value(sum_num_input_tokens);
metrics["sum_num_prefill_tokens"] = picojson::value(sum_num_prefill_tokens);
metrics["sum_num_output_tokens"] = picojson::value(sum_num_output_tokens);

metrics["last_finished_req_prefill_time"] = picojson::value(last_finished_req_prefill_time);
metrics["last_finished_req_decode_time"] = picojson::value(last_finished_req_decode_time);
metrics["last_finished_req_num_input_tokens"] =
picojson::value(last_finished_req_num_input_tokens);
metrics["last_finished_req_num_prefill_tokens"] =
picojson::value(last_finished_req_num_prefill_tokens);
metrics["last_finished_req_num_output_tokens"] =
picojson::value(last_finished_req_num_output_tokens);

metrics["spec_decode"] = spec_decode.AsJSON();

auto f_create_time_list = [](const std::string& label_name,
const std::vector<TimeCost>& time_list) {
picojson::object result;
for (size_t i = 1; i < time_list.size(); ++i) {
const TimeCost& item = time_list[i];
if (item.count == 0) continue;
std::ostringstream label_mean;
label_mean << "mean_" << label_name << "{batch_size=" << i << "}";
double mean = item.sum / item.count;
result[label_mean.str()] = picojson::value(mean);
std::ostringstream label_count;
label_count << "count_" << label_name << "{batch_size=" << i << "}";
result[label_count.str()] = picojson::value(item.count);
}
this->draft_count[j]++;
}
return picojson::value(result);
};

metrics["decode_time_by_batch_size"] =
f_create_time_list("decode_time", decode_time_by_batch_size);
metrics["draft_time_by_batch_size"] = f_create_time_list("draft_time", draft_time_by_batch_size);
metrics["verify_time_by_batch_size"] =
f_create_time_list("verify_time", verify_time_by_batch_size);

return picojson::value(metrics);
}

void EngineMetrics::Reset() {
sum_engine_prefill_time = 0.0;
sum_engine_decode_time = 0.0;
sum_num_input_tokens = 0;
sum_num_prefill_tokens = 0;
sum_num_output_tokens = 0;
last_finished_req_prefill_time = 0.0;
last_finished_req_decode_time = 0.0;
last_finished_req_num_input_tokens = 0.0;
last_finished_req_num_prefill_tokens = 0.0;
last_finished_req_num_output_tokens = 0.0;
spec_decode.Reset();
decode_time_by_batch_size.clear();
draft_time_by_batch_size.clear();
verify_time_by_batch_size.clear();
decode_time_by_batch_size.resize(kEndFineGrainedTrackingBatchSize);
draft_time_by_batch_size.resize(kEndFineGrainedTrackingBatchSize);
verify_time_by_batch_size.resize(kEndFineGrainedTrackingBatchSize);
}

} // namespace serve
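As a worked example of how SpecDecodeMetrics::AsJSON flattens its counters into Prometheus-style labels (the numbers below are made up for illustration):

    // Suppose three verify rounds each drafted 3 tokens:
    //   draft_count  = {3, 3, 3}
    //   accept_count = {3, 2, 1}
    // Then AsJSON() reports
    //   accept_prob{step=0} = 3/3 = 1.0
    //   accept_prob{step=1} = 2/3 ≈ 0.667
    //   accept_prob{step=2} = 1/3 ≈ 0.333
    //   accept_len{step=i}  = running sum of accept_prob: 1.0, 1.667, 2.0
    //   accept_rate{step=1} = accept_count[1] / accept_count[0] ≈ 0.667
    //   accept_rate{step=2} = accept_count[2] / accept_count[1] = 0.5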