|
| 1 | +syntax = "proto3"; |
| 2 | + |
| 3 | +package tpu.monitoring.runtime; |
| 4 | + |
| 5 | +import "google/protobuf/timestamp.proto"; |
| 6 | + |
| 7 | + |
| 8 | + |
// Code-generation options.

// Emit one Java source file per top-level message/service.
option java_multiple_files = true;
// Java package for the generated classes.
option java_package = "com.google.tpu.monitoring.runtime.service.proto";
// Prefix applied to generated Objective-C class names.
// NOTE(review): "GRPC" is unrelated to the tpu.monitoring.runtime package
// name — confirm it is the intended prefix. Changing it renames generated
// Objective-C classes, so it is left as-is here.
option objc_class_prefix = "GRPC";
| 12 | + |
| 13 | + |
// Exemplar is a single recorded value together with a timestamp and the
// attributes attached to it. It is referenced by Distribution.exemplars and
// Counter.exemplar.
message Exemplar {
  // The recorded value.
  double value = 1;

  // Time associated with the value.
  // NOTE(review): presumably the time the value was observed — confirm with
  // the producer.
  .google.protobuf.Timestamp timestamp = 2;

  // Key-value attributes attached to this exemplar. May be empty.
  repeated Attribute attributes = 3;
}
| 19 | + |
// Distribution is a histogram-style measure: summary statistics plus
// per-bucket counts whose boundaries are described by `bucket_options`.
//
// NOTE(review): the statistics fields (1-5) and `bucket_counts` carry no
// documentation in the original schema. The descriptions below follow the
// field names and the similarly shaped google.api.Distribution message —
// confirm with the producer.
message Distribution {
  // BucketOptions describes how the histogram buckets are laid out. At most
  // one bucketing scheme is set.
  message BucketOptions {
    // Deprecated bucket layout expressed as a list of Bound entries.
    message Regular {
      option deprecated = true;

      int32 num_finite_buckets = 1;

      // For a linear distribution there is a single bound holding the overall
      // width and the offset of the lowest bucket.
      // For an explicit distribution the buckets are monotonically
      // increasing, each bound holding its width and its offset from the
      // previous bucket.
      repeated Bound bounds = 2;
    }

    // Exponential bucket layout.
    // NOTE(review): the boundary formula is not stated in this schema; if it
    // mirrors google.api.Distribution, bucket i would be bounded by
    // scale * growth_factor^(i - 1) and scale * growth_factor^i — confirm.
    message Exponential {
      // Must be greater than 0.
      int32 num_finite_buckets = 1;

      // Must be greater than 1.
      double growth_factor = 2;

      // Must be greater than 0.
      double scale = 3;
    }

    // Deprecated width/offset pair; referenced by Regular.bounds.
    message Bound {
      option deprecated = true;

      double width = 1;
      double offset = 2;
    }

    // Linear describes a sequence of buckets that all share the same width
    // (apart from the overflow and underflow buckets). Each bucket represents
    // a constant absolute uncertainty on the specific value in the bucket.
    //
    // There are `num_finite_buckets + 2` (= N) buckets. Bucket `i` has the
    // following boundaries:
    //
    //    Upper bound (0 <= i < N-1): offset + (width * i).
    //
    //    Lower bound (1 <= i < N):   offset + (width * (i - 1)).
    message Linear {
      // Must be greater than 0.
      int32 num_finite_buckets = 1;

      // Must be greater than 0.
      double width = 2;

      // Lower bound of the first bucket.
      double offset = 3;
    }

    // Explicit describes a set of buckets with arbitrary widths.
    //
    // There are `size(bounds) + 1` (= N) buckets. Bucket `i` has the
    // following boundaries:
    //
    //    Upper bound (0 <= i < N-1): bounds[i]
    //    Lower bound (1 <= i < N):   bounds[i - 1]
    //
    // The `bounds` field must contain at least one element. If `bounds` has
    // only one element, then there are no finite buckets, and that single
    // element is the common boundary of the overflow and underflow buckets.
    message Explicit {
      // The values must be monotonically increasing.
      repeated double bounds = 1;
    }

    // The bucketing scheme in use.
    oneof options {
      // Deprecated together with the Regular message.
      Regular regular_buckets = 1 [deprecated = true];
      Exponential exponential_buckets = 2;
      Explicit explicit_buckets = 3;
      Linear linear_buckets = 4;
    }
  }

  // Number of values in the distribution (see review note above).
  int64 count = 1;

  // Mean of the values (see review note above).
  double mean = 2;

  // Smallest value (see review note above).
  double min = 3;

  // Largest value (see review note above).
  double max = 4;

  // Sum of squared deviations (see review note above; presumably taken
  // around `mean`).
  double sum_of_squared_deviation = 5;

  // Defines the histogram bucket boundaries.
  BucketOptions bucket_options = 6;

  // Per-bucket counts.
  // NOTE(review): presumably ordered to match the buckets defined by
  // `bucket_options` — confirm.
  repeated int64 bucket_counts = 7;

  // Exemplars attached to this distribution. May be empty.
  repeated Exemplar exemplars = 8;
}
| 102 | + |
// Gauge is a single-point measure. The reading is carried by at most one of
// the typed members of `value`.
message Gauge {
  oneof value {
    // Reading expressed as a floating-point number.
    double as_double = 1;

    // Reading expressed as a signed integer.
    int64 as_int = 2;

    // Reading expressed as text.
    string as_string = 3;

    // Reading expressed as a boolean.
    bool as_bool = 4;
  }
}
| 112 | + |
// Counter is a measure that only increases, until it is reset to zero.
message Counter {
  // The counter value. It MUST NOT be negative: `as_int` is unsigned by
  // type, while a producer setting `as_double` has to uphold this itself.
  oneof value {
    double as_double = 1;
    uint64 as_int = 2;
  }

  // Exemplar attached to this counter, if any.
  Exemplar exemplar = 3;
}
| 122 | + |
// Quantile is the value observed at one quantile of a distribution.
message Quantile {
  // Which quantile this entry describes. Must lie in the closed interval
  // [0.0, 1.0].
  double quantile = 1;

  // The value at that quantile. Must NOT be negative.
  double value = 2;
}
| 131 | + |
// Summary holds sampled observations at a set of quantiles, together with
// the sum of all observations and the total number of observations.
message Summary {
  // Total count of observations.
  uint64 sample_count = 1;

  // Sum of all the observations.
  double sample_sum = 2;

  // Values at the sampled quantiles. May be empty.
  repeated Quantile quantile = 3;
}
| 139 | + |
// AttrValue is the value side of an Attribute. It holds at most one of the
// typed members of `attr`; an AttrValue with none of them set is considered
// "empty".
message AttrValue {
  oneof attr {
    // Text value.
    string string_attr = 1;

    // Boolean value.
    bool bool_attr = 2;

    // Signed integer value.
    int64 int_attr = 3;

    // Floating-point value.
    double double_attr = 4;

    // A list of nested AttrValue messages.
    ArrayAttrValue array_attr = 5;

    // A list of nested key/value Attribute messages.
    KeyValueList kvlist_attr = 6;

    // Raw bytes value.
    bytes bytes_attr = 7;
  }
}
| 153 | + |
// ArrayAttrValue wraps a list of AttrValue messages so that a list can be
// used as a member of the AttrValue oneof.
message ArrayAttrValue {
  // The attribute values. The array may be empty (contain 0 elements).
  repeated AttrValue attrs = 1;
}
| 159 | + |
// KeyValueList wraps a list of key/AttrValue pairs (Attribute messages).
message KeyValueList {
  // The key/value attributes. The list may be empty.
  // Keys MUST be unique within the list.
  repeated Attribute attributes = 1;
}
| 166 | + |
// Attribute is one key-value pair describing a metric, for example the
// device-id or the host-id the metric belongs to.
message Attribute {
  // Attribute name.
  string key = 1;

  // Attribute value.
  AttrValue value = 2;
}
| 173 | + |
// Metric is one metric datapoint: an attribute, a reporting time, and a
// measure value.
message Metric {
  // Attribute describing this datapoint.
  Attribute attribute = 1;

  // Reporting time of this datapoint.
  .google.protobuf.Timestamp timestamp = 2;

  // The measured value; at most one kind of measure is set.
  oneof measure {
    Gauge gauge = 3;
    Counter counter = 4;
    Distribution distribution = 5;
    Summary summary = 6;
  }
}
| 186 | + |
// TPUMetric is a standalone metric object that is exposed externally to a
// consumer. It is the payload of MetricResponse.
message TPUMetric {
  // Metric name.
  string name = 1;

  // Human-readable description of the metric.
  string description = 2;

  // Datapoints reported for this metric. May be empty.
  repeated Metric metrics = 3;
}
| 193 | + |
// MetricRequest is the request message for
// RuntimeMetricService.GetRuntimeMetric. It carries the name of the metric
// to fetch from libTPU.
message MetricRequest {
  // Name of the metric to fetch.
  string metric_name = 1;

  // Lets the client skip the aggregated lookup of metrics for a worker node.
  // When this field is unset or false, an aggregated view of the metrics for
  // the TPU worker node is returned.
  //
  // The aggregation feature is enabled by libTPU during initialization. By
  // default, worker node aggregation is turned on in libTPU if the metrics
  // server is supported. If libTPU initialization explicitly turns the
  // feature off, the aggregated view is not provided.
  bool skip_node_aggregation = 2;
}
| 208 | + |
// MetricResponse is the response message for
// RuntimeMetricService.GetRuntimeMetric. It wraps the TPUMetric that holds
// the data for the requested metric.
message MetricResponse {
  // Data for the requested metric.
  TPUMetric metric = 1;
}
| 215 | + |
// ListSupportedMetricsRequest is the request message for
// RuntimeMetricService.ListSupportedMetrics.
// An empty request means no filters: every metric supported by libTPU is
// returned in the response.
message ListSupportedMetricsRequest {
  // Regex filter applied to the supported metrics.
  // If the field is empty or not set, no filter is applied and all the
  // supported metrics are returned.
  //
  // Example: `.*memory.*`, `.*memory.*|.*duty_cycle.*`
  string filter = 1;
}
| 228 | + |
// SupportedMetric identifies one supported metric. It is the element type of
// ListSupportedMetricsResponse.supported_metric.
message SupportedMetric {
  // Name of the supported metric.
  // NOTE(review): presumably the same name accepted by
  // MetricRequest.metric_name — confirm.
  string metric_name = 1;
}
| 232 | + |
// ListSupportedMetricsResponse is the response message for
// RuntimeMetricService.ListSupportedMetrics.
// It contains the metrics supported by libTPU that match the
// ListSupportedMetricsRequest.
message ListSupportedMetricsResponse {
  // The supported metrics. May be empty.
  repeated SupportedMetric supported_metric = 1;
}
| 241 | + |
// RuntimeMetricService serves TPU runtime metrics: one RPC fetches the data
// for a named metric, the other lists which metrics are supported.
service RuntimeMetricService {
  // Returns the TPU metric data for the metric named in the MetricRequest.
  rpc GetRuntimeMetric(MetricRequest) returns (MetricResponse);

  // Lists the supported metrics for the given ListSupportedMetricsRequest.
  rpc ListSupportedMetrics(ListSupportedMetricsRequest)
      returns (ListSupportedMetricsResponse);
}
0 commit comments