Skip to content

Commit 6439522

Browse files
authored
Add tpu_metric_service.proto (#1)
1 parent d2985f4 commit 6439522

File tree

1 file changed

+250
-0
lines changed

1 file changed

+250
-0
lines changed
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
syntax = "proto3";

package tpu.monitoring.runtime;

import "google/protobuf/timestamp.proto";

// Code generation options for the Java and Objective-C targets.
option java_multiple_files = true;
option objc_class_prefix = "GRPC";
option java_package = "com.google.tpu.monitoring.runtime.service.proto";
12+
13+
14+
// Exemplar is a single example data point: a value, a timestamp and a list
// of attributes.
message Exemplar {
  // The exemplar's value.
  double value = 1;
  // Timestamp attached to the exemplar.
  // NOTE(review): presumably the time the value was observed - confirm.
  .google.protobuf.Timestamp timestamp = 2;
  // Attributes attached to this exemplar.
  repeated Attribute attributes = 3;
}
19+
20+
// Distribution carries summary statistics for a set of values together with
// the histogram bucket layout, per-bucket counts and exemplars.
message Distribution {
  // NOTE(review): presumably the number of values in the distribution -
  // confirm.
  int64 count = 1;
  double mean = 2;
  double min = 3;
  double max = 4;
  // NOTE(review): presumably the sum of squared deviations from the mean -
  // confirm.
  double sum_of_squared_deviation = 5;

  // BucketOptions selects one of several ways of describing the histogram
  // bucket boundaries.
  message BucketOptions {
    // Exactly one bucket layout may be set.
    oneof options {
      // Deprecated together with the Regular message.
      Regular regular_buckets = 1 [deprecated = true];
      Exponential exponential_buckets = 2;
      Explicit explicit_buckets = 3;
      Linear linear_buckets = 4;
    }

    // Deprecated bucket layout expressed as a list of Bound entries.
    // NOTE(review): presumably superseded by Linear and Explicit - confirm.
    message Regular {
      option deprecated = true;

      int32 num_finite_buckets = 1;
      // A linear distribution has only one bound with overall width and
      // offset of the lowest bucket.
      // An explicit distribution will have monotonically increasing buckets
      // with width and the offset from the previous bucket.
      repeated Bound bounds = 2;
    }

    // Bucket layout parameterized by a growth factor and a scale.
    message Exponential {
      // Must be greater than 0.
      int32 num_finite_buckets = 1;
      // Must be greater than 1.
      double growth_factor = 2;
      // Must be greater than 0.
      double scale = 3;
    }

    // Deprecated width/offset pair used by Regular.
    message Bound {
      option deprecated = true;

      double width = 1;
      double offset = 2;
    }

    // Specifies a linear sequence of buckets that all have the same width
    // (except overflow and underflow). Each bucket represents a constant
    // absolute uncertainty on the specific value in the bucket.
    //
    // There are `num_finite_buckets + 2` (= N) buckets. Bucket `i` has the
    // following boundaries:
    //
    //   Upper bound (0 <= i < N-1): offset + (width * i).
    //
    //   Lower bound (1 <= i < N): offset + (width * (i - 1)).
    message Linear {
      // Must be greater than 0.
      int32 num_finite_buckets = 1;

      // Must be greater than 0.
      double width = 2;

      // Lower bound of the first bucket.
      double offset = 3;
    }

    // Specifies a set of buckets with arbitrary widths.
    //
    // There are `size(bounds) + 1` (= N) buckets. Bucket `i` has the
    // following boundaries:
    //
    //   Upper bound (0 <= i < N-1): bounds[i]
    //   Lower bound (1 <= i < N): bounds[i - 1]
    //
    // The `bounds` field must contain at least one element. If `bounds` has
    // only one element, then there are no finite buckets, and that single
    // element is the common boundary of the overflow and underflow buckets.
    message Explicit {
      // The values must be monotonically increasing.
      repeated double bounds = 1;
    }
  }

  // Defines the histogram bucket boundaries.
  BucketOptions bucket_options = 6;
  // NOTE(review): presumably one count per bucket described by
  // `bucket_options` - confirm.
  repeated int64 bucket_counts = 7;
  // Exemplars attached to this distribution.
  repeated Exemplar exemplars = 8;
}
102+
103+
// Gauge represents a single-point measure.
message Gauge {
  // The measured value; exactly one representation may be set.
  oneof value {
    double as_double = 1;
    int64 as_int = 2;
    string as_string = 3;
    bool as_bool = 4;
  }
}
112+
113+
// Counter is a monotonically increasing measure (until reset to zero).
message Counter {
  // The counter value, as either a double or an unsigned integer.
  // The value MUST NOT be negative.
  oneof value {
    double as_double = 1;
    uint64 as_int = 2;
  }
  // Exemplar attached to this counter.
  Exemplar exemplar = 3;
}
122+
123+
// Quantile represents the value at a given quantile of a distribution.
message Quantile {
  // The quantile of a distribution. Must be in the interval [0.0, 1.0].
  double quantile = 1;
  // The value at the given quantile of a distribution.
  // Quantile values must NOT be negative.
  double value = 2;
}
131+
132+
// Summary represents observed sampling for different quantiles, including
// the sum of all the observations and the total count of observations.
message Summary {
  // Total count of observations.
  uint64 sample_count = 1;
  // Sum of all the observations.
  double sample_sum = 2;
  // Sampled values at the reported quantiles.
  repeated Quantile quantile = 3;
}
139+
140+
// AttrValue represents an attribute value.
// AttrValue is considered to be "empty" if all values are unspecified.
message AttrValue {
  // The attribute payload; at most one of the typed variants is set.
  oneof attr {
    string string_attr = 1;
    bool bool_attr = 2;
    int64 int_attr = 3;
    double double_attr = 4;
    // A nested list of attribute values.
    ArrayAttrValue array_attr = 5;
    // A nested list of key/value attributes.
    KeyValueList kvlist_attr = 6;
    bytes bytes_attr = 7;
  }
}
153+
154+
// ArrayAttrValue is a list of AttrValue messages.
message ArrayAttrValue {
  // Array of attribute values. The array may be empty (contain 0 elements).
  repeated AttrValue attrs = 1;
}
159+
160+
// KeyValueList is a list of Key-AttrValue messages.
message KeyValueList {
  // A collection of key/value attributes. The list may be empty.
  // The keys in attributes MUST be unique.
  repeated Attribute attributes = 1;
}
166+
167+
// Attribute is a key-value pair to store the attributes of a metric.
// For example, device-id of the metric, host-id of the metric.
message Attribute {
  // The attribute key.
  string key = 1;
  // The attribute value.
  AttrValue value = 2;
}
173+
174+
// Metric represents a metric datapoint.
// A metric has a reporting time, an attribute and a measure value.
message Metric {
  // The attribute of this datapoint.
  Attribute attribute = 1;
  // The reporting time of this datapoint.
  .google.protobuf.Timestamp timestamp = 2;
  // The measure value; exactly one kind of measure may be set.
  oneof measure {
    Gauge gauge = 3;
    Counter counter = 4;
    Distribution distribution = 5;
    Summary summary = 6;
  }
}
186+
187+
// TPUMetric is a standalone metric object, exposed externally to a consumer.
message TPUMetric {
  // Name of the metric.
  string name = 1;
  // Description of the metric.
  string description = 2;
  // Datapoints reported for this metric.
  repeated Metric metrics = 3;
}
193+
194+
// MetricRequest is the request object to fetch metrics from LibTPU.
// MetricRequest contains the metric name with which metrics can be fetched
// from RuntimeMetricService.GetRuntimeMetric.
message MetricRequest {
  // Name of the metric to fetch.
  string metric_name = 1;
  // skip_node_aggregation lets the client skip the aggregated lookup of
  // metrics for a worker node. If the field is unset or set to false, an
  // aggregated view of metrics for a TPU worker node is provided.
  // The aggregation feature is enabled by libTPU during initialization.
  // By default, worker node aggregation is turned on in libTPU if the
  // metrics server is supported. If the libTPU initialization turns the
  // feature off explicitly, the aggregated view is not provided.
  bool skip_node_aggregation = 2;
}
208+
209+
// MetricResponse is the response object for
// RuntimeMetricService.GetRuntimeMetric.
// The response contains the TPUMetric which holds the metric data for the
// requested metric.
message MetricResponse {
  // Metric data for the requested metric.
  TPUMetric metric = 1;
}
215+
216+
// ListSupportedMetricsRequest is the request object for
// RuntimeMetricService.ListSupportedMetrics.
// An empty request means no filters: all the metrics supported by LibTPU
// are returned in the response.
message ListSupportedMetricsRequest {
  // A regex filter to apply to the supported metrics.
  // If the field is empty or not set, no filter is applied and all the
  // supported metrics are returned.
  //
  // Example: `.*memory.*`, `.*memory.*|.*duty_cycle.*`
  string filter = 1;
}
228+
229+
// SupportedMetric is one entry of ListSupportedMetricsResponse.
message SupportedMetric {
  // Name of the supported metric.
  string metric_name = 1;
}
232+
233+
// ListSupportedMetricsResponse is the response object for
// RuntimeMetricService.ListSupportedMetrics.
// It contains all the metrics supported in LibTPU for the
// ListSupportedMetricsRequest.
message ListSupportedMetricsResponse {
  // List of supported metrics.
  repeated SupportedMetric supported_metric = 1;
}
241+
242+
// RuntimeMetricService exposes TPU metric data and the list of supported
// metrics.
service RuntimeMetricService {
  // GetRuntimeMetric returns the TPU metrics data for the MetricRequest.
  rpc GetRuntimeMetric(MetricRequest) returns (MetricResponse);

  // ListSupportedMetrics lists the supported metrics for
  // ListSupportedMetricsRequest.
  rpc ListSupportedMetrics(ListSupportedMetricsRequest)
      returns (ListSupportedMetricsResponse);
}

0 commit comments

Comments
 (0)