3131 "VoiceSelectionParams" ,
3232 "AudioConfig" ,
3333 "SynthesizeSpeechResponse" ,
34+ "Timepoint" ,
3435 },
3536)
3637
@@ -52,7 +53,9 @@ class AudioEncoding(proto.Enum):
5253 AUDIO_ENCODING_UNSPECIFIED = 0
5354 LINEAR16 = 1
5455 MP3 = 2
56+ MP3_64_KBPS = 4
5557 OGG_OPUS = 3
58+ MULAW = 5
5659
5760
5861class ListVoicesRequest (proto .Message ):
@@ -128,14 +131,26 @@ class SynthesizeSpeechRequest(proto.Message):
128131 audio_config (~.cloud_tts.AudioConfig):
129132 Required. The configuration of the
130133 synthesized audio.
134+ enable_time_pointing (Sequence[~.cloud_tts.SynthesizeSpeechRequest.TimepointType]):
135+ Whether and what timepoints should be
136+ returned in the response.
131137 """
132138
class TimepointType(proto.Enum):
    r"""Kinds of timepoint information that can be returned in the
    response.
    """

    TIMEPOINT_TYPE_UNSPECIFIED = 0
    SSML_MARK = 1
145+
133146 input = proto .Field (proto .MESSAGE , number = 1 , message = "SynthesisInput" )
134147
135148 voice = proto .Field (proto .MESSAGE , number = 2 , message = "VoiceSelectionParams" )
136149
137150 audio_config = proto .Field (proto .MESSAGE , number = 3 , message = "AudioConfig" )
138151
152+ enable_time_pointing = proto .RepeatedField (proto .ENUM , number = 4 , enum = TimepointType )
153+
139154
140155class SynthesisInput (proto .Message ):
141156 r"""Contains text input to be synthesized. Either ``text`` or ``ssml``
@@ -270,9 +285,37 @@ class SynthesizeSpeechResponse(proto.Message):
270285 include the WAV header. Note: as with all bytes fields,
271286 protobuffers use a pure binary representation, whereas JSON
272287 representations use base64.
288+ timepoints (Sequence[~.cloud_tts.Timepoint]):
289+ A link between a position in the original request input and
290+ a corresponding time in the output audio. It's only
291+ supported via ``<mark>`` of SSML input.
292+ audio_config (~.cloud_tts.AudioConfig):
293+ The audio metadata of ``audio_content``.
273294 """
274295
# Bytes field carrying the audio data described in the class docstring.
audio_content = proto.Field(proto.BYTES, number=1)

# Repeated ``Timepoint`` messages linking input positions to audio times.
timepoints = proto.RepeatedField(proto.MESSAGE, number=2, message="Timepoint")

# Audio metadata of ``audio_content``. The message type is referenced by
# name, matching the other message-typed fields in this module, so the
# declaration does not depend on ``AudioConfig`` already being bound when
# this class body executes.
audio_config = proto.Field(proto.MESSAGE, number=4, message="AudioConfig")
301+
302+
class Timepoint(proto.Message):
    r"""Maps a particular point in the input text to the matching time
    in the output audio.

    Attributes:
        mark_name (str):
            Name of the timepoint, as received from the client within
            the ``<mark>`` tag.
        time_seconds (float):
            Offset of the timepoint, in seconds, measured from the
            start of the synthesized audio.
    """

    # NOTE(review): the field numbers are not in declaration order (4, then
    # 3); presumably they mirror the upstream .proto definition -- confirm
    # before renumbering, since these values identify the fields on the wire.
    mark_name = proto.Field(
        proto.STRING,
        number=4,
    )

    time_seconds = proto.Field(
        proto.DOUBLE,
        number=3,
    )
319+
277320
# Public API of this module: every name in the protobuf manifest, sorted.
__all__ = tuple(sorted(__protobuf__.manifest))
0 commit comments