Skip to content

Commit feb2680

Browse files
Implement wait_for_complete() for Speaker class
1 parent b233b60 commit feb2680

File tree

8 files changed

+113
-9
lines changed

8 files changed

+113
-9
lines changed

deepgram/audio/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,5 @@
1818
CHANNELS as OUTPUT_CHANNELS,
1919
RATE as OUTPUT_RATE,
2020
CHUNK as OUTPUT_CHUNK,
21+
PLAYBACK_DELTA as OUTPUT_PLAYBACK_DELTA,
2122
)

deepgram/audio/speaker/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44

55
from .speaker import Speaker
66
from .errors import DeepgramSpeakerError
7-
from .constants import LOGGING, CHANNELS, RATE, CHUNK
7+
from .constants import LOGGING, CHANNELS, RATE, CHUNK, PLAYBACK_DELTA

deepgram/audio/speaker/constants.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44

55
from ...utils import verboselogs
66

7-
# Constants for microphone
7+
# Constants for speaker
88
LOGGING = verboselogs.WARNING
99
TIMEOUT = 0.050
1010
CHANNELS = 1
1111
RATE = 16000
1212
CHUNK = 8194
13+
14+
# Playback-completion threshold in milliseconds (default for Speaker's last_play_delta_in_ms)
15+
PLAYBACK_DELTA = 2000

deepgram/audio/speaker/speaker.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,18 @@
88
import threading
99
from typing import Optional, Callable, Union, TYPE_CHECKING
1010
import logging
11+
from datetime import datetime
1112

1213
import websockets
1314

1415
from ...utils import verboselogs
15-
from .constants import LOGGING, CHANNELS, RATE, CHUNK, TIMEOUT
16+
from .constants import LOGGING, CHANNELS, RATE, CHUNK, TIMEOUT, PLAYBACK_DELTA
1617

1718
if TYPE_CHECKING:
1819
import pyaudio
1920

21+
HALF_SECOND = 0.5
22+
2023

2124
class Speaker: # pylint: disable=too-many-instance-attributes
2225
"""
@@ -33,6 +36,11 @@ class Speaker: # pylint: disable=too-many-instance-attributes
3336
_channels: int
3437
_output_device_index: Optional[int] = None
3538

39+
# last time an audio chunk was dequeued for playback (also reset when wait_for_complete() starts)
40+
_last_datagram: datetime = datetime.now()
41+
_last_play_delta_in_ms: int
42+
_lock_wait: threading.Lock
43+
3644
_queue: queue.Queue
3745
_exit: threading.Event
3846

@@ -56,6 +64,7 @@ def __init__(
5664
rate: int = RATE,
5765
chunk: int = CHUNK,
5866
channels: int = CHANNELS,
67+
last_play_delta_in_ms: int = PLAYBACK_DELTA,
5968
output_device_index: Optional[int] = None,
6069
): # pylint: disable=too-many-positional-arguments
6170
# dynamic import of pyaudio as not to force the requirements on the SDK (and users)
@@ -68,11 +77,15 @@ def __init__(
6877
self._exit = threading.Event()
6978
self._queue = queue.Queue()
7079

80+
self._last_datagram = datetime.now()
81+
self._lock_wait = threading.Lock()
82+
7183
self._audio = pyaudio.PyAudio()
7284
self._chunk = chunk
7385
self._rate = rate
7486
self._format = pyaudio.paInt16
7587
self._channels = channels
88+
self._last_play_delta_in_ms = last_play_delta_in_ms
7689
self._output_device_index = output_device_index
7790

7891
self._push_callback_org = push_callback
@@ -192,6 +205,42 @@ def start(self, active_loop: Optional[asyncio.AbstractEventLoop] = None) -> bool
192205

193206
return True
194207

208+
def wait_for_complete(self):
209+
"""
210+
This method will block until the speaker is done playing sound.
211+
"""
212+
self._logger.debug("Speaker.wait_for_complete ENTER")
213+
214+
delta_in_ms = float(self._last_play_delta_in_ms)
215+
self._logger.debug("Last Play delta: %f", delta_in_ms)
216+
217+
# reset the last-audio timestamp so the quiet period is measured from this call
218+
with self._lock_wait:
219+
self._last_datagram = datetime.now()
220+
221+
while True:
222+
# sleep for a bit
223+
self._exit.wait(HALF_SECOND)
224+
225+
# check if we should exit
226+
if self._exit.is_set():
227+
self._logger.debug("Exiting wait_for_complete _exit is set")
228+
break
229+
230+
# check the time
231+
with self._lock_wait:
232+
delta = datetime.now() - self._last_datagram
233+
diff_in_ms = delta.total_seconds() * 1000
234+
if diff_in_ms < delta_in_ms:
235+
self._logger.debug("LastPlay delta is less than threshold")
236+
continue
237+
238+
# if we get here, we are done playing audio
239+
self._logger.debug("LastPlay delta is greater than threshold. Exit wait!")
240+
break
241+
242+
self._logger.debug("Speaker.wait_for_complete LEAVE")
243+
195244
def _start_receiver(self):
196245
# Check if the socket is an asyncio WebSocket
197246
if inspect.iscoroutinefunction(self._pull_callback_org):
@@ -315,6 +364,8 @@ def _play(self, audio_out, stream, stop):
315364
while not stop.is_set():
316365
try:
317366
data = audio_out.get(True, TIMEOUT)
367+
with self._lock_wait:
368+
self._last_datagram = datetime.now()
318369
stream.write(data)
319370
except queue.Empty:
320371
pass

deepgram/clients/speak/v1/websocket/async_client.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
)
2828
from .options import SpeakWSOptions
2929

30-
from .....audio.speaker import Speaker, RATE, CHANNELS
30+
from .....audio.speaker import Speaker, RATE, CHANNELS, PLAYBACK_DELTA
3131

3232
ONE_SECOND = 1
3333
HALF_SECOND = 0.5
@@ -93,6 +93,11 @@ def __init__(self, config: DeepgramClientOptions):
9393
channels = self._config.options.get("speaker_playback_channels")
9494
if channels is None:
9595
channels = CHANNELS
96+
playback_delta_in_ms = self._config.options.get(
97+
"speaker_playback_delta_in_ms"
98+
)
99+
if playback_delta_in_ms is None:
100+
playback_delta_in_ms = PLAYBACK_DELTA
96101
device_index = self._config.options.get("speaker_playback_device_index")
97102

98103
self._logger.debug("rate: %s", rate)
@@ -103,13 +108,15 @@ def __init__(self, config: DeepgramClientOptions):
103108
self._speaker = Speaker(
104109
rate=rate,
105110
channels=channels,
111+
last_play_delta_in_ms=playback_delta_in_ms,
106112
verbose=self._config.verbose,
107113
output_device_index=device_index,
108114
)
109115
else:
110116
self._speaker = Speaker(
111117
rate=rate,
112118
channels=channels,
119+
last_play_delta_in_ms=playback_delta_in_ms,
113120
verbose=self._config.verbose,
114121
)
115122

@@ -590,6 +597,21 @@ async def clear(self) -> bool:
590597

591598
return True
592599

600+
async def wait_for_complete(self):
601+
"""
602+
This method will block until the speaker is done playing sound.
603+
"""
604+
self._logger.spam("AsyncSpeakWebSocketClient.wait_for_complete ENTER")
605+
606+
if self._speaker is None:
607+
self._logger.error("speaker is None. Return immediately")
608+
return
609+
610+
loop = asyncio.get_event_loop()
611+
await loop.run_in_executor(None, self._speaker.wait_for_complete)
612+
self._logger.notice("wait_for_complete succeeded")
613+
self._logger.spam("AsyncSpeakWebSocketClient.wait_for_complete LEAVE")
614+
593615
async def _close_message(self) -> bool:
594616
return await self.send_control(SpeakWebSocketMessage.Close)
595617

deepgram/clients/speak/v1/websocket/client.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
)
2828
from .options import SpeakWSOptions
2929

30-
from .....audio.speaker import Speaker, RATE, CHANNELS
30+
from .....audio.speaker import Speaker, RATE, CHANNELS, PLAYBACK_DELTA
3131

3232
ONE_SECOND = 1
3333
HALF_SECOND = 0.5
@@ -96,6 +96,11 @@ def __init__(self, config: DeepgramClientOptions):
9696
channels = self._config.options.get("speaker_playback_channels")
9797
if channels is None:
9898
channels = CHANNELS
99+
playback_delta_in_ms = self._config.options.get(
100+
"speaker_playback_delta_in_ms"
101+
)
102+
if playback_delta_in_ms is None:
103+
playback_delta_in_ms = PLAYBACK_DELTA
99104
device_index = self._config.options.get("speaker_playback_device_index")
100105

101106
self._logger.debug("rate: %s", rate)
@@ -106,13 +111,15 @@ def __init__(self, config: DeepgramClientOptions):
106111
self._speaker = Speaker(
107112
rate=rate,
108113
channels=channels,
114+
last_play_delta_in_ms=playback_delta_in_ms,
109115
verbose=self._config.verbose,
110116
output_device_index=device_index,
111117
)
112118
else:
113119
self._speaker = Speaker(
114120
rate=rate,
115121
channels=channels,
122+
last_play_delta_in_ms=playback_delta_in_ms,
116123
verbose=self._config.verbose,
117124
)
118125

@@ -589,6 +596,20 @@ def clear(self) -> bool:
589596

590597
return True
591598

599+
def wait_for_complete(self):
600+
"""
601+
This method will block until the speaker is done playing sound.
602+
"""
603+
self._logger.spam("SpeakWebSocketClient.wait_for_complete ENTER")
604+
605+
if self._speaker is None:
606+
self._logger.error("speaker is None. Return immediately")
607+
return
608+
609+
self._speaker.wait_for_complete()
610+
self._logger.notice("wait_for_complete succeeded")
611+
self._logger.spam("SpeakWebSocketClient.wait_for_complete LEAVE")
612+
592613
def _close_message(self) -> bool:
593614
return self.send_control(SpeakWebSocketMessage.Close)
594615

examples/text-to-speech/websocket/async_complete/main.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,11 @@ async def main():
3232

3333
# example of setting up a client config. logging values: WARNING, VERBOSE, DEBUG, SPAM
3434
config: DeepgramClientOptions = DeepgramClientOptions(
35-
options={"auto_flush_speak_delta": "500", "speaker_playback": "true"},
36-
# verbose=verboselogs.SPAM,
35+
options={
36+
# "auto_flush_speak_delta": "500",
37+
"speaker_playback": "true"
38+
},
39+
verbose=verboselogs.SPAM,
3740
)
3841
deepgram: DeepgramClient = DeepgramClient("", config)
3942

@@ -99,11 +102,12 @@ async def on_unhandled(self, unhandled, **kwargs):
99102

100103
# send the text to Deepgram
101104
await dg_connection.send_text(TTS_TEXT)
105+
102106
# if auto_flush_speak_delta is not used, you must flush the connection by calling flush()
103107
await dg_connection.flush()
104108

105109
# Wait until the speaker has finished playing the audio
106-
await asyncio.sleep(7)
110+
await dg_connection.wait_for_complete()
107111

108112
# Close the connection
109113
await dg_connection.finish()

examples/text-to-speech/websocket/complete/main.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,13 @@ def on_unhandled(self, unhandled, **kwargs):
9393

9494
# send the text to Deepgram
9595
dg_connection.send_text(TTS_TEXT)
96+
9697
# if auto_flush_speak_delta is not used, you must flush the connection by calling flush()
9798
dg_connection.flush()
9899

99100
# Wait until the speaker has finished playing the audio
100-
time.sleep(5)
101+
dg_connection.wait_for_complete()
102+
101103
print("\n\nPress Enter to stop...\n\n")
102104
input()
103105

0 commit comments

Comments
 (0)