diff --git a/wyoming_handler.py b/wyoming_handler.py
index 3d573a4..6a18d2f 100644
--- a/wyoming_handler.py
+++ b/wyoming_handler.py
@@ -3,7 +3,6 @@
 import logging
 import time
 from typing import Dict, Optional
-import numpy as np
 from wyoming.audio import AudioChunk, AudioStart, AudioStop
 from wyoming.event import Event
 from wyoming.info import Describe, Info
@@ -94,10 +93,14 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
             return True
 
         if Synthesize.is_type(event.type):
+            if self._streaming:
+                # Ignore duplicate Synthesize events sent alongside the streaming protocol.
+                return True
             synth = Synthesize.from_event(event)
             voice_name = synth.voice.name if synth.voice else None
             await self._synthesize_and_stream(synth.text, voice_name)
-            await self.write_event(SynthesizeStopped().event())
+            # NOTE: SynthesizeStopped is NOT sent here; it belongs only to the
+            # streaming protocol (SynthesizeStop path). Sending it here confuses HA.
             return True
 
         return True
@@ -134,27 +137,24 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
                 logger.exception(f"Synthesis failed for chunk {i+1}")
                 continue
 
-            audio_np = audio_tensor.cpu().numpy().squeeze()
-            audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
+            # Clip before converting so out-of-range samples can't wrap around.
+            audio_bytes = (
+                audio_tensor.cpu().numpy().squeeze() * 32767
+            ).clip(-32768, 32767).astype("int16").tobytes()
 
             if first_chunk:
                 ttfa = time.monotonic() - start_time
                 logger.info(f"Time to first audio: {ttfa:.3f}s")
                 first_chunk = False
 
-            # Send in small sub-chunks so HA can begin playback immediately
-            # rather than waiting for the full audio blob to arrive.
-            audio_chunk_samples = 4096
-            for offset in range(0, len(audio_int16), audio_chunk_samples):
-                sub = audio_int16[offset : offset + audio_chunk_samples]
-                await self.write_event(
-                    AudioChunk(
-                        audio=sub.tobytes(),
-                        rate=sample_rate,
-                        width=2,
-                        channels=1,
-                    ).event()
-                )
+            await self.write_event(
+                AudioChunk(
+                    audio=audio_bytes,
+                    rate=sample_rate,
+                    width=2,
+                    channels=1,
+                ).event()
+            )
 
         await self.write_event(AudioStop().event())
         total = time.monotonic() - start_time