From a8e3e62dbce7acecc77e1da3f3e5611baf4e9be2 Mon Sep 17 00:00:00 2001
From: scott
Date: Sun, 5 Apr 2026 13:25:54 -0400
Subject: [PATCH] Stream audio in 4096-sample sub-chunks for immediate HA playback

Previously the entire synthesized audio for a sentence was sent as one
AudioChunk event. HA buffers until it arrives in full, so playback didn't
start until synthesis was complete. Splitting into 4096-sample chunks lets
HA begin playing as data arrives.

Co-Authored-By: Claude Sonnet 4.6
---
 wyoming_handler.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/wyoming_handler.py b/wyoming_handler.py
index bcb96a7..3d573a4 100644
--- a/wyoming_handler.py
+++ b/wyoming_handler.py
@@ -135,21 +135,26 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
                 continue

             audio_np = audio_tensor.cpu().numpy().squeeze()
-            audio_bytes = (audio_np * 32767).clip(-32768, 32767).astype(np.int16).tobytes()
+            audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)

             if first_chunk:
                 ttfa = time.monotonic() - start_time
                 logger.info(f"Time to first audio: {ttfa:.3f}s")
                 first_chunk = False

-            await self.write_event(
-                AudioChunk(
-                    audio=audio_bytes,
-                    rate=sample_rate,
-                    width=2,
-                    channels=1,
-                ).event()
-            )
+            # Send in small sub-chunks so HA can begin playback immediately
+            # rather than waiting for the full audio blob to arrive.
+            audio_chunk_samples = 4096
+            for offset in range(0, len(audio_int16), audio_chunk_samples):
+                sub = audio_int16[offset : offset + audio_chunk_samples]
+                await self.write_event(
+                    AudioChunk(
+                        audio=sub.tobytes(),
+                        rate=sample_rate,
+                        width=2,
+                        channels=1,
+                    ).event()
+                )

         await self.write_event(AudioStop().event())
         total = time.monotonic() - start_time