From a8e3e62dbce7acecc77e1da3f3e5611baf4e9be2 Mon Sep 17 00:00:00 2001
From: scott <scott@sdgarren.com>
Date: Sun, 5 Apr 2026 13:25:54 -0400
Subject: [PATCH] Stream audio in 4096-sample sub-chunks for immediate HA
 playback

Previously the entire synthesized audio for a sentence was sent as one
AudioChunk event. Home Assistant (HA) buffers an event until it arrives
in full, so playback could not start until the whole audio blob for the
sentence had been received. Splitting the audio into sub-chunks of at
most 4096 samples (8 KiB of 16-bit mono PCM) lets HA begin playing as
data arrives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 wyoming_handler.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/wyoming_handler.py b/wyoming_handler.py
index bcb96a7..3d573a4 100644
--- a/wyoming_handler.py
+++ b/wyoming_handler.py
@@ -135,21 +135,26 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
                 continue
 
             audio_np = audio_tensor.cpu().numpy().squeeze()
-            audio_bytes = (audio_np * 32767).clip(-32768, 32767).astype(np.int16).tobytes()
+            audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
 
             if first_chunk:
                 ttfa = time.monotonic() - start_time
                 logger.info(f"Time to first audio: {ttfa:.3f}s")
                 first_chunk = False
 
-            await self.write_event(
-                AudioChunk(
-                    audio=audio_bytes,
-                    rate=sample_rate,
-                    width=2,
-                    channels=1,
-                ).event()
-            )
+            # Send in small sub-chunks so HA can begin playback immediately
+            # rather than waiting for the full audio blob to arrive.
+            audio_chunk_samples = 4096
+            for offset in range(0, len(audio_int16), audio_chunk_samples):
+                sub = audio_int16[offset : offset + audio_chunk_samples]
+                await self.write_event(
+                    AudioChunk(
+                        audio=sub.tobytes(),
+                        rate=sample_rate,
+                        width=2,
+                        channels=1,
+                    ).event()
+                )
 
         await self.write_event(AudioStop().event())
         total = time.monotonic() - start_time