Stream audio in 4096-sample sub-chunks for immediate HA playback
All checks were successful
Build ROCm Image / build (push) Successful in 4m20s

Previously the entire synthesized audio for a sentence was sent as one
AudioChunk event. HA buffered it until it arrived in full, so playback did
not start until synthesis was complete. Splitting into 4096-sample chunks
lets HA begin playing as data arrives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Committed: 2026-04-05 13:25:54 -04:00
parent 514bbad0e9
commit a8e3e62dbc

View File

@@ -135,21 +135,26 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
continue
audio_np = audio_tensor.cpu().numpy().squeeze()
audio_bytes = (audio_np * 32767).clip(-32768, 32767).astype(np.int16).tobytes()
audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
if first_chunk:
ttfa = time.monotonic() - start_time
logger.info(f"Time to first audio: {ttfa:.3f}s")
first_chunk = False
await self.write_event(
AudioChunk(
audio=audio_bytes,
rate=sample_rate,
width=2,
channels=1,
).event()
)
# Send in small sub-chunks so HA can begin playback immediately
# rather than waiting for the full audio blob to arrive.
audio_chunk_samples = 4096
for offset in range(0, len(audio_int16), audio_chunk_samples):
sub = audio_int16[offset : offset + audio_chunk_samples]
await self.write_event(
AudioChunk(
audio=sub.tobytes(),
rate=sample_rate,
width=2,
channels=1,
).event()
)
await self.write_event(AudioStop().event())
total = time.monotonic() - start_time