Stream audio in 4096-sample sub-chunks for immediate HA playback

Previously, the entire synthesized audio for a sentence was sent as a single AudioChunk event. Home Assistant (HA) buffers each event until it has arrived in full, so playback didn't start until synthesis of the sentence was complete and the whole blob had been received. Splitting the audio into 4096-sample sub-chunks (8192 bytes each of 16-bit mono PCM) lets HA begin playing as data arrives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 13:25:54 -04:00
parent 514bbad0e9
commit a8e3e62dbc
1 changed file with 14 additions and 9 deletions
--- a/wyoming_handler.py
+++ b/wyoming_handler.py
@@ -135,21 +135,26 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
                continue
            audio_np = audio_tensor.cpu().numpy().squeeze()
-            audio_bytes = (audio_np * 32767).clip(-32768, 32767).astype(np.int16).tobytes()
+            audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
            if first_chunk:
                ttfa = time.monotonic() - start_time
                logger.info(f"Time to first audio: {ttfa:.3f}s")
                first_chunk = False
-            await self.write_event(
+            # Send in small sub-chunks so HA can begin playback immediately
-                AudioChunk(
+            # rather than waiting for the full audio blob to arrive.
-                    audio=audio_bytes,
+            audio_chunk_samples = 4096
-                    rate=sample_rate,
+            for offset in range(0, len(audio_int16), audio_chunk_samples):
-                    width=2,
+                sub = audio_int16[offset : offset + audio_chunk_samples]
-                    channels=1,
+                await self.write_event(
-                ).event()
+                    AudioChunk(
-            )
+                        audio=sub.tobytes(),
                        rate=sample_rate,
                        width=2,
                        channels=1,
                    ).event()
                )
        await self.write_event(AudioStop().event())
        total = time.monotonic() - start_time