Stream audio in 4096-sample sub-chunks for immediate HA playback
All checks were successful
Build ROCm Image / build (push) Successful in 4m20s

Previously the entire synthesized audio for a sentence was sent as one
AudioChunk event. Home Assistant (HA) buffers each AudioChunk until it has
arrived in full, so playback didn't start until the whole sentence had been
synthesized and sent. Splitting the audio into 4096-sample sub-chunks lets
HA begin playing as data arrives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-05 13:25:54 -04:00
parent 514bbad0e9
commit a8e3e62dbc

View File

@@ -135,21 +135,26 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
continue continue
audio_np = audio_tensor.cpu().numpy().squeeze() audio_np = audio_tensor.cpu().numpy().squeeze()
audio_bytes = (audio_np * 32767).clip(-32768, 32767).astype(np.int16).tobytes() audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
if first_chunk: if first_chunk:
ttfa = time.monotonic() - start_time ttfa = time.monotonic() - start_time
logger.info(f"Time to first audio: {ttfa:.3f}s") logger.info(f"Time to first audio: {ttfa:.3f}s")
first_chunk = False first_chunk = False
await self.write_event( # Send in small sub-chunks so HA can begin playback immediately
AudioChunk( # rather than waiting for the full audio blob to arrive.
audio=audio_bytes, audio_chunk_samples = 4096
rate=sample_rate, for offset in range(0, len(audio_int16), audio_chunk_samples):
width=2, sub = audio_int16[offset : offset + audio_chunk_samples]
channels=1, await self.write_event(
).event() AudioChunk(
) audio=sub.tobytes(),
rate=sample_rate,
width=2,
channels=1,
).event()
)
await self.write_event(AudioStop().event()) await self.write_event(AudioStop().event())
total = time.monotonic() - start_time total = time.monotonic() - start_time