Stream audio in 4096-sample sub-chunks for immediate HA playback
All checks were successful
Build ROCm Image / build (push) Successful in 4m20s

Previously the entire synthesized audio for a sentence was sent as one
AudioChunk event. HA buffers each event until it has arrived in full, so
playback of a sentence didn't start until that sentence's synthesis was
complete. Splitting the audio into 4096-sample sub-chunks lets HA begin
playing as data arrives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-05 13:25:54 -04:00
parent 514bbad0e9
commit a8e3e62dbc

View File

@@ -135,16 +135,21 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
continue continue
audio_np = audio_tensor.cpu().numpy().squeeze() audio_np = audio_tensor.cpu().numpy().squeeze()
audio_bytes = (audio_np * 32767).clip(-32768, 32767).astype(np.int16).tobytes() audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
if first_chunk: if first_chunk:
ttfa = time.monotonic() - start_time ttfa = time.monotonic() - start_time
logger.info(f"Time to first audio: {ttfa:.3f}s") logger.info(f"Time to first audio: {ttfa:.3f}s")
first_chunk = False first_chunk = False
# Send in small sub-chunks so HA can begin playback immediately
# rather than waiting for the full audio blob to arrive.
audio_chunk_samples = 4096
for offset in range(0, len(audio_int16), audio_chunk_samples):
sub = audio_int16[offset : offset + audio_chunk_samples]
await self.write_event( await self.write_event(
AudioChunk( AudioChunk(
audio=audio_bytes, audio=sub.tobytes(),
rate=sample_rate, rate=sample_rate,
width=2, width=2,
channels=1, channels=1,