Stream audio in 4096-sample sub-chunks for immediate HA playback
All checks were successful
Build ROCm Image / build (push) Successful in 4m20s

Previously the entire synthesized audio for a sentence was sent as one
AudioChunk event. HA buffered it until it arrived in full, so playback did
not start until synthesis was complete. Splitting into 4096-sample chunks
lets HA begin playing as data arrives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Committed: 2026-04-05 13:25:54 -04:00
parent 514bbad0e9
commit a8e3e62dbc

View File

@@ -135,21 +135,26 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
continue
audio_np = audio_tensor.cpu().numpy().squeeze()
audio_bytes = (audio_np * 32767).clip(-32768, 32767).astype(np.int16).tobytes()
audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
if first_chunk:
ttfa = time.monotonic() - start_time
logger.info(f"Time to first audio: {ttfa:.3f}s")
first_chunk = False
await self.write_event(
AudioChunk(
audio=audio_bytes,
rate=sample_rate,
width=2,
channels=1,
).event()
)
# Send in small sub-chunks so HA can begin playback immediately
# rather than waiting for the full audio blob to arrive.
audio_chunk_samples = 4096
for offset in range(0, len(audio_int16), audio_chunk_samples):
sub = audio_int16[offset : offset + audio_chunk_samples]
await self.write_event(
AudioChunk(
audio=sub.tobytes(),
rate=sample_rate,
width=2,
channels=1,
).event()
)
await self.write_event(AudioStop().event())
total = time.monotonic() - start_time