Fix Wyoming protocol: remove SynthesizeStopped from Synthesize path
Some checks failed
Build ROCm Image / build (push) Has been cancelled

The plain Synthesize event (HA's standard TTS path) should NOT be
followed by SynthesizeStopped. That event belongs only to the streaming
protocol (SynthesizeStart/Chunk/Stop). Sending it after Synthesize
confuses HA's Wyoming client, causing it to hang indefinitely.

Also:
- Guard Synthesize path against duplicate events during streaming
- Send audio as one AudioChunk per sentence (matches working reference)
- Remove numpy import (no longer needed)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-05 16:22:47 -04:00
parent 59731084cd
commit a196294d4a

View File

@@ -3,7 +3,6 @@ import logging
 import time
 from typing import Dict, Optional
-import numpy as np
 from wyoming.audio import AudioChunk, AudioStart, AudioStop
 from wyoming.event import Event
 from wyoming.info import Describe, Info
@@ -94,10 +93,14 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
             return True
         if Synthesize.is_type(event.type):
+            if self._streaming:
+                # Ignore duplicate Synthesize events sent alongside streaming protocol
+                return True
             synth = Synthesize.from_event(event)
             voice_name = synth.voice.name if synth.voice else None
             await self._synthesize_and_stream(synth.text, voice_name)
-            await self.write_event(SynthesizeStopped().event())
+            # NOTE: SynthesizeStopped is NOT sent here — it belongs only to the
+            # streaming protocol (SynthesizeStop path). Sending it here confuses HA.
             return True
         return True
@@ -134,22 +137,18 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
                 logger.exception(f"Synthesis failed for chunk {i+1}")
                 continue
-            audio_np = audio_tensor.cpu().numpy().squeeze()
-            audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
+            audio_bytes = (
+                audio_tensor.cpu().numpy().squeeze() * 32767
+            ).astype("int16").tobytes()
             if first_chunk:
                 ttfa = time.monotonic() - start_time
                 logger.info(f"Time to first audio: {ttfa:.3f}s")
                 first_chunk = False
-            # Send in small sub-chunks so HA can begin playback immediately
-            # rather than waiting for the full audio blob to arrive.
-            audio_chunk_samples = 4096
-            for offset in range(0, len(audio_int16), audio_chunk_samples):
-                sub = audio_int16[offset : offset + audio_chunk_samples]
-                await self.write_event(
-                    AudioChunk(
-                        audio=sub.tobytes(),
+            await self.write_event(
+                AudioChunk(
+                    audio=audio_bytes,
                     rate=sample_rate,
                     width=2,
                     channels=1,