Fix Wyoming protocol: remove SynthesizeStopped from Synthesize path
Some checks failed
Build ROCm Image / build (push) Has been cancelled

The plain Synthesize event (HA's standard TTS path) should NOT be
followed by SynthesizeStopped. That event belongs only to the streaming
protocol (SynthesizeStart/Chunk/Stop). Sending it after Synthesize
confuses HA's Wyoming client, causing it to hang indefinitely.

Also:
- Guard Synthesize path against duplicate events during streaming
- Send audio as one AudioChunk per sentence (matches working reference)
- Remove numpy import (no longer needed)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-05 16:22:47 -04:00
parent 59731084cd
commit a196294d4a

View File

@@ -3,7 +3,6 @@ import logging
 import time
 from typing import Dict, Optional
-import numpy as np
 from wyoming.audio import AudioChunk, AudioStart, AudioStop
 from wyoming.event import Event
 from wyoming.info import Describe, Info
@@ -94,10 +93,14 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
             return True
         if Synthesize.is_type(event.type):
+            if self._streaming:
+                # Ignore duplicate Synthesize events sent alongside streaming protocol
+                return True
             synth = Synthesize.from_event(event)
             voice_name = synth.voice.name if synth.voice else None
             await self._synthesize_and_stream(synth.text, voice_name)
-            await self.write_event(SynthesizeStopped().event())
+            # NOTE: SynthesizeStopped is NOT sent here — it belongs only to the
+            # streaming protocol (SynthesizeStop path). Sending it here confuses HA.
             return True
         return True
@@ -134,22 +137,18 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
                 logger.exception(f"Synthesis failed for chunk {i+1}")
                 continue
-            audio_np = audio_tensor.cpu().numpy().squeeze()
-            audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
+            audio_bytes = (
+                audio_tensor.cpu().numpy().squeeze() * 32767
+            ).astype("int16").tobytes()
             if first_chunk:
                 ttfa = time.monotonic() - start_time
                 logger.info(f"Time to first audio: {ttfa:.3f}s")
                 first_chunk = False
-            # Send in small sub-chunks so HA can begin playback immediately
-            # rather than waiting for the full audio blob to arrive.
-            audio_chunk_samples = 4096
-            for offset in range(0, len(audio_int16), audio_chunk_samples):
-                sub = audio_int16[offset : offset + audio_chunk_samples]
-                await self.write_event(
-                    AudioChunk(
-                        audio=sub.tobytes(),
+            await self.write_event(
+                AudioChunk(
+                    audio=audio_bytes,
                     rate=sample_rate,
                     width=2,
                     channels=1,