Fix Wyoming protocol: remove SynthesizeStopped from Synthesize path
Some checks failed
Build ROCm Image / build (push) Has been cancelled
The plain Synthesize event (HA's standard TTS path) should NOT be followed by
SynthesizeStopped. That event belongs only to the streaming protocol
(SynthesizeStart/Chunk/Stop). Sending it after Synthesize confuses HA's Wyoming
client, causing it to hang indefinitely.

Also:
- Guard Synthesize path against duplicate events during streaming
- Send audio as one AudioChunk per sentence (matches working reference)
- Remove numpy import (no longer needed)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,6 @@ import logging
 import time
 from typing import Dict, Optional

-import numpy as np
 from wyoming.audio import AudioChunk, AudioStart, AudioStop
 from wyoming.event import Event
 from wyoming.info import Describe, Info
@@ -94,10 +93,14 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
             return True

         if Synthesize.is_type(event.type):
+            if self._streaming:
+                # Ignore duplicate Synthesize events sent alongside streaming protocol
+                return True
             synth = Synthesize.from_event(event)
             voice_name = synth.voice.name if synth.voice else None
             await self._synthesize_and_stream(synth.text, voice_name)
-            await self.write_event(SynthesizeStopped().event())
+            # NOTE: SynthesizeStopped is NOT sent here — it belongs only to the
+            # streaming protocol (SynthesizeStop path). Sending it here confuses HA.
             return True

         return True
@@ -134,22 +137,18 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
                     logger.exception(f"Synthesis failed for chunk {i+1}")
                     continue

-                audio_np = audio_tensor.cpu().numpy().squeeze()
-                audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
+                audio_bytes = (
+                    audio_tensor.cpu().numpy().squeeze() * 32767
+                ).astype("int16").tobytes()

                 if first_chunk:
                     ttfa = time.monotonic() - start_time
                     logger.info(f"Time to first audio: {ttfa:.3f}s")
                     first_chunk = False

-                # Send in small sub-chunks so HA can begin playback immediately
-                # rather than waiting for the full audio blob to arrive.
-                audio_chunk_samples = 4096
-                for offset in range(0, len(audio_int16), audio_chunk_samples):
-                    sub = audio_int16[offset : offset + audio_chunk_samples]
-                    await self.write_event(
-                        AudioChunk(
-                            audio=sub.tobytes(),
+                await self.write_event(
+                    AudioChunk(
+                        audio=audio_bytes,
                         rate=sample_rate,
                         width=2,
                         channels=1,
||||
Reference in New Issue
Block a user