Fix Wyoming protocol: remove SynthesizeStopped from Synthesize path
Some checks failed
Build ROCm Image / build (push) Has been cancelled
The plain Synthesize event (HA's standard TTS path) should NOT be followed by SynthesizeStopped. That event belongs only to the streaming protocol (SynthesizeStart/Chunk/Stop). Sending it after Synthesize confuses HA's Wyoming client, causing it to hang indefinitely.

Also:
- Guard the Synthesize path against duplicate events during streaming
- Send audio as one AudioChunk per sentence (matches the working reference)
- Remove the numpy import (no longer needed)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,6 @@ import logging
|
|||||||
import time
|
import time
|
||||||
from typing import Dict, Optional
|
from typing import Dict, Optional
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
||||||
from wyoming.event import Event
|
from wyoming.event import Event
|
||||||
from wyoming.info import Describe, Info
|
from wyoming.info import Describe, Info
|
||||||
@@ -94,10 +93,14 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
if Synthesize.is_type(event.type):
|
if Synthesize.is_type(event.type):
|
||||||
|
if self._streaming:
|
||||||
|
# Ignore duplicate Synthesize events sent alongside streaming protocol
|
||||||
|
return True
|
||||||
synth = Synthesize.from_event(event)
|
synth = Synthesize.from_event(event)
|
||||||
voice_name = synth.voice.name if synth.voice else None
|
voice_name = synth.voice.name if synth.voice else None
|
||||||
await self._synthesize_and_stream(synth.text, voice_name)
|
await self._synthesize_and_stream(synth.text, voice_name)
|
||||||
await self.write_event(SynthesizeStopped().event())
|
# NOTE: SynthesizeStopped is NOT sent here — it belongs only to the
|
||||||
|
# streaming protocol (SynthesizeStop path). Sending it here confuses HA.
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return True
|
return True
|
||||||
@@ -134,22 +137,18 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
|
|||||||
logger.exception(f"Synthesis failed for chunk {i+1}")
|
logger.exception(f"Synthesis failed for chunk {i+1}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
audio_np = audio_tensor.cpu().numpy().squeeze()
|
audio_bytes = (
|
||||||
audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
|
audio_tensor.cpu().numpy().squeeze() * 32767
|
||||||
|
).astype("int16").tobytes()
|
||||||
|
|
||||||
if first_chunk:
|
if first_chunk:
|
||||||
ttfa = time.monotonic() - start_time
|
ttfa = time.monotonic() - start_time
|
||||||
logger.info(f"Time to first audio: {ttfa:.3f}s")
|
logger.info(f"Time to first audio: {ttfa:.3f}s")
|
||||||
first_chunk = False
|
first_chunk = False
|
||||||
|
|
||||||
# Send in small sub-chunks so HA can begin playback immediately
|
|
||||||
# rather than waiting for the full audio blob to arrive.
|
|
||||||
audio_chunk_samples = 4096
|
|
||||||
for offset in range(0, len(audio_int16), audio_chunk_samples):
|
|
||||||
sub = audio_int16[offset : offset + audio_chunk_samples]
|
|
||||||
await self.write_event(
|
await self.write_event(
|
||||||
AudioChunk(
|
AudioChunk(
|
||||||
audio=sub.tobytes(),
|
audio=audio_bytes,
|
||||||
rate=sample_rate,
|
rate=sample_rate,
|
||||||
width=2,
|
width=2,
|
||||||
channels=1,
|
channels=1,
|
||||||
|
|||||||
Reference in New Issue
Block a user