Fix Wyoming protocol: remove SynthesizeStopped from Synthesize path
Some checks failed
Build ROCm Image / build (push) Has been cancelled

The plain Synthesize event (HA's standard TTS path) should NOT be
followed by SynthesizeStopped. That event belongs only to the streaming
protocol (SynthesizeStart/Chunk/Stop). Sending it after Synthesize
confuses HA's Wyoming client, causing it to hang indefinitely.

Also:
- Guard Synthesize path against duplicate events during streaming
- Send audio as one AudioChunk per sentence (matches working reference)
- Remove numpy import (no longer needed)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-05 16:22:47 -04:00
parent 59731084cd
commit a196294d4a

View File

@@ -3,7 +3,6 @@ import logging
 import time
 from typing import Dict, Optional
-import numpy as np
 from wyoming.audio import AudioChunk, AudioStart, AudioStop
 from wyoming.event import Event
 from wyoming.info import Describe, Info
@@ -94,10 +93,14 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
             return True
         if Synthesize.is_type(event.type):
+            if self._streaming:
+                # Ignore duplicate Synthesize events sent alongside streaming protocol
+                return True
             synth = Synthesize.from_event(event)
             voice_name = synth.voice.name if synth.voice else None
             await self._synthesize_and_stream(synth.text, voice_name)
-            await self.write_event(SynthesizeStopped().event())
+            # NOTE: SynthesizeStopped is NOT sent here — it belongs only to the
+            # streaming protocol (SynthesizeStop path). Sending it here confuses HA.
             return True
         return True
@@ -134,22 +137,18 @@ class ChatterboxWyomingHandler(AsyncEventHandler):
                 logger.exception(f"Synthesis failed for chunk {i+1}")
                 continue
-            audio_np = audio_tensor.cpu().numpy().squeeze()
-            audio_int16 = (audio_np * 32767).clip(-32768, 32767).astype(np.int16)
+            audio_bytes = (
+                audio_tensor.cpu().numpy().squeeze() * 32767
+            ).astype("int16").tobytes()
             if first_chunk:
                 ttfa = time.monotonic() - start_time
                 logger.info(f"Time to first audio: {ttfa:.3f}s")
                 first_chunk = False
-            # Send in small sub-chunks so HA can begin playback immediately
-            # rather than waiting for the full audio blob to arrive.
-            audio_chunk_samples = 4096
-            for offset in range(0, len(audio_int16), audio_chunk_samples):
-                sub = audio_int16[offset : offset + audio_chunk_samples]
-                await self.write_event(
-                    AudioChunk(
-                        audio=sub.tobytes(),
+            await self.write_event(
+                AudioChunk(
+                    audio=audio_bytes,
                     rate=sample_rate,
                     width=2,
                     channels=1,