Initial implementation: Chatterbox TTS with ROCm and Wyoming

Wyoming-only server built around the official chatterbox TTS model. Includes ROCm/AMD GPU support, sentence-level streaming, config.yaml management, and Gitea CI for container builds. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 09:51:09 -04:00
parent 4b15e44181
commit 16ea2853f5
12 changed files with 691 additions and 0 deletions
--- a/wyoming_voices.py
+++ b/wyoming_voices.py
@@ -0,0 +1,99 @@
+import logging
+from pathlib import Path
+from typing import Dict, Optional
+
+from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
+
+logger = logging.getLogger(__name__)
+
+VOICE_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg"}
+
+
+def load_voices() -> Dict[str, str]:
+    """Scan voice directories and return {voice_name: file_path} mapping."""
+    from config import get_predefined_voices_path, get_reference_audio_path
+
+    voices: Dict[str, str] = {}
+
+    def _scan_dir(directory: Path) -> None:
+        if not directory.exists():
+            return
+        for f in sorted(directory.iterdir()):
+            if f.suffix.lower() in VOICE_EXTENSIONS:
+                name = f.stem
+                if name not in voices:
+                    voices[name] = str(f)
+
+    # Reference audio first so predefined voices take priority on collision
+    _scan_dir(get_reference_audio_path())
+    _scan_dir(get_predefined_voices_path())
+
+    logger.info(f"Discovered {len(voices)} voice(s): {list(voices.keys())}")
+    return voices
+
+
+def resolve_voice(voice_name: Optional[str], voices: Dict[str, str]) -> Optional[str]:
+    """Resolve a voice name to its audio file path."""
+    from config import get_predefined_voices_path, get_reference_audio_path, get_default_voice_id
+
+    if not voice_name:
+        default = get_default_voice_id()
+        voice_name = Path(default).stem
+
+    # Exact name match in discovered voices
+    if voice_name in voices:
+        return voices[voice_name]
+
+    # Try predefined voices dir with extensions
+    for ext in VOICE_EXTENSIONS:
+        p = get_predefined_voices_path() / f"{voice_name}{ext}"
+        if p.exists():
+            return str(p)
+
+    # Try reference audio dir with extensions
+    for ext in VOICE_EXTENSIONS:
+        p = get_reference_audio_path() / f"{voice_name}{ext}"
+        if p.exists():
+            return str(p)
+
+    # Fall back to any voice
+    if voices:
+        fallback = next(iter(voices.values()))
+        logger.warning(f"Voice '{voice_name}' not found, falling back to '{fallback}'")
+        return fallback
+
+    return None
+
+
+def create_wyoming_info(sample_rate: int, voices: Dict[str, str]) -> Info:
+    """Build the Wyoming Info object advertised to Home Assistant."""
+    tts_voices = [
+        TtsVoice(
+            name=name,
+            description=f"Chatterbox voice: {name}",
+            attribution=Attribution(
+                name="ResembleAI",
+                url="https://github.com/resemble-ai/chatterbox",
+            ),
+            installed=True,
+            languages=["en"],
+            speakers=[TtsVoiceSpeaker(name=name)],
+        )
+        for name in sorted(voices.keys())
+    ]
+
+    return Info(
+        tts=[
+            TtsProgram(
+                name="chatterbox",
+                description="Chatterbox TTS with ROCm/AMD GPU support",
+                attribution=Attribution(
+                    name="ResembleAI",
+                    url="https://github.com/resemble-ai/chatterbox",
+                ),
+                installed=True,
+                voices=tts_voices,
+                version="1.0",
+            )
+        ]
+    )