Initial implementation: Chatterbox TTS with ROCm and Wyoming
All checks were successful
Build ROCm Image / build (push) Successful in 15m27s

Wyoming-only server built around the official chatterbox TTS model.
Includes ROCm/AMD GPU support, sentence-level streaming, config.yaml
management, and Gitea CI for container builds.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-05 09:51:09 -04:00
parent 4b15e44181
commit 16ea2853f5
12 changed files with 691 additions and 0 deletions

99
wyoming_voices.py Normal file
View File

@@ -0,0 +1,99 @@
import logging
from pathlib import Path
from typing import Dict, Optional
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# File suffixes recognised as voice audio; compared against the lower-cased
# suffix of each candidate file, so entries here must be lower-case.
VOICE_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg"}
def load_voices() -> Dict[str, str]:
    """Scan the voice directories and return a ``{voice_name: file_path}`` mapping.

    The voice name is the file stem (file name without its extension). Only
    regular files whose lower-cased suffix is in ``VOICE_EXTENSIONS`` are
    considered; sub-directories are not descended into.

    Collision rules:
      * Within one directory, the first file in sorted order wins when the
        same stem exists with several extensions.
      * Across directories, a predefined voice overrides a reference-audio
        file with the same stem.

    Returns:
        Mapping of voice name to the file path as a string. Empty when neither
        directory exists or contains matching files.
    """
    from config import get_predefined_voices_path, get_reference_audio_path

    def _scan_dir(directory: Path) -> Dict[str, str]:
        """Return ``{stem: path}`` for the audio files directly inside *directory*."""
        found: Dict[str, str] = {}
        # is_dir() covers the "missing" case and also keeps iterdir() from
        # raising when the configured path points at a regular file.
        if not directory.is_dir():
            return found
        for entry in sorted(directory.iterdir()):
            # is_file() skips directories that merely carry an audio suffix.
            if entry.suffix.lower() in VOICE_EXTENSIONS and entry.is_file():
                # setdefault keeps the first (sorted) file for a given stem.
                found.setdefault(entry.stem, str(entry))
        return found

    # Reference audio is scanned first; update() then lets predefined voices
    # replace the path on a name collision while keeping the original
    # insertion position of the name.
    voices: Dict[str, str] = _scan_dir(get_reference_audio_path())
    voices.update(_scan_dir(get_predefined_voices_path()))

    logger.info("Discovered %d voice(s): %s", len(voices), list(voices))
    return voices
def resolve_voice(voice_name: Optional[str], voices: Dict[str, str]) -> Optional[str]:
    """Resolve a voice name to its audio file path.

    Lookup order:
      1. If *voice_name* is empty or ``None``, use the stem of the configured
         default voice id.
      2. Exact key match in *voices*.
      3. ``<voice_name><ext>`` in the predefined voices directory, then in the
         reference audio directory, trying extensions in sorted order.
      4. The first value in *voices*, with a warning logged.

    Args:
        voice_name: Requested voice name, without extension. May be ``None``.
        voices: Mapping of voice name to file path, as built by the voice scan.

    Returns:
        The resolved file path as a string, or ``None`` when nothing matches
        and *voices* is empty.
    """
    from config import get_predefined_voices_path, get_reference_audio_path, get_default_voice_id

    if not voice_name:
        default = get_default_voice_id()
        # Path(None) raises TypeError; treat an unset default as "no name" so
        # the lookup falls through to the fallback voice instead of crashing.
        voice_name = Path(default).stem if default else ""

    # Exact name match in discovered voices
    if voice_name in voices:
        return voices[voice_name]

    if voice_name:
        # Predefined voices are probed before reference audio. The getters are
        # called lazily so the second directory is only looked up when needed.
        for get_directory in (get_predefined_voices_path, get_reference_audio_path):
            directory = get_directory()
            # sorted(): a set of strings iterates in hash-randomised order,
            # which would make the pick unstable across runs when the same
            # stem exists with several extensions.
            for ext in sorted(VOICE_EXTENSIONS):
                candidate = directory / f"{voice_name}{ext}"
                if candidate.is_file():
                    return str(candidate)

    # Fall back to any voice
    if voices:
        fallback = next(iter(voices.values()))
        logger.warning("Voice '%s' not found, falling back to '%s'", voice_name, fallback)
        return fallback
    return None
def create_wyoming_info(sample_rate: int, voices: Dict[str, str]) -> Info:
    """Assemble the Wyoming ``Info`` object advertised to Home Assistant.

    One ``TtsVoice`` is produced per key of *voices*, in sorted name order,
    each marked installed, tagged with language ``"en"`` and carrying a single
    speaker named after the voice. All voices are attached to one
    ``TtsProgram`` named ``"chatterbox"``.

    Args:
        sample_rate: Accepted for the caller's convenience; not read here.
        voices: Mapping of voice name to file path. Only the keys are used.

    Returns:
        An ``Info`` whose ``tts`` list holds the single Chatterbox program.
    """

    def _attribution() -> Attribution:
        # A fresh instance per use, so no Attribution object is shared
        # between the program entry and the individual voices.
        return Attribution(
            name="ResembleAI",
            url="https://github.com/resemble-ai/chatterbox",
        )

    tts_voices = []
    for voice_name in sorted(voices):
        voice = TtsVoice(
            name=voice_name,
            description=f"Chatterbox voice: {voice_name}",
            attribution=_attribution(),
            installed=True,
            languages=["en"],
            speakers=[TtsVoiceSpeaker(name=voice_name)],
        )
        tts_voices.append(voice)

    program = TtsProgram(
        name="chatterbox",
        description="Chatterbox TTS with ROCm/AMD GPU support",
        attribution=_attribution(),
        installed=True,
        voices=tts_voices,
        version="1.0",
    )
    return Info(tts=[program])