Initial implementation: Chatterbox TTS with ROCm and Wyoming
All checks were successful
Build ROCm Image / build (push) Successful in 15m27s

Wyoming-only server built around the official chatterbox TTS model.
Includes ROCm/AMD GPU support, sentence-level streaming, config.yaml
management, and Gitea CI for container builds.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-05 09:51:09 -04:00
parent 4b15e44181
commit 16ea2853f5
12 changed files with 691 additions and 0 deletions

View File

@@ -0,0 +1,33 @@
# Gitea Actions workflow: build and push the ROCm container image on every
# push to main. (Indentation restored — the YAML structure was flattened.)
name: Build ROCm Image

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Gitea Registry
        uses: docker/login-action@v3
        with:
          registry: git.sdgarren.com
          username: ${{ secrets.REGISTRY_USERNAME }}
          password: ${{ secrets.REGISTRY_TOKEN }}

      - name: Build and Push
        uses: docker/build-push-action@v6
        with:
          context: .
          file: Dockerfile.rocm
          push: true
          # Tag with both a moving "latest" and the immutable commit SHA.
          tags: |
            git.sdgarren.com/scott/rocm-chatterbox-whisper:latest
            git.sdgarren.com/scott/rocm-chatterbox-whisper:${{ gitea.sha }}

16
.gitignore vendored Normal file
View File

@@ -0,0 +1,16 @@
__pycache__/
*.py[cod]
*.egg-info/
.env
# Voice and audio files (mount via volume)
voices/
reference_audio/
outputs/
# Model cache
hf_cache/
# Logs
logs/
*.log

43
Dockerfile.rocm Normal file
View File

@@ -0,0 +1,43 @@
# ROCm-enabled image for the Chatterbox Wyoming TTS server.
FROM rocm/dev-ubuntu-22.04:latest

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    HF_HOME=/app/hf_cache \
    PIP_NO_CACHE_DIR=1

# System deps: Python toolchain plus audio libraries (ffmpeg, libsndfile).
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    python3-dev \
    git \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Step 1: Install ROCm-compatible PyTorch stack first.
# This must happen before anything else to prevent pip from pulling CPU wheels.
COPY requirements-rocm-init.txt .
RUN pip3 install -r requirements-rocm-init.txt

# Step 2: Install remaining dependencies (pinned to avoid overwriting torch).
COPY requirements-rocm.txt .
RUN pip3 install -r requirements-rocm.txt

# Step 3: Install chatterbox with --no-deps so pip cannot replace ROCm torch.
RUN pip3 install --no-deps chatterbox-tts

# Application source
COPY engine.py config.py wyoming_handler.py wyoming_voices.py main.py ./

# Default config (can be overridden by volume mount)
COPY config.yaml .

# Create default directories
RUN mkdir -p voices reference_audio hf_cache

# Wyoming protocol TCP port.
EXPOSE 10200

CMD ["python3", "main.py"]

115
config.py Normal file
View File

@@ -0,0 +1,115 @@
import logging
import os
from pathlib import Path
from typing import Optional
import yaml
logger = logging.getLogger(__name__)

# Module-level configuration state, populated by load_config().
_config: dict = {}
# Config file location; override with the CONFIG_PATH environment variable.
_config_path = Path(os.environ.get("CONFIG_PATH", "config.yaml"))

# Built-in defaults. User config.yaml values are deep-merged on top of these,
# so every getter below has a sane fallback even with no config file present.
DEFAULTS = {
    "model": {
        "repo_id": "chatterbox-turbo",
    },
    "tts_engine": {
        "device": "",  # empty = auto-detect
        "predefined_voices_path": "voices",
        "reference_audio_path": "reference_audio",
        "default_voice_id": "default.wav",
    },
    "generation_defaults": {
        "temperature": 0.8,
        "exaggeration": 0.5,
        "cfg_weight": 0.5,
        "seed": 0,
    },
    "wyoming": {
        "host": "0.0.0.0",
        "port": 10200,
        "chunk_size": 300,
    },
    "paths": {
        "model_cache": "/app/hf_cache",
    },
}
def _deep_merge(base: dict, override: dict) -> dict:
result = base.copy()
for key, value in override.items():
if key in result and isinstance(result[key], dict) and isinstance(value, dict):
result[key] = _deep_merge(result[key], value)
else:
result[key] = value
return result
def load_config() -> None:
    """Load config.yaml (if present) over DEFAULTS into the module-level _config.

    Also propagates paths.model_cache into HF_HOME (without clobbering an
    existing environment value) so HuggingFace model downloads are cached
    where the container expects them.
    """
    global _config
    # Start from the defaults.
    # NOTE(review): _deep_merge(DEFAULTS, {}) is effectively a shallow copy,
    # so un-overridden nested dicts are shared with DEFAULTS — fine as long
    # as _config is never mutated in place.
    _config = _deep_merge(DEFAULTS, {})
    if _config_path.exists():
        try:
            with open(_config_path) as f:
                user_config = yaml.safe_load(f) or {}
            _config = _deep_merge(_config, user_config)
            logger.info(f"Loaded config from {_config_path}")
        except Exception:
            # Malformed/unreadable config is non-fatal: run on defaults.
            logger.exception(f"Failed to load {_config_path}, using defaults")
    else:
        logger.warning(f"Config file not found at {_config_path}, using defaults")
    # Set HuggingFace cache path from config
    cache_path = _config.get("paths", {}).get("model_cache", "")
    if cache_path:
        os.environ.setdefault("HF_HOME", cache_path)
def _lookup(section: str, key: str, default=None):
    """Read _config[section][key], returning *default* when absent."""
    return _config.get(section, {}).get(key, default)


def get_model_repo_id() -> str:
    """Model identifier to load: 'chatterbox' or 'chatterbox-turbo'."""
    return _lookup("model", "repo_id", "chatterbox-turbo")


def get_device_override() -> Optional[str]:
    """Explicit device from config, or None (empty string) for auto-detect."""
    return _lookup("tts_engine", "device") or None


def get_predefined_voices_path() -> Path:
    """Directory containing curated voice prompt files."""
    return Path(_lookup("tts_engine", "predefined_voices_path", "voices"))


def get_reference_audio_path() -> Path:
    """Directory containing user-supplied reference audio files."""
    return Path(_lookup("tts_engine", "reference_audio_path", "reference_audio"))


def get_default_voice_id() -> str:
    """Voice file used when a client does not request a specific voice."""
    return _lookup("tts_engine", "default_voice_id", "default.wav")


def get_wyoming_host() -> str:
    """Interface the Wyoming TCP server binds to."""
    return _lookup("wyoming", "host", "0.0.0.0")


def get_wyoming_port() -> int:
    """TCP port the Wyoming server listens on."""
    return int(_lookup("wyoming", "port", 10200))


def get_wyoming_chunk_size() -> int:
    """Maximum characters per synthesis chunk (sentence-boundary split)."""
    return int(_lookup("wyoming", "chunk_size", 300))
def _gen_defaults() -> dict:
    """The generation_defaults section of the loaded config."""
    return _config.get("generation_defaults", {})


def get_gen_temperature() -> float:
    """Sampling temperature (used by the turbo model)."""
    return float(_gen_defaults().get("temperature", 0.8))


def get_gen_exaggeration() -> float:
    """Expressiveness control (used by the standard model)."""
    return float(_gen_defaults().get("exaggeration", 0.5))


def get_gen_cfg_weight() -> float:
    """Classifier-free-guidance weight (used by the standard model)."""
    return float(_gen_defaults().get("cfg_weight", 0.5))


def get_gen_seed() -> int:
    """RNG seed; 0 means random output on every call."""
    return int(_gen_defaults().get("seed", 0))

29
config.yaml Normal file
View File

@@ -0,0 +1,29 @@
# Runtime configuration for the Chatterbox Wyoming server.
# (Indentation restored — the YAML nesting was flattened.)
model:
  # Options: chatterbox, chatterbox-turbo
  repo_id: chatterbox-turbo

tts_engine:
  # Device: cuda, cpu, or leave empty for auto-detect
  device: ""
  predefined_voices_path: voices
  reference_audio_path: reference_audio
  # Fallback voice (stem name, e.g. "default" matches default.wav)
  default_voice_id: default.wav

generation_defaults:
  # Turbo model: uses temperature only (exaggeration/cfg_weight ignored)
  # Standard model: uses exaggeration and cfg_weight (temperature ignored)
  temperature: 0.8
  exaggeration: 0.5
  cfg_weight: 0.5
  # seed: 0 = random each call, >0 = reproducible output
  seed: 0

wyoming:
  host: "0.0.0.0"
  port: 10200
  # Max characters per synthesis chunk (split at sentence boundaries)
  chunk_size: 300

paths:
  model_cache: /app/hf_cache

35
docker-compose.yml Normal file
View File

@@ -0,0 +1,35 @@
# Compose service for the ROCm Chatterbox Wyoming server.
# (Indentation restored — the YAML nesting was flattened.)
services:
  chatterbox-whisper:
    image: git.sdgarren.com/scott/rocm-chatterbox-whisper:latest
    build:
      context: .
      dockerfile: Dockerfile.rocm
    restart: unless-stopped
    ports:
      - "${WYOMING_PORT:-10200}:10200"
    # AMD GPU device nodes required by ROCm.
    devices:
      - /dev/kfd
      - /dev/dri
    group_add:
      - video
      - render
    ipc: host
    shm_size: 8g
    security_opt:
      - seccomp=unconfined
    volumes:
      - ./config.yaml:/app/config.yaml
      - ./voices:/app/voices
      - ./reference_audio:/app/reference_audio
      - hf_cache:/app/hf_cache
    environment:
      - HF_HUB_ENABLE_HF_TRANSFER=1
      # Set your GPU architecture:
      # 10.3.0 = RX 5000/6000 series
      # 11.0.0 = RX 7000 series
      # 9.0.6 = Vega
      - HSA_OVERRIDE_GFX_VERSION=10.3.0
      # - HF_TOKEN=your_token_here

volumes:
  hf_cache:

91
engine.py Normal file
View File

@@ -0,0 +1,91 @@
import logging
import torch
from typing import Optional, Tuple
logger = logging.getLogger(__name__)

# Module-level model state, populated by load_model().
chatterbox_model = None  # loaded Chatterbox(Turbo)TTS instance, or None
_sample_rate = 24000  # output audio sample rate in Hz
_is_turbo = False  # True when the turbo model variant is loaded
def _test_cuda() -> bool:
try:
if torch.cuda.is_available():
torch.zeros(1).cuda()
return True
except Exception:
pass
return False
def detect_device() -> str:
return "cuda" if _test_cuda() else "cpu"
def load_model() -> bool:
    """Load the Chatterbox model selected in config onto the chosen device.

    Sets the module globals chatterbox_model, _sample_rate and _is_turbo.
    Returns True on success, False on any failure (logged, not raised).
    """
    global chatterbox_model, _sample_rate, _is_turbo
    from config import get_model_repo_id, get_device_override

    device = get_device_override() or detect_device()
    repo_id = get_model_repo_id()
    logger.info(f"Loading model '{repo_id}' on device '{device}'")
    try:
        # Any repo id containing "turbo" selects the turbo variant.
        if "turbo" in repo_id.lower():
            from chatterbox.tts_turbo import ChatterboxTurboTTS
            chatterbox_model = ChatterboxTurboTTS.from_pretrained(device)
            _is_turbo = True
        else:
            from chatterbox.tts import ChatterboxTTS
            chatterbox_model = ChatterboxTTS.from_pretrained(device)
            _is_turbo = False
        # FIX: read the sample rate from the model instead of hard-coding it,
        # so the rate advertised over Wyoming matches the audio actually
        # produced. Falls back to the previous 24 kHz if the attribute is
        # missing.
        _sample_rate = int(getattr(chatterbox_model, "sr", 24000))
        logger.info("Model loaded successfully")
        return True
    except Exception:
        logger.exception("Failed to load model")
        return False
def get_sample_rate() -> int:
    """Sample rate in Hz of the audio produced by the loaded model."""
    return _sample_rate
def synthesize(
    text: str,
    audio_prompt_path: Optional[str] = None,
    exaggeration: float = 0.5,
    cfg_weight: float = 0.5,
    temperature: float = 0.8,
    seed: int = 0,
) -> Tuple[torch.Tensor, int]:
    """Generate speech audio for *text* with the loaded Chatterbox model.

    Args:
        text: Text to synthesize.
        audio_prompt_path: Optional reference-audio file for voice cloning.
        exaggeration: Expressiveness (standard model only; ignored by turbo).
        cfg_weight: CFG weight (standard model only; ignored by turbo).
        temperature: Sampling temperature (turbo model only).
        seed: >0 seeds the torch RNGs for reproducible output; 0 leaves
            generation random.

    Returns:
        (waveform tensor, sample rate in Hz). NOTE(review): the tensor's
        shape/device come straight from chatterbox's generate() — presumably
        a float waveform on the model device; confirm against the
        chatterbox API before relying on a specific layout.

    Raises:
        RuntimeError: if load_model() has not successfully run.
    """
    if chatterbox_model is None:
        raise RuntimeError("Model not loaded. Call load_model() first.")
    # Optional reproducibility: seed CPU and (if present) all GPU RNGs.
    if seed > 0:
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
    # Only forward the knobs the loaded model variant actually understands.
    kwargs: dict = {}
    if audio_prompt_path:
        kwargs["audio_prompt_path"] = audio_prompt_path
    if _is_turbo:
        kwargs["temperature"] = temperature
    else:
        kwargs["exaggeration"] = exaggeration
        kwargs["cfg_weight"] = cfg_weight
    with torch.inference_mode():
        wav = chatterbox_model.generate(text=text, **kwargs)
    # Release cached GPU memory between requests to keep VRAM usage flat.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
    return wav, _sample_rate

42
main.py Normal file
View File

@@ -0,0 +1,42 @@
import asyncio
import logging
import sys
from functools import partial
from wyoming.server import AsyncServer
import engine
from config import get_wyoming_host, get_wyoming_port, load_config
from wyoming_handler import ChatterboxWyomingHandler
from wyoming_voices import create_wyoming_info, load_voices
# Log to stdout so container runtimes capture everything.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stdout,
)
logger = logging.getLogger(__name__)


async def main() -> None:
    """Entry point: load config and model, then serve the Wyoming protocol.

    Exits the process with status 1 if the TTS model fails to load.
    """
    load_config()
    logger.info("Loading TTS model...")
    if not engine.load_model():
        logger.error("Failed to load model, exiting")
        sys.exit(1)
    # Discover voice prompt files and advertise them via Wyoming Info.
    voices = load_voices()
    wyoming_info = create_wyoming_info(engine.get_sample_rate(), voices)
    host = get_wyoming_host()
    port = get_wyoming_port()
    uri = f"tcp://{host}:{port}"
    logger.info(f"Starting Wyoming server on {uri}")
    server = AsyncServer.from_uri(uri)
    # A new handler instance is constructed for each client connection.
    await server.run(partial(ChatterboxWyomingHandler, wyoming_info, voices))


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,5 @@
--index-url https://download.pytorch.org/whl/rocm6.1
torch==2.5.1
torchaudio==2.5.1
torchvision==0.20.1
pytorch_triton_rocm==3.1.0

17
requirements-rocm.txt Normal file
View File

@@ -0,0 +1,17 @@
# Audio processing
numpy
soundfile
librosa
# ML dependencies (pinned to match chatterbox without overwriting ROCm torch)
transformers==4.46.3
diffusers==0.29.0
safetensors
huggingface-hub
accelerate
# Wyoming protocol
wyoming>=1.5.4
# Config / utilities
PyYAML>=6.0

166
wyoming_handler.py Normal file
View File

@@ -0,0 +1,166 @@
import asyncio
import logging
import time
from typing import Dict, Optional
import numpy as np
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Describe, Info
from wyoming.server import AsyncEventHandler
from wyoming.tts import Synthesize, SynthesizeChunk, SynthesizeStart, SynthesizeStopped, SynthesizeStop
import engine
from config import (
get_gen_cfg_weight,
get_gen_exaggeration,
get_gen_seed,
get_gen_temperature,
get_wyoming_chunk_size,
)
from wyoming_voices import resolve_voice
logger = logging.getLogger(__name__)
def _split_text(text: str, max_chunk: int = 300) -> list[str]:
"""Split text at sentence boundaries to keep chunks under max_chunk chars."""
import re
# Split on sentence-ending punctuation
sentences = re.split(r"(?<=[.!?])\s+", text.strip())
chunks = []
current = ""
for sentence in sentences:
if not sentence:
continue
if current and len(current) + 1 + len(sentence) > max_chunk:
chunks.append(current.strip())
current = sentence
else:
current = (current + " " + sentence).strip() if current else sentence
if current:
chunks.append(current.strip())
return chunks or [text]
class ChatterboxWyomingHandler(AsyncEventHandler):
    """Per-connection Wyoming event handler for Chatterbox TTS.

    Supports both the legacy one-shot `synthesize` event and the streaming
    `synthesize-start` / `synthesize-chunk` / `synthesize-stop` flow. Audio
    is generated per sentence chunk and streamed back as 16-bit mono PCM.
    """

    def __init__(
        self,
        wyoming_info: Info,
        voices: Dict[str, str],
        *args,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self._info = wyoming_info
        self._voices = voices
        # Streaming state: text accumulated between start and stop events.
        self._streaming = False
        self._streaming_text = ""
        self._streaming_voice: Optional[str] = None

    async def handle_event(self, event: Event) -> bool:
        """Dispatch one Wyoming event; always returns True to stay connected."""
        if Describe.is_type(event.type):
            await self.write_event(self._info.event())
            return True
        if SynthesizeStart.is_type(event.type):
            start = SynthesizeStart.from_event(event)
            self._streaming = True
            self._streaming_text = ""
            self._streaming_voice = start.voice.name if start.voice else None
            return True
        if SynthesizeChunk.is_type(event.type):
            chunk = SynthesizeChunk.from_event(event)
            if self._streaming:
                self._streaming_text += chunk.text
            return True
        if SynthesizeStop.is_type(event.type):
            if self._streaming and self._streaming_text:
                await self._synthesize_and_stream(
                    self._streaming_text,
                    self._streaming_voice,
                )
            self._streaming = False
            self._streaming_text = ""
            self._streaming_voice = None
            await self.write_event(SynthesizeStopped().event())
            return True
        if Synthesize.is_type(event.type):
            # FIX: streaming clients also send a legacy `synthesize` event
            # carrying the full text (for servers without streaming support).
            # Ignore it while a stream is active, otherwise the same text
            # would be synthesized twice.
            if self._streaming:
                return True
            synth = Synthesize.from_event(event)
            voice_name = synth.voice.name if synth.voice else None
            await self._synthesize_and_stream(synth.text, voice_name)
            await self.write_event(SynthesizeStopped().event())
            return True
        return True

    async def _synthesize_and_stream(self, text: str, voice_name: Optional[str]) -> None:
        """Synthesize *text* chunk-by-chunk, emitting audio-start/chunk/stop."""
        audio_prompt = resolve_voice(voice_name, self._voices)
        sample_rate = engine.get_sample_rate()
        chunk_size = get_wyoming_chunk_size()
        chunks = _split_text(text, max_chunk=chunk_size)
        logger.info(
            f"Synthesizing {len(chunks)} chunk(s) for voice='{voice_name}', "
            f"prompt='{audio_prompt}'"
        )
        await self.write_event(
            AudioStart(rate=sample_rate, width=2, channels=1).event()
        )
        first_chunk = True
        start_time = time.monotonic()
        for i, chunk_text in enumerate(chunks):
            logger.debug(f"Chunk {i+1}/{len(chunks)}: {chunk_text[:60]!r}")
            try:
                # Run blocking model inference in a worker thread so the
                # event loop keeps servicing the connection.
                # FIX: get_running_loop() — get_event_loop() inside a running
                # coroutine is deprecated since Python 3.10.
                audio_tensor, sr = await asyncio.get_running_loop().run_in_executor(
                    None,
                    self._synthesize_chunk,
                    chunk_text,
                    audio_prompt,
                )
            except Exception:
                # Best-effort: skip a failed chunk rather than killing the stream.
                logger.exception(f"Synthesis failed for chunk {i+1}")
                continue
            # Convert model output (assumed float in [-1, 1] — matches the
            # 32767 scaling below) to 16-bit signed PCM bytes.
            audio_np = audio_tensor.cpu().numpy().squeeze()
            audio_bytes = (audio_np * 32767).clip(-32768, 32767).astype(np.int16).tobytes()
            if first_chunk:
                ttfa = time.monotonic() - start_time
                logger.info(f"Time to first audio: {ttfa:.3f}s")
                first_chunk = False
            await self.write_event(
                AudioChunk(
                    audio=audio_bytes,
                    rate=sample_rate,
                    width=2,
                    channels=1,
                ).event()
            )
        await self.write_event(AudioStop().event())
        total = time.monotonic() - start_time
        logger.info(f"Synthesis complete in {total:.3f}s")

    def _synthesize_chunk(self, text: str, audio_prompt: Optional[str]):
        """Blocking wrapper around engine.synthesize with configured defaults."""
        return engine.synthesize(
            text=text,
            audio_prompt_path=audio_prompt,
            temperature=get_gen_temperature(),
            exaggeration=get_gen_exaggeration(),
            cfg_weight=get_gen_cfg_weight(),
            seed=get_gen_seed(),
        )

99
wyoming_voices.py Normal file
View File

@@ -0,0 +1,99 @@
import logging
from pathlib import Path
from typing import Dict, Optional
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice, TtsVoiceSpeaker
logger = logging.getLogger(__name__)
VOICE_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg"}
def load_voices() -> Dict[str, str]:
    """Scan voice directories and return {voice_name: file_path} mapping.

    Voice names are file stems. Registration is first-wins, so scan order
    determines collision priority.
    """
    from config import get_predefined_voices_path, get_reference_audio_path

    voices: Dict[str, str] = {}

    def _scan_dir(directory: Path) -> None:
        # Register every supported audio file, keeping the first occurrence
        # of each stem.
        if not directory.exists():
            return
        for f in sorted(directory.iterdir()):
            if f.suffix.lower() in VOICE_EXTENSIONS:
                name = f.stem
                if name not in voices:
                    voices[name] = str(f)

    # FIX: scan predefined voices FIRST. Registration above is first-wins,
    # so the previous order (reference audio first) made reference audio win
    # name collisions — the opposite of the stated intent that predefined
    # voices take priority.
    _scan_dir(get_predefined_voices_path())
    _scan_dir(get_reference_audio_path())
    logger.info(f"Discovered {len(voices)} voice(s): {list(voices.keys())}")
    return voices
def resolve_voice(voice_name: Optional[str], voices: Dict[str, str]) -> Optional[str]:
    """Map a requested voice name to an audio file path, with fallbacks.

    Resolution order: configured default (when no name given) -> exact match
    in *voices* -> predefined-voices dir probed with known extensions ->
    reference-audio dir probed the same way -> any discovered voice at all.
    Returns None only when nothing is available.
    """
    from config import (
        get_default_voice_id,
        get_predefined_voices_path,
        get_reference_audio_path,
    )

    # No voice requested: use the configured default's stem.
    if not voice_name:
        voice_name = Path(get_default_voice_id()).stem
    # 1) Exact match among discovered voices.
    if voice_name in voices:
        return voices[voice_name]
    # 2) Probe both directories for the name with every known extension,
    #    predefined voices first.
    for directory in (get_predefined_voices_path(), get_reference_audio_path()):
        for ext in VOICE_EXTENSIONS:
            candidate = directory / f"{voice_name}{ext}"
            if candidate.exists():
                return str(candidate)
    # 3) Last resort: fall back to any discovered voice.
    if voices:
        fallback = next(iter(voices.values()))
        logger.warning(f"Voice '{voice_name}' not found, falling back to '{fallback}'")
        return fallback
    return None
def create_wyoming_info(sample_rate: int, voices: Dict[str, str]) -> Info:
    """Build the Wyoming Info object advertised to Home Assistant.

    One TtsVoice entry is created per discovered voice name, in sorted
    order. (sample_rate is accepted for interface stability but not used
    in the advertised metadata.)
    """
    def _voice_entry(name: str) -> TtsVoice:
        # Each voice carries its own attribution pointing at upstream.
        return TtsVoice(
            name=name,
            description=f"Chatterbox voice: {name}",
            attribution=Attribution(
                name="ResembleAI",
                url="https://github.com/resemble-ai/chatterbox",
            ),
            installed=True,
            languages=["en"],
            speakers=[TtsVoiceSpeaker(name=name)],
        )

    program = TtsProgram(
        name="chatterbox",
        description="Chatterbox TTS with ROCm/AMD GPU support",
        attribution=Attribution(
            name="ResembleAI",
            url="https://github.com/resemble-ai/chatterbox",
        ),
        installed=True,
        voices=[_voice_entry(name) for name in sorted(voices)],
        version="1.0",
    )
    return Info(tts=[program])