rocm-chatterbox-whisper/main.py

import asyncio
import logging
import sys
from functools import partial

from wyoming.server import AsyncServer

import engine
from config import get_wyoming_host, get_wyoming_port, load_config
from wyoming_handler import ChatterboxWyomingHandler
from wyoming_voices import create_wyoming_info, load_voices

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stdout,
)
logger = logging.getLogger(__name__)


def _warmup(voices: dict) -> None:
    from wyoming_voices import resolve_voice
    audio_prompt = resolve_voice(None, voices) if voices else None
    logger.info("Running warmup synthesis to populate MIOpen kernel cache...")
    try:
        engine.synthesize(text="Warmup.", audio_prompt_path=audio_prompt)
        logger.info("Warmup complete — MIOpen cache populated")
    except Exception:
        logger.warning("Warmup synthesis failed (non-fatal)", exc_info=True)


async def main() -> None:
    load_config()

    logger.info("Loading TTS model...")
    if not engine.load_model():
        logger.error("Failed to load model, exiting")
        sys.exit(1)

    voices = load_voices()
    wyoming_info = create_wyoming_info(engine.get_sample_rate(), voices)

    # Run a warmup synthesis before accepting connections so MIOpen benchmarks
    # and caches the best convolution algorithms for all layer shapes. Without
    # this, the first real HA request triggers benchmarking (hundreds of runs)
    # and times out before any audio is returned.
    _warmup(voices)

    host = get_wyoming_host()
    port = get_wyoming_port()
    uri = f"tcp://{host}:{port}"

    logger.info(f"Starting Wyoming server on {uri}")
    server = AsyncServer.from_uri(uri)
    await server.run(partial(ChatterboxWyomingHandler, wyoming_info, voices))


if __name__ == "__main__":
    asyncio.run(main())