From bdde4a2480034b2e6a8dabd7a841a2522793bd2b Mon Sep 17 00:00:00 2001 From: scott Date: Sun, 5 Apr 2026 13:48:41 -0400 Subject: [PATCH] Add startup warmup and make fp16 autocast fault-tolerant Warmup: run a synthesis before accepting Wyoming connections so MIOpen benchmarks and caches all conv layer shapes. Without this, the first HA request triggers hundreds of benchmark runs and times out. fp16: wrap in try/except so a failed autocast retries in fp32 rather than dropping the request silently. Co-Authored-By: Claude Sonnet 4.6 --- engine.py | 9 +++++++-- main.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/engine.py b/engine.py index b1f6cbb..0b951d2 100644 --- a/engine.py +++ b/engine.py @@ -122,8 +122,13 @@ def synthesize( kwargs["exaggeration"] = exaggeration kwargs["cfg_weight"] = cfg_weight - with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): - wav = chatterbox_model.generate(text=text, **kwargs) + try: + with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16, enabled=torch.cuda.is_available()): + wav = chatterbox_model.generate(text=text, **kwargs) + except Exception: + logger.warning("fp16 autocast failed, retrying in fp32", exc_info=True) + with torch.inference_mode(): + wav = chatterbox_model.generate(text=text, **kwargs) if torch.cuda.is_available(): torch.cuda.synchronize() diff --git a/main.py b/main.py index 8bbc332..d86d4c9 100644 --- a/main.py +++ b/main.py @@ -18,6 +18,17 @@ logging.basicConfig( logger = logging.getLogger(__name__) +def _warmup(voices: dict) -> None: + from wyoming_voices import resolve_voice + audio_prompt = resolve_voice(None, voices) if voices else None + logger.info("Running warmup synthesis to populate MIOpen kernel cache...") + try: + engine.synthesize(text="Warmup.", audio_prompt_path=audio_prompt) + logger.info("Warmup complete — MIOpen cache populated") + except Exception: + logger.warning("Warmup synthesis failed (non-fatal)", exc_info=True) + + async def main() -> None: load_config() @@ -29,6 +40,12 @@ async def main() -> None: voices = load_voices() wyoming_info = create_wyoming_info(engine.get_sample_rate(), voices) + # Run a warmup synthesis before accepting connections so MIOpen benchmarks + # and caches the best convolution algorithms for all layer shapes. Without + # this, the first real HA request triggers benchmarking (hundreds of runs) + # and times out before any audio is returned. + _warmup(voices) + host = get_wyoming_host() port = get_wyoming_port() uri = f"tcp://{host}:{port}"