From 9b62fce5c5c4ef917e36fe96ce868b1d813d92c0 Mon Sep 17 00:00:00 2001 From: scott Date: Sun, 5 Apr 2026 20:34:33 -0400 Subject: [PATCH] [dev-fp16] Convert model weights to fp16 at load time Converting t3/s3gen/ve to fp16 once at load time means: - Warmup runs in fp16, covering the right dtypes for all real requests - No per-call autocast casting overhead - ~2x faster matrix ops and convolutions on RDNA 2 hardware Co-Authored-By: Claude Sonnet 4.6 --- engine.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/engine.py b/engine.py index 766ed87..a66d95f 100644 --- a/engine.py +++ b/engine.py @@ -50,6 +50,19 @@ def load_model() -> bool: _is_turbo = False _sample_rate = 24000 + + # Convert weights to fp16. Done once at load time so the warmup + # covers the right dtypes and there's no per-call casting overhead. + if torch.cuda.is_available(): + try: + for attr in ("t3", "s3gen", "ve"): + m = getattr(chatterbox_model, attr, None) + if m is not None: + m.half() + logger.info("Model converted to fp16") + except Exception: + logger.warning("fp16 conversion failed, running in fp32", exc_info=True) + logger.info("Model loaded successfully") return True except Exception: