From 967ed41239253e95dd5128291d2931c38808fb48 Mon Sep 17 00:00:00 2001 From: scott Date: Sun, 5 Apr 2026 20:30:49 -0400 Subject: [PATCH] =?UTF-8?q?Revert=20FP16=20autocast=20=E2=80=94=20increase?= =?UTF-8?q?s=20TTFA=20on=20first=20request?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit autocast triggers fp16 kernel selection at first call for each tensor shape. Since the warmup uses short text, real requests re-trigger selection and are net slower. Keeping FP32 + conditionals cache. Co-Authored-By: Claude Sonnet 4.6 --- engine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/engine.py b/engine.py index 55b457b..766ed87 100644 --- a/engine.py +++ b/engine.py @@ -116,8 +116,7 @@ def synthesize( kwargs["cfg_weight"] = cfg_weight with torch.inference_mode(): - with torch.amp.autocast(device_type="cuda", dtype=torch.float16): - wav = chatterbox_model.generate(text=text, **kwargs) + wav = chatterbox_model.generate(text=text, **kwargs) if torch.cuda.is_available(): torch.cuda.synchronize()