From f20699aed3ad7c54541bbb2e8afd9316d762437b Mon Sep 17 00:00:00 2001
From: scott
Date: Sun, 5 Apr 2026 13:34:21 -0400
Subject: [PATCH] Add fp16 autocast to synthesis for faster GPU throughput

The 6700 XT has significantly higher fp16 throughput than fp32.
autocast("cuda") uses fp16 for matmuls and convolutions (HiFiGAN, S3
tokenizer, flow matching) while keeping fp32 for precision-sensitive
ops like softmax and layer norm.

Co-Authored-By: Claude Sonnet 4.6
---
 engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/engine.py b/engine.py
index 1cafcb2..b1f6cbb 100644
--- a/engine.py
+++ b/engine.py
@@ -122,7 +122,7 @@ def synthesize(
     kwargs["exaggeration"] = exaggeration
     kwargs["cfg_weight"] = cfg_weight
 
-    with torch.inference_mode():
+    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
         wav = chatterbox_model.generate(text=text, **kwargs)
 
     if torch.cuda.is_available():