diff --git a/engine.py b/engine.py
index 0b951d2..557b2c1 100644
--- a/engine.py
+++ b/engine.py
@@ -46,13 +46,18 @@ def load_model() -> bool:
 
     _sample_rate = 24000
 
-    # Enable MIOpen algorithm benchmarking. Without this, PyTorch picks
-    # convolution algorithms heuristically and passes ptr=0/size=0 workspace
-    # to MIOpen, forcing a slow fallback on every conv op. With benchmark=True,
-    # PyTorch evaluates algorithms with proper workspace on first use and caches
-    # the best result (persisted via MIOPEN_USER_DB_PATH volume mount).
     if torch.cuda.is_available():
-        torch.backends.cudnn.benchmark = True
+        # torch.compile replaces MIOpen's convolution path with Triton-generated
+        # kernels, bypassing the workspace=0 fallback entirely. We compile only
+        # s3gen (HiFiGAN vocoder + flow matching) since that's the bottleneck.
+        # suppress_errors=True falls back to eager for any op compile can't handle.
+        try:
+            import torch._dynamo
+            torch._dynamo.config.suppress_errors = True
+            chatterbox_model.s3gen = torch.compile(chatterbox_model.s3gen, dynamic=True)
+            logger.info("s3gen compiled with torch.compile")
+        except Exception:
+            logger.warning("torch.compile unavailable, running s3gen in eager mode", exc_info=True)
 
     _patch_timing(chatterbox_model)
     logger.info("Model loaded successfully")
@@ -122,13 +127,8 @@ def synthesize(
     kwargs["exaggeration"] = exaggeration
     kwargs["cfg_weight"] = cfg_weight
 
-    try:
-        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16, enabled=torch.cuda.is_available()):
-            wav = chatterbox_model.generate(text=text, **kwargs)
-    except Exception:
-        logger.warning("fp16 autocast failed, retrying in fp32", exc_info=True)
-        with torch.inference_mode():
-            wav = chatterbox_model.generate(text=text, **kwargs)
+    with torch.inference_mode():
+        wav = chatterbox_model.generate(text=text, **kwargs)
 
     if torch.cuda.is_available():
         torch.cuda.synchronize()