From 514bbad0e905cce0c60b68f8488285d6a47f178f Mon Sep 17 00:00:00 2001 From: scott Date: Sun, 5 Apr 2026 13:24:05 -0400 Subject: [PATCH] Enable cudnn.benchmark to fix MIOpen workspace=0 on convolutions Timing showed s3gen.inference (HiFiGAN vocoder) taking 22s and reference-audio processing taking ~18s, both dominated by Conv1d ops hitting the MIOpen fallback path. With torch.backends.cudnn.benchmark=False (the default), PyTorch passes an empty workspace (ptr=0, size=0) to MIOpen, causing GemmFwdRest to fail and fall back to a slow path on every call. With benchmark=True, PyTorch evaluates convolution algorithms with proper workspace allocation and caches the best result via MIOPEN_USER_DB_PATH. The first inference will be slower while benchmarking runs; subsequent calls use the cache. Co-Authored-By: Claude Sonnet 4.6 --- engine.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/engine.py b/engine.py index dd42a88..1cafcb2 100644 --- a/engine.py +++ b/engine.py @@ -45,6 +45,15 @@ def load_model() -> bool: _is_turbo = False _sample_rate = 24000 + + # Enable MIOpen algorithm benchmarking. Without this, PyTorch picks + # convolution algorithms heuristically and passes ptr=0/size=0 workspace + # to MIOpen, forcing a slow fallback on every conv op. With benchmark=True, + # PyTorch evaluates algorithms with proper workspace on first use and caches + # the best result (persisted via MIOPEN_USER_DB_PATH volume mount). + if torch.cuda.is_available(): + torch.backends.cudnn.benchmark = True + _patch_timing(chatterbox_model) logger.info("Model loaded successfully") return True