Add startup warmup and make fp16 autocast fault-tolerant
All checks were successful
Build ROCm Image / build (push) Successful in 3m10s
All checks were successful
Build ROCm Image / build (push) Successful in 3m10s
Warmup: run a synthesis before accepting Wyoming connections so MIOpen benchmarks and caches the best convolution algorithms for all conv layer shapes. Without this, the first HA request triggers hundreds of benchmark runs and times out. fp16: wrap the autocast synthesis in try/except so that a failed fp16 run is retried in fp32 rather than silently dropping the request. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -122,7 +122,12 @@ def synthesize(
|
|||||||
kwargs["exaggeration"] = exaggeration
|
kwargs["exaggeration"] = exaggeration
|
||||||
kwargs["cfg_weight"] = cfg_weight
|
kwargs["cfg_weight"] = cfg_weight
|
||||||
|
|
||||||
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
|
try:
|
||||||
|
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16, enabled=torch.cuda.is_available()):
|
||||||
|
wav = chatterbox_model.generate(text=text, **kwargs)
|
||||||
|
except Exception:
|
||||||
|
logger.warning("fp16 autocast failed, retrying in fp32", exc_info=True)
|
||||||
|
with torch.inference_mode():
|
||||||
wav = chatterbox_model.generate(text=text, **kwargs)
|
wav = chatterbox_model.generate(text=text, **kwargs)
|
||||||
|
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
|
|||||||
17
main.py
17
main.py
@@ -18,6 +18,17 @@ logging.basicConfig(
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _warmup(voices: dict) -> None:
    """Run one throwaway synthesis before the server starts serving.

    The synthesis result is discarded; the call exists only for its side
    effect of exercising the engine once (the log messages describe this as
    populating the MIOpen kernel cache).

    Warmup is best-effort: any exception raised while resolving the voice or
    synthesizing is logged as a warning and swallowed, so a failed warmup
    never prevents startup.

    Args:
        voices: Voice mapping handed to ``resolve_voice``. When it is empty,
            the synthesis runs with ``audio_prompt_path=None``.
    """
    from wyoming_voices import resolve_voice

    logger.info("Running warmup synthesis to populate MIOpen kernel cache...")
    try:
        # Voice resolution is inside the guarded region too: the warmup is
        # declared non-fatal, so a bad voice entry must not abort startup.
        # NOTE(review): resolve_voice(None, ...) presumably selects the
        # default voice -- confirm against wyoming_voices.
        audio_prompt = resolve_voice(None, voices) if voices else None
        engine.synthesize(text="Warmup.", audio_prompt_path=audio_prompt)
    except Exception:
        logger.warning("Warmup synthesis failed (non-fatal)", exc_info=True)
    else:
        logger.info("Warmup complete — MIOpen cache populated")
|
||||||
|
|
||||||
|
|
||||||
async def main() -> None:
|
async def main() -> None:
|
||||||
load_config()
|
load_config()
|
||||||
|
|
||||||
@@ -29,6 +40,12 @@ async def main() -> None:
|
|||||||
voices = load_voices()
|
voices = load_voices()
|
||||||
wyoming_info = create_wyoming_info(engine.get_sample_rate(), voices)
|
wyoming_info = create_wyoming_info(engine.get_sample_rate(), voices)
|
||||||
|
|
||||||
|
# Run a warmup synthesis before accepting connections so MIOpen benchmarks
|
||||||
|
# and caches the best convolution algorithms for all layer shapes. Without
|
||||||
|
# this, the first real HA request triggers benchmarking (hundreds of runs)
|
||||||
|
# and times out before any audio is returned.
|
||||||
|
_warmup(voices)
|
||||||
|
|
||||||
host = get_wyoming_host()
|
host = get_wyoming_host()
|
||||||
port = get_wyoming_port()
|
port = get_wyoming_port()
|
||||||
uri = f"tcp://{host}:{port}"
|
uri = f"tcp://{host}:{port}"
|
||||||
|
|||||||
Reference in New Issue
Block a user