From 1ce987cf52c84517e06ea43ec0564590a0b83e9e Mon Sep 17 00:00:00 2001 From: User Date: Fri, 26 Sep 2025 00:53:47 -0300 Subject: [PATCH] Fix Python 3.12 VRAM spikes with CUDNN benchmark Disables torch.backends.cudnn.benchmark on Python 3.12 to prevent severe VRAM allocation spikes that occur during model operations. The CUDNN benchmarking feature, introduced in v0.3.57 (commit e2d1e5da), tests multiple convolution algorithms and allocates temporary VRAM. This interacts poorly with Python 3.12's garbage collection behavior, causing multi-GB VRAM spikes before and after model inference. Solution: enable torch.backends.cudnn.benchmark only when the interpreter is not Python 3.12. This: - Preserves CUDNN benchmarking performance benefit on other Python versions - Only disables the problematic behavior on Python 3.12 - Maintains full functionality while fixing memory management issues - No impact on users not using --fast autotune flag Tested with TTS model wrappers that reproduce the issue consistently on Python 3.12 with ComfyUI v0.3.57+. Fixes: VRAM spikes in Python 3.12 environments Related: ComfyUI v0.3.57 regression affecting model memory management --- comfy/ops.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index 9d7dedd374b6..7ed5dcec099d 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -53,7 +53,10 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs): cast_to = comfy.model_management.cast_to #TODO: remove once no more references if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast: - torch.backends.cudnn.benchmark = True + import sys + # Skip CUDNN benchmark on Python 3.12 due to VRAM allocation issues with model wrappers + if sys.version_info[:2] != (3, 12): + torch.backends.cudnn.benchmark = True def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)