From 1ce987cf52c84517e06ea43ec0564590a0b83e9e Mon Sep 17 00:00:00 2001 From: User Date: Fri, 26 Sep 2025 00:53:47 -0300 Subject: [PATCH] Fix Python 3.12 VRAM spikes with CUDNN benchmark Disables torch.backends.cudnn.benchmark on Python 3.12 to prevent severe VRAM allocation spikes that occur during model operations. The CUDNN benchmarking feature, introduced in v0.3.57 (commit e2d1e5da), tests multiple convolution algorithms and allocates temporary VRAM. This interacts poorly with Python 3.12's garbage collection behavior, causing multi-GB VRAM spikes before and after model inference. Solution: enable torch.backends.cudnn.benchmark only when the interpreter is not Python 3.12. This: - Preserves CUDNN benchmarking performance benefit on other Python versions - Only disables the problematic behavior on Python 3.12 - Maintains full functionality while fixing memory management issues - No impact on users not using --fast autotune flag Tested with TTS model wrappers that reproduce the issue consistently on Python 3.12 with ComfyUI v0.3.57+. Fixes: VRAM spikes in Python 3.12 environments Related: ComfyUI v0.3.57 regression affecting model memory management --- comfy/ops.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index 9d7dedd374b6..7ed5dcec099d 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -53,7 +53,10 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs): cast_to = comfy.model_management.cast_to #TODO: remove once no more references if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast: - torch.backends.cudnn.benchmark = True + import sys + # Skip CUDNN benchmark on Python 3.12 due to VRAM allocation issues with model wrappers + if sys.version_info[:2] != (3, 12): + torch.backends.cudnn.benchmark = True def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)