Skip to content

Commit 5b80add

Browse files
Turn off cuda malloc by default when --fast autotune is turned on. (#10393)
1 parent 9da397e commit 5b80add

File tree

3 files changed

+7
-6
lines changed

3 files changed

+7
-6
lines changed

comfy/model_management.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -371,6 +371,9 @@ def amd_min_version(device=None, min_rdna_version=0):
371371
except:
372372
pass
373373

374+
if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
375+
torch.backends.cudnn.benchmark = True
376+
374377
try:
375378
if torch_version_numeric >= (2, 5):
376379
torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)

comfy/ops.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -67,9 +67,6 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs):
6767

6868
cast_to = comfy.model_management.cast_to #TODO: remove once no more references
6969

70-
if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
71-
torch.backends.cudnn.benchmark = True
72-
7370
def cast_to_input(weight, input, non_blocking=False, copy=True):
7471
return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
7572

cuda_malloc.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
import os
22
import importlib.util
3-
from comfy.cli_args import args
3+
from comfy.cli_args import args, PerformanceFeature
44
import subprocess
55

66
#Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import.
@@ -75,8 +75,9 @@ def cuda_malloc_supported():
7575
spec.loader.exec_module(module)
7676
version = module.__version__
7777

78-
if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch
79-
args.cuda_malloc = cuda_malloc_supported()
78+
if int(version[0]) >= 2 and "+cu" in version: # enable by default for torch version 2.0 and up only on cuda torch
79+
if PerformanceFeature.AutoTune not in args.fast: # Autotune has issues with cuda malloc
80+
args.cuda_malloc = cuda_malloc_supported()
8081
except:
8182
pass
8283

0 commit comments

Comments (0)