Merged
Changes from 6 commits
36 changes: 17 additions & 19 deletions tritonbench/components/do_bench/run.py
@@ -185,6 +185,14 @@ def _do_bench_profiler(
Returns:
List of measured kernel times in milliseconds (if return_mode="all") or single value.
"""
# we don't want any outside errors propagating into benchmarking
torch.cuda.synchronize()

# warmup `fn` (and catches any failures in the process)
for _ in range(3):
fn()
torch.cuda.synchronize()

# Get cache for L2 cache clearing
cache = triton.runtime.driver.active.get_empty_cache_for_benchmark()

@@ -193,36 +201,28 @@

# Calculate number of iterations based on target rep time
if estimate_ms == 0:
n_repeat = 100 # Default if function is very fast
n_repeat = 1000 # Default if function is very fast
else:
n_repeat = max(1, int(rep / estimate_ms))

# Helper function to execute one iteration
def run_iteration():
def run_iteration(should_clear_cache: bool):
if grad_to_none is not None:
for x in grad_to_none:
x.grad = None
cache.zero_()
if should_clear_cache:
cache.zero_()
Contributor Author:

When measuring with cudagraph, we should not clear the cache, since the cache-clearing kernel adds extra memory-access time. This matches the behavior of triton.testing.do_bench_cudagraph:
https://github.com/triton-lang/triton/blob/f90255886173b873dfb8b5bbb9a3f67951954660/python/triton/testing.py#L106-L111
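
Below is a minimal, illustrative sketch of that capture pattern (the toy fn, matrix sizes, and n_repeat are assumptions of this sketch, not the actual triton.testing.do_bench_cudagraph source): only the benchmarked function is captured into the graph, and whole-graph replays are timed with CUDA events.

import torch

# Sketch only: a toy workload standing in for the benchmarked `fn`.
assert torch.cuda.is_available()
x = torch.randn(1024, 1024, device="cuda")
y = torch.empty_like(x)

def fn():
    torch.mm(x, x, out=y)

# Warm up on a side stream before capture, as recommended for CUDA graph capture.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    fn()
torch.cuda.current_stream().wait_stream(s)

n_repeat = 100
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    for _ in range(n_repeat):
        fn()  # note: no cache.zero_() is captured into the graph

torch.cuda.synchronize()

# Time one whole-graph replay with CUDA events; each replay runs n_repeat calls.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
g.replay()
end.record()
torch.cuda.synchronize()
print(f"{start.elapsed_time(end) / n_repeat:.4f} ms per call")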

Contributor:

I thought we should still clear the L2 cache when running with cudagraph. This is an issue with do_bench_cudagraph, cc @nmacchioni.

Contributor Author:

We use tensor.zero_() to clear the L2 cache. If we do that under cudagraph, the captured graph would contain both 1) the tensor.zero_() AND 2) the fn being benchmarked. Would this lead to a wrong result?

By contrast, in do_bench, clear_cache runs outside the measured interval, so the measurement does NOT include the L2 cache-clearing time:

    for i in range(n_repeat):
        # we don't want `fn` to accumulate gradient values
        # if it contains a backward pass. So we clear the
        # provided gradients
        if grad_to_none is not None:
            for x in grad_to_none:
                x.grad = None
        # we clear the L2 cache before each run
        runtime.driver.active.clear_cache(cache)
        # record time of `fn`
        start_event[i].record()
        fn()
        end_event[i].record()

Contributor:

Clearing the L2 cache is essential for more stable and accurate latency measurements.
We will explicitly exclude the latency of the CACHE_CLEAR_KERNEL from the reported kernel latencies: https://github.com/BoyuanFeng/tritonbench/blob/a0c75ad868f9d03f1cf6107da139e500cfe0b60b/tritonbench/components/do_bench/run.py#L259
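
As an illustration of that exclusion, here is a self-contained sketch of summing GPU kernel time from a profiler trace while skipping the cache-clearing kernel (the toy fn, the buffer size, and discovering the kernel name at runtime instead of using the CACHE_CLEAR_KERNEL constant from run.py are assumptions of this sketch):

import torch
from torch.autograd import DeviceType
from torch.profiler import ProfilerActivity, profile

assert torch.cuda.is_available()
cache = torch.empty(256 * 1024 * 1024, dtype=torch.int8, device="cuda")  # L2-flush buffer
x = torch.randn(4096, 4096, device="cuda")

def fn():
    return x @ x

# Discover which kernel name(s) zero_() shows up as, rather than hard-coding them.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as warm:
    cache.zero_()
    torch.cuda.synchronize()
cache_clear_names = {e.name for e in warm.events() if e.device_type == DeviceType.CUDA}

n_repeat = 10
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for _ in range(n_repeat):
        cache.zero_()  # appears in the trace as its own kernel
        fn()
    torch.cuda.synchronize()

# Sum GPU kernel time, skipping the cache-clear kernel(s), then normalize by iterations.
# self_device_time_total is named self_cuda_time_total on older PyTorch releases.
kernel_time_us = sum(
    e.self_device_time_total
    for e in prof.events()
    if e.device_type == DeviceType.CUDA and e.name not in cache_clear_names
)
print(f"avg kernel time: {kernel_time_us / 1000.0 / n_repeat:.4f} ms")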

Contributor Author:

Is this for the profiler-based benchmark but not for the cudagraph-based benchmark?

Contributor (@xuzhao9, Sep 10, 2025):

We provide a cudagraph option for the profiler-based benchmark, e.g., --latency-measure-mode=profiler --cudagraph: #386

Contributor:

Echoing Xu: the `and evt.name != CACHE_CLEAR_KERNEL` check below should exclude tensor.zero_() from the GPU trace, so I believe we should run tensor.zero_() even in the cudagraph-based benchmark. Otherwise --profiler --cudagraph vs. --profiler-only will give different latency results.

Contributor Author:

Agree. CACHE_CLEAR_KERNEL would still appear in the CUDA trace when cudagraph is used. Thanks for the clarification!

fn()

if use_cudagraph:
# Create CUDA graph
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
for _ in range(n_repeat):
run_iteration()
torch.cuda.synchronize()
else:
# Regular mode warmup
n_warmup = max(1, int(warmup / estimate_ms)) if estimate_ms > 0 else 25

torch.cuda.synchronize()
for _ in range(n_warmup):
run_iteration()
run_iteration(should_clear_cache=False)
torch.cuda.synchronize()

n_profiler_runs = 5
iterations_per_profiler_run = n_repeat
n_profiler_runs = 10

# Benchmark phase - collect kernel times for each iteration
all_kernel_times = []
@@ -243,8 +243,8 @@ def run_iteration():
g.replay()
else:
# Execute multiple iterations for regular mode
for _ in range(iterations_per_profiler_run):
run_iteration()
for _ in range(n_repeat):
run_iteration(should_clear_cache=True)
torch.cuda.synchronize()

# Collect all kernel execution intervals
@@ -299,9 +299,7 @@ def run_iteration():
)

# Convert to milliseconds and normalize by iterations
total_kernel_time_ms = (
total_kernel_time_us / 1000.0
) / iterations_per_profiler_run
total_kernel_time_ms = (total_kernel_time_us / 1000.0) / n_repeat
all_kernel_times.append(total_kernel_time_ms)

times = torch.tensor(all_kernel_times, dtype=torch.float)