Skip to content

Commit 5f7edb5

Browse files
committed
Use events for pmpp_v2 benchmarking
1 parent 4c5405d commit 5f7edb5

File tree

2 files changed

+18
-4
lines changed

2 files changed

+18
-4
lines changed

problems/pmpp_v2/eval.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import torch.cuda
1313

14-
from utils import set_seed
14+
from utils import set_seed, clear_l2_cache
1515
try:
1616
from task import TestSpec
1717
except ImportError:
@@ -218,18 +218,23 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
218218
data = generate_input(**test.args)
219219
check_copy = _clone_data(data)
220220
torch.cuda.synchronize()
221-
start = time.perf_counter_ns()
221+
start_event = torch.cuda.Event(enable_timing=True)
222+
end_event = torch.cuda.Event(enable_timing=True)
223+
clear_l2_cache()
224+
225+
start_event.record()
222226
output = custom_kernel(data)
227+
end_event.record()
223228
torch.cuda.synchronize()
224-
end = time.perf_counter_ns()
229+
duration = start_event.elapsed_time(end_event) * 1e6 # Convert ms to ns
225230

226231
if recheck:
227232
good, message = check_implementation(check_copy, output)
228233
if not good:
229234
return message
230235

231236
del output
232-
durations.append(end - start)
237+
durations.append(duration)
233238

234239
if i > 1:
235240
total_bm_duration = time.perf_counter_ns() - bm_start_time

problems/pmpp_v2/utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,3 +165,12 @@ def __exit__(self, exc_type, exc_value, traceback):
165165
torch.backends.cudnn.deterministic = self.deterministic
166166
torch.use_deterministic_algorithms(False)
167167
os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
168+
169+
def clear_l2_cache():
170+
# import cupy as cp
171+
# cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
172+
# create a large dummy tensor
173+
dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
174+
# write stuff to it so the allocation actually touches (and evicts) L2 cache lines
175+
dummy.fill_(42)
176+
del dummy

0 commit comments

Comments (0)