Skip to content

Commit 5f7edb5

Browse files
committed
Use events for pmpp_v2 benchmarking
1 parent 4c5405d commit 5f7edb5

File tree

2 files changed

+18
-4
lines changed

2 files changed

+18
-4
lines changed

problems/pmpp_v2/eval.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import torch.cuda
1313

14-
from utils import set_seed
14+
from utils import set_seed, clear_l2_cache
1515
try:
1616
from task import TestSpec
1717
except ImportError:
@@ -218,18 +218,23 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
218218
data = generate_input(**test.args)
219219
check_copy = _clone_data(data)
220220
torch.cuda.synchronize()
221-
start = time.perf_counter_ns()
221+
start_event = torch.cuda.Event(enable_timing=True)
222+
end_event = torch.cuda.Event(enable_timing=True)
223+
clear_l2_cache()
224+
225+
start_event.record()
222226
output = custom_kernel(data)
227+
end_event.record()
223228
torch.cuda.synchronize()
224-
end = time.perf_counter_ns()
229+
duration = start_event.elapsed_time(end_event) * 1e6 # Convert ms to ns
225230

226231
if recheck:
227232
good, message = check_implementation(check_copy, output)
228233
if not good:
229234
return message
230235

231236
del output
232-
durations.append(end - start)
237+
durations.append(duration)
233238

234239
if i > 1:
235240
total_bm_duration = time.perf_counter_ns() - bm_start_time

problems/pmpp_v2/utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,3 +165,12 @@ def __exit__(self, exc_type, exc_value, traceback):
165165
torch.backends.cudnn.deterministic = self.deterministic
166166
torch.use_deterministic_algorithms(False)
167167
os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
168+
169+
def clear_l2_cache():
170+
# import cupy as cp
171+
# cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
172+
# create a large dummy tensor
173+
dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
174+
# write stuff to it so the allocation actually touches (and evicts) L2 cache lines
175+
dummy.fill_(42)
176+
del dummy

0 commit comments

Comments (0)