From 931a65c2ac3b3db36f97c42ecd4aebab71b314ff Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Thu, 28 Aug 2025 23:15:44 +0200 Subject: [PATCH 1/3] rocprof: implement ROCm profiling This uses rocPROF to fetch some interesting data and put it in the profile_data directory, the download link of which is then returned to the user. --- src/libkernelbot/run_eval.py | 61 ++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index e8722ba7..8c6fd602 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -297,6 +297,64 @@ def run_program( ) +def profile_program( + system: SystemInfo, + call: list[str], + seed: Optional[int], + timeout: int, + multi_gpu: bool, +) -> tuple[RunResult, Optional[ProfileResult]]: + # The runner-specific configuration should implement logic + # to fetch the data in this directory and return it as + # ProfileResult.download_url. + output_dir = Path('profile_data') + + if system.runtime == "ROCm": + # Wrap program in rocprof + output_dir.mkdir() + call = [ + "rocprofv3", + "--log-level", + "fatal", + "--hip-trace", + "--kernel-trace", + "--rccl-trace", + "--marker-trace", + "--hip-trace", + "--memory-copy-trace", + # New? Doesn't work in the runner + # "--memory-allocation-trace", + "--scratch-memory-trace", + # The HSA trace output is very large, so skip it for now + # "--hsa-trace", + "--output-format", + "pftrace", + "csv", + "-d", + str(output_dir), + # Just store the files as %pid%_tracename.ext instead of putting them in an + # additional directory named after the hostname. + "-o", + # Insert an extra path here so that the resulting zip has all files + # in the profile_data/ directory rather than the root. 
+ "profile_data/%pid%", "--", ] + call + + run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) + profile_result = None + + if run_result.success: + profile_result = ProfileResult( + profiler='rocPROF', + download_url=None, + ) + + return run_result, profile_result + else: + # TODO: Implement profiling for other platforms + return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None + def run_single_evaluation( system: SystemInfo, call: list[str], @@ -332,6 +390,9 @@ def run_single_evaluation( call += [mode, cases.name] + if mode == "profile": + return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) + return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None From 335ae52ebcd20533ded338f52a978707fef46eb8 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Sun, 31 Aug 2025 12:54:43 +0200 Subject: [PATCH 2/3] rocprof: post-process rocprof results rocPROF generates one trace for every process. Simply combine them together into a single trace for ease of use. Also remove the individual traces as they are no longer useful afterwards. --- src/libkernelbot/run_eval.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 8c6fd602..2058d6ca 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -4,6 +4,7 @@ import json import os import shlex +import shutil import subprocess import tempfile import time @@ -307,11 +308,13 @@ def profile_program( # The runner-specific configuration should implement logic # to fetch the data in this directory and return it as # ProfileResult.download_url. - output_dir = Path('profile_data') + # Insert an extra nested nested path here so that the resulting zip has all files + # in the profile_data/ directory rather than directly in the root. 
+ output_dir = Path(".") / "profile_data" / "profile_data" + output_dir.mkdir(parents=True, exist_ok=True) if system.runtime == "ROCm": # Wrap program in rocprof - output_dir.mkdir() call = [ "rocprofv3", "--log-level", @@ -337,7 +340,7 @@ def profile_program( "-o", # Insert an extra path here so that the resulting zip has all files # in the profile_data/ directory rather than the root. - "profile_data/%pid%", + "%pid%", "--", ] + call @@ -345,6 +348,20 @@ def profile_program( profile_result = None if run_result.success: + # Post-process trace data. + # rocPROF generates one trace for every process, but its more useful to + # have all traces be in the same file. Fortunately we can do that by + # concatenating. + traces = list(output_dir.glob("*.pftrace")) + with (output_dir / "combined.pftrace").open("wb") as combined: + for trace_path in traces: + with trace_path.open("rb") as trace: + shutil.copyfileobj(trace, combined) + + # After we've created the combined trace, there is no point in + # keeping the individual traces around. 
+ trace_path.unlink() + profile_result = ProfileResult( profiler='rocPROF', download_url=None, From 3de6d82ba72ef89a39f2521b0e5dc5192606c74c Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Sun, 31 Aug 2025 17:44:00 +0200 Subject: [PATCH 3/3] rocprof: also output code objects --- src/libkernelbot/run_eval.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 2058d6ca..c0897baf 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -233,11 +233,18 @@ def compile_cuda_script( # # noqa: C901 def run_program( - args: list[str], seed: Optional[int], timeout: int, multi_gpu: bool = False + args: list[str], + seed: Optional[int], + timeout: int, + multi_gpu: bool = False, + extra_env: Optional[dict[str, str]] = None, ) -> RunResult: print("[Running]") # set up a pipe so the tester can communicate its verdict with us env = os.environ.copy() + if extra_env is not None: + env.update(extra_env) + pipe_read, pipe_write = os.pipe() env["POPCORN_FD"] = str(pipe_write) if seed is not None: @@ -344,7 +351,10 @@ def profile_program( "--", ] + call - run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) + run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ + "GPU_DUMP_CODE_OBJECT": "1", + }) + profile_result = None if run_result.success: @@ -362,6 +372,10 @@ def profile_program( # keeping the individual traces around. trace_path.unlink() + # Also move the code objects to the profiling output directory. + for code_obj in list(Path.cwd().glob("_code_object*.o")): + code_obj.rename(output_dir / code_obj.name) + profile_result = ProfileResult( profiler='rocPROF', download_url=None,