From 931a65c2ac3b3db36f97c42ecd4aebab71b314ff Mon Sep 17 00:00:00 2001
From: Robin Voetter <robin@voetter.nl>
Date: Thu, 28 Aug 2025 23:15:44 +0200
Subject: [PATCH 1/3] rocprof: implement ROCm profiling

This uses rocPROF to fetch some interesting data and put it
in the profile_data directory, the download link of which
is then returned to the user.
---
 src/libkernelbot/run_eval.py | 61 ++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index e8722ba7..8c6fd602 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -297,6 +297,64 @@ def run_program(
     )
 
 
+def profile_program(
+    system: SystemInfo,
+    call: list[str],
+    seed: Optional[int],
+    timeout: int,
+    multi_gpu: bool,
+) -> tuple[RunResult, Optional[ProfileResult]]:
+    # The runner-specific configuration should implement logic
+    # to fetch the data in this directory and return it as
+    # ProfileResult.download_url.
+    output_dir = Path('profile_data')
+
+    if system.runtime == "ROCm":
+        # Wrap program in rocprof
+        output_dir.mkdir()
+        call = [
+            "rocprofv3",
+            "--log-level",
+            "fatal",
+            "--hip-trace",
+            "--kernel-trace",
+            "--rccl-trace",
+            "--marker-trace",
+            "--hip-trace",
+            "--memory-copy-trace",
+            # New? Doesn't work in the runner
+            # "--memory-allocation-trace",
+            "--scratch-memory-trace",
+            # The HSA trace output is very large, so skip it for now
+            # "--hsa-trace",
+            "--output-format",
+            "pftrace",
+            "csv",
+            "-d",
+            str(output_dir),
+            # Just store the files as %pid%_tracename.ext instead of putting them in an
+            # additional directory named after the hostname.
+            "-o",
+            # Insert an extra path here so that the resulting zip has all files
+            # in the profile_data/ directory rather than the root.
+            "profile_data/%pid%",
+            "--",
+        ] + call
+
+        run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)
+        profile_result = None
+
+        if run_result.success:
+            profile_result = ProfileResult(
+                profiler='rocPROF',
+                download_url=None,
+            )
+
+        return run_result, profile_result
+    else:
+        # TODO: Implement profiling for other platforms
+        return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
+
 def run_single_evaluation(
     system: SystemInfo,
     call: list[str],
@@ -332,6 +390,9 @@ def run_single_evaluation(
 
         call += [mode, cases.name]
 
+        if mode == "profile":
+            return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)
+
         return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
 
 

From 335ae52ebcd20533ded338f52a978707fef46eb8 Mon Sep 17 00:00:00 2001
From: Robin Voetter <robin@voetter.nl>
Date: Sun, 31 Aug 2025 12:54:43 +0200
Subject: [PATCH 2/3] rocprof: post-process rocprof results

rocPROF generates one trace for every process. Simply combine them
together into a single trace for ease of use. Also remove the
individual traces, as they are no longer useful afterwards.
---
 src/libkernelbot/run_eval.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 8c6fd602..2058d6ca 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -4,6 +4,7 @@
 import json
 import os
 import shlex
+import shutil
 import subprocess
 import tempfile
 import time
@@ -307,11 +308,13 @@ def profile_program(
     # The runner-specific configuration should implement logic
     # to fetch the data in this directory and return it as
     # ProfileResult.download_url.
-    output_dir = Path('profile_data')
+    # Insert an extra nested path here so that the resulting zip has all files
+    # in the profile_data/ directory rather than directly in the root.
+    output_dir = Path(".") / "profile_data" / "profile_data"
+    output_dir.mkdir(parents=True, exist_ok=True)
 
     if system.runtime == "ROCm":
         # Wrap program in rocprof
-        output_dir.mkdir()
         call = [
             "rocprofv3",
             "--log-level",
@@ -337,7 +340,7 @@ def profile_program(
             "-o",
             # Insert an extra path here so that the resulting zip has all files
             # in the profile_data/ directory rather than the root.
-            "profile_data/%pid%",
+            "%pid%",
             "--",
         ] + call
 
@@ -345,6 +348,20 @@ def profile_program(
         profile_result = None
 
         if run_result.success:
+            # Post-process trace data.
+            # rocPROF generates one trace for every process, but it's more useful to
+            # have all traces be in the same file. Fortunately we can do that by
+            # concatenating.
+            traces = list(output_dir.glob("*.pftrace"))
+            with (output_dir / "combined.pftrace").open("wb") as combined:
+                for trace_path in traces:
+                    with trace_path.open("rb") as trace:
+                        shutil.copyfileobj(trace, combined)
+
+                    # After we've created the combined trace, there is no point in
+                    # keeping the individual traces around.
+                    trace_path.unlink()
+
             profile_result = ProfileResult(
                 profiler='rocPROF',
                 download_url=None,

From 3de6d82ba72ef89a39f2521b0e5dc5192606c74c Mon Sep 17 00:00:00 2001
From: Robin Voetter <robin@voetter.nl>
Date: Sun, 31 Aug 2025 17:44:00 +0200
Subject: [PATCH 3/3] rocprof: also output code objects

---
 src/libkernelbot/run_eval.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 2058d6ca..c0897baf 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -233,11 +233,18 @@ def compile_cuda_script(  # # noqa: C901
 
 
 def run_program(
-    args: list[str], seed: Optional[int], timeout: int, multi_gpu: bool = False
+    args: list[str],
+    seed: Optional[int],
+    timeout: int,
+    multi_gpu: bool = False,
+    extra_env: Optional[dict[str, str]] = None,
 ) -> RunResult:
     print("[Running]")
     # set up a pipe so the tester can communicate its verdict with us
     env = os.environ.copy()
+    if extra_env is not None:
+        env.update(extra_env)
+
     pipe_read, pipe_write = os.pipe()
     env["POPCORN_FD"] = str(pipe_write)
     if seed is not None:
@@ -344,7 +351,10 @@ def profile_program(
             "--",
         ] + call
 
-        run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)
+        run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
+            "GPU_DUMP_CODE_OBJECT": "1",
+        })
+
         profile_result = None
 
         if run_result.success:
@@ -362,6 +372,10 @@ def profile_program(
                     # keeping the individual traces around.
                     trace_path.unlink()
 
+            # Also move the code objects to the profiling output directory.
+            for code_obj in list(Path.cwd().glob("_code_object*.o")):
+                code_obj.rename(output_dir / code_obj.name)
+
             profile_result = ProfileResult(
                 profiler='rocPROF',
                 download_url=None,