From 77d04e1f610cf7987c3308365caef072801c9397 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 13:08:01 -0700 Subject: [PATCH 01/42] push to test workflow --- .github/workflows/nvidia_workflow.yml | 2 + .../cogs/leaderboard_cog.py | 101 ++++++++++++++---- src/discord-cluster-manager/consts.py | 4 + src/discord-cluster-manager/eval.py | 47 +++++--- .../launchers/github.py | 1 + src/discord-cluster-manager/leaderboard_db.py | 26 +++++ src/discord-cluster-manager/run_eval.py | 5 +- src/discord-cluster-manager/submission.py | 37 ++++--- src/discord-cluster-manager/utils.py | 5 +- 9 files changed, 176 insertions(+), 52 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 7852377d..5cf7c0a3 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -28,6 +28,8 @@ jobs: - name: Create input files shell: bash run: | + # install jq + apt update && apt install -y jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index fadc9c41..cc4366bf 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -27,6 +27,8 @@ with_error_handling, ) +from consts import REFERENCE_USER_ID, REFERENCE_USER + if TYPE_CHECKING: from ..bot import ClusterBot @@ -63,28 +65,51 @@ async def on_submit_hook( # noqa: C901 self, interaction: discord.Interaction, leaderboard_name: Optional[str], - script: discord.Attachment, + script: Optional[discord.Attachment], mode: SubmissionMode, cmd_gpus: Optional[List[str]], ) -> int: """ Called as the main body of a submission to route to the correct runner. """ - # Read the template file - submission_content = await script.read() - try: - submission_content = submission_content.decode() - except UnicodeError: - await send_discord_message( - interaction, "Could not decode your file. Is it UTF-8?", ephemeral=True - ) - return -1 + if script is None: + if mode != SubmissionMode.REFERENCE: + await send_discord_message( + interaction, + "Script attachment is required for this unless submission mode is reference", + ephemeral=True, + ) + return -1 + else: + submission_content = "" + else: + # Read the template file + submission_content = await script.read() + + try: + submission_content = submission_content.decode() + except UnicodeError: + await send_discord_message( + interaction, "Could not decode your file. 
Is it UTF-8?", ephemeral=True + ) + return -1 + if mode == SubmissionMode.REFERENCE: + # create fake reference submission + file_name = None + submission_content = None + user_id = REFERENCE_USER_ID + user_name = REFERENCE_USER + else: + file_name = script.filename + submission_content = submission_content + user_id = interaction.user.id + user_name = interaction.user.global_name or interaction.user.name req = SubmissionRequest( code=submission_content, - file_name=script.filename, - user_id=interaction.user.id, + file_name=file_name, + user_id=user_id, gpus=cmd_gpus, leaderboard=leaderboard_name, ) @@ -105,26 +130,28 @@ async def on_submit_hook( # noqa: C901 command = self.bot.get_cog("SubmitCog").submit_leaderboard - user_name = interaction.user.global_name or interaction.user.name # Create a submission entry in the database with self.bot.leaderboard_db as db: sub_id = db.create_submission( leaderboard=req.leaderboard, - file_name=script.filename, + file_name=file_name, code=submission_content, - user_id=interaction.user.id, + user_id=user_id, time=datetime.now(), user_name=user_name, ) + if mode == SubmissionMode.REFERENCE: + run_msg = f"Submission **{sub_id}**: is a reference submission for `{req.leaderboard}`" + else: + run_msg = f"Submission **{sub_id}**: `{file_name}` for `{req.leaderboard}`" - run_msg = f"Submission **{sub_id}**: `{script.filename}` for `{req.leaderboard}`" reporter = MultiProgressReporter(interaction, run_msg) try: tasks = [ command( sub_id, submission_content, - script.filename, + file_name, gpu, reporter.add_run(f"{gpu.name} on {gpu.runner}"), req.task, @@ -140,7 +167,7 @@ async def on_submit_hook( # noqa: C901 command( sub_id, submission_content, - script.filename, + file_name, gpu, reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"), req.task, @@ -224,10 +251,19 @@ async def submit( self, interaction: discord.Interaction, leaderboard_name: Optional[str], - script: discord.Attachment, + script: Optional[discord.Attachment], mode: SubmissionMode, gpu: Optional[str], ): + + if not mode == SubmissionMode.REFERENCE and not script: + await send_discord_message( + interaction, + "Script attachment is required for this unless submission mode is reference", + ephemeral=True, + ) + return + if not self.bot.accepts_jobs: await send_discord_message( interaction, @@ -319,6 +355,33 @@ async def submit_ranked( interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu ) + @app_commands.command(name="reference_run", description="Create a reference run for a leaderboard") + @app_commands.describe( + leaderboard_name="Name of the leaderboard to create a reference run for", + gpu="Select GPU. 
Leave empty for interactive selection.", + ) + @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) + @with_error_handling + async def submit_reference( + self, + interaction: discord.Interaction, + leaderboard_name: str, + gpu: Optional[str] = None, + ): + # Check if reference run already exists + with self.bot.leaderboard_db as db: + if db.has_reference_run(leaderboard_name): + await send_discord_message( + interaction, + f"A reference run for leaderboard '{leaderboard_name}' already exists.", + ephemeral=True, + ) + return + # Process as a special submission + return await self.submit( + interaction, leaderboard_name, None, mode=SubmissionMode.REFERENCE, gpu=gpu + ) + async def lang_autocomplete( interaction: discord.Interaction, diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 928f59d4..ac2ffaf2 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -97,6 +97,7 @@ class SubmissionMode(Enum): LEADERBOARD = "leaderboard" PRIVATE = "private" SCRIPT = "script" + REFERENCE = "reference" class Language(Enum): @@ -157,3 +158,6 @@ class RankCriterion(Enum): --index-url https://download.pytorch.org/whl/rocm6.2.4 torch """ + +REFERENCE_USER = "REFERENCE_USER" +REFERENCE_USER_ID = -123 \ No newline at end of file diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index d0f693e1..d27cfbff 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -1,3 +1,4 @@ +import argparse import math import os import sys @@ -29,15 +30,22 @@ def correctness(rng: torch.Generator) -> bool: return True -def metric(logger: PopcornLogger, rng: torch.Generator): +def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: bool = False): warmup_runs = 10 timed_runs = 100 + if time_reference_impl: + logger.log("Timing Reference Implementation") + else: + logger.log("Timing Submitted Custom Implementation") # Warmup Code print("warming up...") for _ in range(warmup_runs): inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) - _ = custom_kernel(inputs) + if time_reference_impl: + _ = ref_kernel(inputs) + else: + _ = custom_kernel(inputs) torch.cuda.synchronize() # Timing Code @@ -47,16 +55,20 @@ def metric(logger: PopcornLogger, rng: torch.Generator): inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) start_time = time.time() - custom_output = custom_kernel(inputs) + if time_reference_impl: + ref_output = ref_kernel(inputs) + else: + custom_output = custom_kernel(inputs) torch.cuda.synchronize() end_time = time.time() times.append(end_time - start_time) - ref_output = ref_kernel(inputs) - torch.cuda.synchronize() - if not check_implementation(custom_output, ref_output): - logger.log("check", "fail") - exit(112) + if not time_reference_impl: + ref_output = ref_kernel(inputs) + torch.cuda.synchronize() + if not check_implementation(custom_output, ref_output): + logger.log("check", "fail") + exit(112) total_time = sum(times) average_duration = total_time / timed_runs @@ -71,10 +83,17 @@ def metric(logger: PopcornLogger, rng: torch.Generator): logger.log("duration.best", min(times) * 1e9) logger.log("duration.worst", max(times) * 1e9) - print(f"Submitted kernel runtime: {average_duration:.4f} ± {standard_error:.4} seconds") + kernel_name = "Reference" if time_reference_impl else "Submitted" + print(f"{kernel_name} kernel runtime: {average_duration:.4f} ± 
{standard_error:.4} seconds") def main(): + parser = argparse.ArgumentParser(description='Evaluate kernel implementation.') + parser.add_argument( + '--time-ref', action='store_true', help='Time ref kernel.' + ) + args = parser.parse_args() + try: logger = PopcornLogger(int(os.environ["POPCORN_FD"])) except Exception as e: @@ -85,10 +104,12 @@ def main(): rng = torch.Generator() rng.manual_seed(seed) - if not correctness(rng): - logger.log("check", "fail") - exit(112) - metric(logger, rng) + if not args.time_ref: + if not correctness(rng): + logger.log("check", "fail") + exit(112) + + metric(logger, rng, time_reference_impl=args.time_ref) if __name__ == "__main__": diff --git a/src/discord-cluster-manager/launchers/github.py b/src/discord-cluster-manager/launchers/github.py index 10232ebd..7a5c13e8 100644 --- a/src/discord-cluster-manager/launchers/github.py +++ b/src/discord-cluster-manager/launchers/github.py @@ -44,6 +44,7 @@ async def run_submission( raise ValueError(f"Invalid GPU type: {gpu_type.value}") lang = config["lang"] + args = config.get("args", []) if lang == "cu" and gpu_vendor == "AMD": # TODO implement HIP raise NotImplementedError("Cannot use CUDA runs with AMD GPUs") diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index d48e8404..3b979e69 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -27,6 +27,8 @@ setup_logging, ) +from consts import REFERENCE_USER_ID, REFERENCE_USER + leaderboard_name_cache = LRUCache(max_size=512) logger = setup_logging(__name__) @@ -213,6 +215,11 @@ def create_submission( time: datetime.datetime, user_name: str = None, ) -> Optional[int]: + if user_id == REFERENCE_USER_ID and user_name == REFERENCE_USER: + # todo: add reference code to the database + code = "" + file_name = "reference.py" + try: # check if we already have the code self.cursor.execute( @@ -287,6 +294,25 @@ def create_submission( self.connection.rollback() # Ensure rollback if error occurs raise KernelBotError("Error during creation of submission") from e + def has_reference_run(self, leaderboard_name: str) -> bool: + try: + self.cursor.execute( + """ + SELECT COUNT(*) FROM leaderboard.runs + WHERE leaderboard.runs.leaderboard_id = ( + SELECT leaderboard.leaderboard.id + FROM leaderboard.leaderboard + WHERE leaderboard.leaderboard.name = %s + ) + AND leaderboard.runs.user_id = %s; + """, + (leaderboard_name, REFERENCE_USER_ID), + ) + return self.cursor.fetchone()[0] > 0 + except psycopg2.Error as e: + logger.error("Error checking for reference run", exc_info=e) + return False + def mark_submission_done( self, submission: int, diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 5e7ab046..8ab6a28f 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -455,7 +455,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. try: - compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python", "submission.py"].extend(kwargs.get("args", [])), seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, @@ -511,7 +511,7 @@ def run_evaluation( require multiple runner calls. 
""" results: dict[str, EvalResult] = {} - if mode in ["test", "benchmark", "profile", "script"]: + if mode in ["test", "benchmark", "profile", "script", "reference"]: results[mode] = call(mode=mode) elif mode in ["private", "leaderboard"]: # first, run the tests @@ -552,6 +552,7 @@ def run_config(config: dict): "ranked_timeout": config.get("ranked_timeout", Timeout.RANKED), "benchmark_timeout": config.get("benchmark_timeout", Timeout.BENCHMARK), "test_timeout": config.get("test_timeout", Timeout.TEST), + "args": config.get("args", []), } if config["lang"] == "py": runner = functools.partial( diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 2777b15f..19c0c6a1 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -27,19 +27,30 @@ class ProcessedSubmissionRequest(SubmissionRequest): def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB) -> ProcessedSubmissionRequest: - if profanity.contains_profanity(req.file_name): - raise KernelBotError("Please provide a non rude filename") + # Detect reference submissions (no file name & no code provided) + # A reference submission is identified by missing/empty code content (no user file) + is_reference_submission = not req.code - # check file extension - if not req.file_name.endswith((".py", ".cu", ".cuh", ".cpp")): + # Perform filename/content related checks only for *non* reference submissions + if not is_reference_submission: + if profanity.contains_profanity(req.file_name): + raise KernelBotError("Please provide a non rude filename") + + # check file extension (if filename provided) + if req.file_name and not req.file_name.endswith((".py", ".cu", ".cuh", ".cpp")): + raise KernelBotError( + "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file", + ) + + # process file directives (GPU selection / leaderboard name) + req = handle_popcorn_directives(req) + + # Ensure leaderboard name is present (might have come from the command directly) + if req.leaderboard is None: raise KernelBotError( - "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file", + "Missing leaderboard name. Either supply one as a command argument or via ``#!POPCORN leaderboard `` directive.", ) - # process file directives - req = handle_popcorn_directives(req) - assert req.leaderboard is not None - leaderboard = lookup_leaderboard(req.leaderboard, lb_db) check_deadline(leaderboard) @@ -117,14 +128,6 @@ def handle_popcorn_directives(req: SubmissionRequest) -> SubmissionRequest: else: req.leaderboard = info["leaderboard"] - if req.leaderboard is None: - raise KernelBotError( - "Missing leaderboard name. 
" - "Either supply one as an argument in the submit command, or " - "specify it in your submission script using the " - "`{#,//}!POPCORN leaderboard ` directive.", - ) - return req diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index c39192f7..86742561 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -245,8 +245,11 @@ def build_task_config( if lang == "py": config["main"] = "eval.py" - + args = [] + if mode == SubmissionMode.REFERENCE: + args.append("--time-ref") return { + "args": args, **config, "sources": { eval_name: submission_content, From f42f885bd991584af7552503728f3e95f74bd04a Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 14:02:10 -0700 Subject: [PATCH 02/42] test push --- src/discord-cluster-manager/consts.py | 3 ++- src/discord-cluster-manager/eval.py | 3 ++- src/discord-cluster-manager/run_eval.py | 14 +++++++++----- src/discord-cluster-manager/utils.py | 4 ++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index ac2ffaf2..cf50fdf2 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -160,4 +160,5 @@ class RankCriterion(Enum): """ REFERENCE_USER = "REFERENCE_USER" -REFERENCE_USER_ID = -123 \ No newline at end of file +REFERENCE_USER_ID = -123 +REFERENCE_TIMING_ARG = "--reference-timing" \ No newline at end of file diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index d27cfbff..6c7f81ef 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -7,6 +7,7 @@ import torch from reference import check_implementation, generate_input, ref_kernel from submission import custom_kernel +from consts import REFERENCE_TIMING_ARG class PopcornLogger: @@ -90,7 +91,7 @@ def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: boo def main(): parser = argparse.ArgumentParser(description='Evaluate kernel implementation.') parser.add_argument( - '--time-ref', action='store_true', help='Time ref kernel.' + REFERENCE_TIMING_ARG, action='store_true', help='Time ref kernel.' ) args = parser.parse_args() diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 8ab6a28f..2652bf62 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -10,7 +10,7 @@ from types import NoneType from typing import Optional, Protocol, Union -from consts import CUDA_FLAGS, ExitCode, Timeout +from consts import CUDA_FLAGS, ExitCode, REFERENCE_TIMING_ARG, Timeout @dataclasses.dataclass @@ -446,11 +446,15 @@ def run_pytorch_script( # noqa: C901 RunResult """ start = datetime.datetime.now() + args = kwargs.get("args", []) + # log everything that's going on + print("Running with args: %s", args) + print("Running with sources: %s", sources) + print("Running with main: %s", main) try: - assert main in sources.keys() - - # Write submission files to directory - _create_files(sources) + if REFERENCE_TIMING_ARG not in args: + assert main in sources.keys() + _create_files(sources) # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 86742561..695faab9 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, List, NotRequired, Optional, TypedDict import discord -from consts import Language, SubmissionMode +from consts import Language, SubmissionMode, REFERENCE_TIMING_ARG if TYPE_CHECKING: from task import LeaderboardTask @@ -247,7 +247,7 @@ def build_task_config( config["main"] = "eval.py" args = [] if mode == SubmissionMode.REFERENCE: - args.append("--time-ref") + args.append(REFERENCE_TIMING_ARG) return { "args": args, **config, From 1c5d1220d0bbaaed9a114b3fa9ba75e6d26b3525 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 14:08:39 -0700 Subject: [PATCH 03/42] test push --- .../cogs/leaderboard_cog.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index cc4366bf..73a5f285 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -369,14 +369,14 @@ async def submit_reference( gpu: Optional[str] = None, ): # Check if reference run already exists - with self.bot.leaderboard_db as db: - if db.has_reference_run(leaderboard_name): - await send_discord_message( - interaction, - f"A reference run for leaderboard '{leaderboard_name}' already exists.", - ephemeral=True, - ) - return + # with self.bot.leaderboard_db as db: + # if db.has_reference_run(leaderboard_name): + # await send_discord_message( + # interaction, + # f"A reference run for leaderboard '{leaderboard_name}' already exists.", + # ephemeral=True, + # ) + # return # Process as a special submission return await self.submit( interaction, leaderboard_name, None, mode=SubmissionMode.REFERENCE, gpu=gpu From f9b0ea392afdb18d825cb3d6630d5f96822f63c4 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:05:55 -0700 Subject: [PATCH 04/42] test push --- src/discord-cluster-manager/run_eval.py | 1 + src/discord-cluster-manager/utils.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 2652bf62..46281bc5 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -448,6 +448,7 @@ def run_pytorch_script( # noqa: C901 start = datetime.datetime.now() args = kwargs.get("args", []) # log everything that's going on + print("Running with kwargs: %s", kwargs) print("Running with args: %s", args) print("Running with sources: %s", sources) print("Running with main: %s", main) diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 695faab9..3ce66e7b 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -248,8 +248,8 @@ def build_task_config( args = [] if mode == SubmissionMode.REFERENCE: args.append(REFERENCE_TIMING_ARG) + config["args"] = args return { - "args": args, **config, "sources": { eval_name: submission_content, @@ -274,7 +274,10 @@ def build_task_config( "ranked_timeout": task.ranked_timeout, "ranking_by": task.ranking_by.value, "seed": task.seed, + "args": [], } + if mode == SubmissionMode.REFERENCE: + common["args"].append(REFERENCE_TIMING_ARG) if task.lang == Language.Python: return { From 
729045eb102d5dbea31e7b65ccbc6e19059801b3 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:12:23 -0700 Subject: [PATCH 05/42] test push --- src/discord-cluster-manager/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 3ce66e7b..5488e556 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -248,6 +248,7 @@ def build_task_config( args = [] if mode == SubmissionMode.REFERENCE: args.append(REFERENCE_TIMING_ARG) + submission_content = "" config["args"] = args return { **config, From 72ac6860e310ec973c26cc2745ba1f4ce12d9797 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:17:25 -0700 Subject: [PATCH 06/42] test push --- src/discord-cluster-manager/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 5488e556..769debdb 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -263,7 +263,7 @@ def build_task_config( all_files[n] = submission_content else: all_files[n] = c - + print(f"all_files: {all_files}") common = { "lang": task.lang.value, "arch": arch, From 54a2418abea70fec4dcf49d5ce99b058b7879c40 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:23:53 -0700 Subject: [PATCH 07/42] test push --- src/discord-cluster-manager/run_eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 46281bc5..b3d067a0 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -219,6 +219,7 @@ def compile_cuda_script( # # noqa: C901 def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult: print("[Running]") + print("Running with args: %s", args) # set up a pipe so the tester can communicate its verdict with us env = os.environ.copy() pipe_read, pipe_write = os.pipe() @@ -460,7 +461,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
try: - compile_run = run_program(["python", "submission.py"].extend(kwargs.get("args", [])), seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, From 089445b831ba7c3d35f55d1e5919c8e54959702f Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:31:01 -0700 Subject: [PATCH 08/42] test push --- src/discord-cluster-manager/run_eval.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index b3d067a0..bfb96936 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -287,6 +287,7 @@ def run_single_evaluation( ranked_timeout: int = Timeout.RANKED, ranking_by: str = "last", seed: Optional[int] = None, + args: Optional[list[str]] = [], ) -> RunResult: """ A single runner run, either in the context of test files, or in the @@ -296,8 +297,8 @@ def run_single_evaluation( with tempfile.NamedTemporaryFile("w") as tests_file: tests_file.write(tests) tests_file.flush() - return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout) - elif mode in ["benchmark", "profile", "leaderboard"]: + return run_program(call + [mode, tests_file.name] + args, seed=seed, timeout=test_timeout) + elif mode in ["benchmark", "profile", "leaderboard", "reference"]: timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout with tempfile.NamedTemporaryFile("w") as bench_file: if ranking_by == "last": @@ -305,10 +306,10 @@ def run_single_evaluation( else: bench_file.write(benchmarks) bench_file.flush() - return run_program(call + [mode, bench_file.name], seed=seed, timeout=timeout) + return run_program(call + [mode, bench_file.name] + args, seed=seed, timeout=timeout) else: assert mode == "script" - return run_program(call, seed=seed, timeout=Timeout.SCRIPT) + return run_program(call + args, seed=seed, timeout=Timeout.SCRIPT) def make_system_info() -> SystemInfo: From b1c11523f83ca0d2cae8257dd059d586589b7aab Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:45:46 -0700 Subject: [PATCH 09/42] test push --- src/discord-cluster-manager/run_eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index bfb96936..b8b0c4a2 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -457,7 +457,8 @@ def run_pytorch_script( # noqa: C901 try: if REFERENCE_TIMING_ARG not in args: assert main in sources.keys() - _create_files(sources) + # pluck out submission.py from sources as it is not needed for the run and is None + sources.pop("submission.py") # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
From e3b49dd705dff65beac12029b1ee350b0367afae Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:55:26 -0700 Subject: [PATCH 10/42] test push --- src/discord-cluster-manager/run_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index b8b0c4a2..036654c1 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -457,7 +457,7 @@ def run_pytorch_script( # noqa: C901 try: if REFERENCE_TIMING_ARG not in args: assert main in sources.keys() - # pluck out submission.py from sources as it is not needed for the run and is None + # pluck out submission.py from sources as it is not needed for the run and is None normally sources.pop("submission.py") # "compile" step: execute the script once. Will populate From 110aea68795dc5c8075a6b86a1e3f0970773f71e Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 16:01:25 -0700 Subject: [PATCH 11/42] test push --- src/discord-cluster-manager/run_eval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 036654c1..e236d340 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -455,10 +455,11 @@ def run_pytorch_script( # noqa: C901 print("Running with sources: %s", sources) print("Running with main: %s", main) try: - if REFERENCE_TIMING_ARG not in args: - assert main in sources.keys() + if REFERENCE_TIMING_ARG in args: # pluck out submission.py from sources as it is not needed for the run and is None normally sources.pop("submission.py") + assert main in sources.keys() + _create_files(sources) # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
From 371cad80dba16e9f0225e10949a33d408cabb599 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 16:46:08 -0700 Subject: [PATCH 12/42] test push --- src/discord-cluster-manager/eval.py | 4 +- src/discord-cluster-manager/run_eval.py | 56 ++++++++++++++----------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index 6c7f81ef..b87d5eef 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -6,7 +6,6 @@ import torch from reference import check_implementation, generate_input, ref_kernel -from submission import custom_kernel from consts import REFERENCE_TIMING_ARG @@ -19,6 +18,7 @@ def log(self, key: str, value): def correctness(rng: torch.Generator) -> bool: + from submission import custom_kernel for _ in range(10): # check multiple times inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) custom_output = custom_kernel(inputs) @@ -37,7 +37,9 @@ def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: boo if time_reference_impl: logger.log("Timing Reference Implementation") else: + # in the case of a reference run we don't have a submission logger.log("Timing Submitted Custom Implementation") + from submission import custom_kernel # Warmup Code print("warming up...") diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index e236d340..86ecde5b 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -454,41 +454,47 @@ def run_pytorch_script( # noqa: C901 print("Running with args: %s", args) print("Running with sources: %s", sources) print("Running with main: %s", main) + is_reference = False + if REFERENCE_TIMING_ARG in args: + # pluck out submission.py from sources as it is not needed for the run and is None normally + sources.pop("submission.py") + is_reference = True try: - if REFERENCE_TIMING_ARG in args: - # pluck out submission.py from sources as it is not needed for the run and is None normally - sources.pop("submission.py") + assert main in sources.keys() _create_files(sources) # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
try: - compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) - if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: + if not is_reference: + compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) + if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: + comp = CompileResult( + nvcc_found=True, + nvcc_version="", + success=True, + command=compile_run.command, + stdout=compile_run.stdout, + stderr=compile_run.stderr, + exit_code=compile_run.exit_code, + ) + else: + comp = None + except subprocess.CalledProcessError as e: + # This step is purely optional, so we just go on + # if it fails comp = CompileResult( - nvcc_found=True, + nvcc_found=False, nvcc_version="", - success=True, - command=compile_run.command, - stdout=compile_run.stdout, - stderr=compile_run.stderr, - exit_code=compile_run.exit_code, + success=False, + command="python submission.py", + stdout=e.stdout, + stderr=e.stderr, + exit_code=e.returncode, ) - else: - comp = None - except subprocess.CalledProcessError as e: - # This step is purely optional, so we just go on - # if it fails - comp = CompileResult( - nvcc_found=False, - nvcc_version="", - success=False, - command="python submission.py", - stdout=e.stdout, - stderr=e.stderr, - exit_code=e.returncode, - ) + else: + comp = None run = run_single_evaluation(["python", main], **kwargs) From cb819115736148fbc2840509c1974bb86f669583 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 16:47:41 -0700 Subject: [PATCH 13/42] test push --- src/discord-cluster-manager/run_eval.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 86ecde5b..e06161a3 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -297,7 +297,7 @@ def run_single_evaluation( with tempfile.NamedTemporaryFile("w") as tests_file: tests_file.write(tests) tests_file.flush() - return run_program(call + [mode, tests_file.name] + args, seed=seed, timeout=test_timeout) + return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout) elif mode in ["benchmark", "profile", "leaderboard", "reference"]: timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout with tempfile.NamedTemporaryFile("w") as bench_file: @@ -450,24 +450,24 @@ def run_pytorch_script( # noqa: C901 start = datetime.datetime.now() args = kwargs.get("args", []) # log everything that's going on - print("Running with kwargs: %s", kwargs) - print("Running with args: %s", args) - print("Running with sources: %s", sources) - print("Running with main: %s", main) + print("Running with kwargs: %s" % kwargs) + print("Running with args: %s" % args) + print("Running with sources: %s" % sources) + print("Running with main: %s" % main) is_reference = False if REFERENCE_TIMING_ARG in args: # pluck out submission.py from sources as it is not needed for the run and is None normally - sources.pop("submission.py") + sources.pop("submission.py", None) is_reference = True try: - assert main in sources.keys() _create_files(sources) # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
- try: - if not is_reference: + comp = None + if not is_reference: + try: compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( @@ -479,8 +479,6 @@ def run_pytorch_script( # noqa: C901 stderr=compile_run.stderr, exit_code=compile_run.exit_code, ) - else: - comp = None except subprocess.CalledProcessError as e: # This step is purely optional, so we just go on # if it fails @@ -493,8 +491,6 @@ def run_pytorch_script( # noqa: C901 stderr=e.stderr, exit_code=e.returncode, ) - else: - comp = None run = run_single_evaluation(["python", main], **kwargs) From dc17b308aeae14e25609c8bdcf2124236ff4b83d Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 17:02:50 -0700 Subject: [PATCH 14/42] test push --- src/discord-cluster-manager/eval.py | 9 ++++++--- src/discord-cluster-manager/run_eval.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index b87d5eef..2945ce5c 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -32,6 +32,7 @@ def correctness(rng: torch.Generator) -> bool: def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: bool = False): + print("timing kernel") warmup_runs = 10 timed_runs = 100 if time_reference_impl: @@ -96,7 +97,7 @@ def main(): REFERENCE_TIMING_ARG, action='store_true', help='Time ref kernel.' ) args = parser.parse_args() - + print(f"starting script") try: logger = PopcornLogger(int(os.environ["POPCORN_FD"])) except Exception as e: @@ -106,12 +107,14 @@ def main(): seed = int(os.environ.get("POPCORN_FD", 42)) rng = torch.Generator() rng.manual_seed(seed) - + print(f"seed: {seed}") + print(f"time ref: {args.time_ref}") + print(f"correctness: {not args.time_ref}") if not args.time_ref: if not correctness(rng): logger.log("check", "fail") exit(112) - + metric(logger, rng, time_reference_impl=args.time_ref) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index e06161a3..56850714 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -306,7 +306,7 @@ def run_single_evaluation( else: bench_file.write(benchmarks) bench_file.flush() - return run_program(call + [mode, bench_file.name] + args, seed=seed, timeout=timeout) + return run_program(call + args + [mode, bench_file.name], seed=seed, timeout=timeout) else: assert mode == "script" return run_program(call + args, seed=seed, timeout=Timeout.SCRIPT) From c2a536716fba5d296038a3b91dee977d91a336e1 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 17:43:06 -0700 Subject: [PATCH 15/42] test push --- examples/eval.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index e414a580..5e747d31 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -17,9 +17,13 @@ except ImportError: TestSpec = dict -from reference import check_implementation, generate_input +from reference import check_implementation, generate_input, ref_kernel +# ----------------------------------------------------------------------------- +# Determine which kernel to use (reference or submission) +# ----------------------------------------------------------------------------- +MODE_REFERENCE_STRING = "reference" # Define the string to check for mode class PopcornOutput: def 
__init__(self, fd: int): self.file = os.fdopen(fd, 'w') @@ -156,7 +160,7 @@ def _run_single_test(test: TestCase): from submission import custom_kernel data = generate_input(**test.args) torch.cuda.synchronize() - submission_output = custom_kernel(_clone_data(data)) + submission_output = active_kernel(_clone_data(data)) torch.cuda.synchronize() return wrap_check_implementation(data, submission_output) @@ -198,18 +202,21 @@ def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[T return 112 -def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any: +def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_reference_run: bool) -> Stats | Any: """ Runs one benchmark. Do not call directly. """ - from submission import custom_kernel + if not is_reference_run: + # submission does not exist for a reference run + from submission import custom_kernel durations = [] # generate input data once data = generate_input(**test.args) check_copy = _clone_data(data) + active_kernel = ref_kernel if is_reference_run else custom_kernel # first, one obligatory correctness check - output = custom_kernel(data) + output = active_kernel(data) good, message = wrap_check_implementation(check_copy, output) if not good: return message @@ -229,7 +236,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t check_copy = _clone_data(data) torch.cuda.synchronize() start = time.perf_counter_ns() - output = custom_kernel(data) + output = active_kernel(data) torch.cuda.synchronize() end = time.perf_counter_ns() @@ -249,7 +256,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t return calculate_stats(durations) -def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float): +def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_reference_run: bool = False): """ For a particular test case, check correctness (if applicable) and grab runtime results. @@ -260,7 +267,7 @@ def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bo @param max_time_ns: Timeout time in nanoseconds. @return: A Stats object for this particular benchmark case or an error if the test fails. """ - return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns)) + return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns, is_reference_run)) def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]): @@ -300,13 +307,13 @@ def run_single_profile(test: TestCase) -> str: """ Runs a single test case. 
Do not call directly """ - from submission import custom_kernel from torch.profiler import profile, record_function, ProfilerActivity + from submission import custom_kernel data = generate_input(**test.args) torch.cuda.synchronize() with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: - submission_output = custom_kernel(_clone_data(data)) + submission_output = active_kernel(_clone_data(data)) torch.cuda.synchronize() return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20) @@ -345,13 +352,18 @@ def main(): if mode == "benchmark": return run_benchmarking(logger, pool, tests) - if mode == "leaderboard": + if mode == "leaderboard" or mode == "reference": + is_reference_run = mode == "reference" # warmup - run_single_benchmark(pool, tests[0], False, 100, 1e7) + run_single_benchmark(pool, tests[0], False, 100, 1e7, is_reference_run) + if is_reference_run: + logger.log("Running reference run") + else: + logger.log("Running leaderboard run") logger.log("benchmark-count", len(tests)) passed = True for i in range(len(tests)): - result = run_single_benchmark(pool, tests[i], True, 100, 30e9) + result = run_single_benchmark(pool, tests[i], True, 100, 30e9, is_reference_run) logger.log(f"benchmark.{i}.spec", tests[i].spec) if isinstance(result, Stats): for field in dataclasses.fields(Stats): From 19647d27ba91e0d40307916947d8aef6b4f9f06d Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:02:00 -0700 Subject: [PATCH 16/42] test push --- src/discord-cluster-manager/task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/discord-cluster-manager/task.py b/src/discord-cluster-manager/task.py index 3a14bc51..db3adbb8 100644 --- a/src/discord-cluster-manager/task.py +++ b/src/discord-cluster-manager/task.py @@ -64,6 +64,7 @@ class LeaderboardTask: ranking_by: RankCriterion = RankCriterion.LAST templates: dict[str, str] = dataclasses.field(default_factory=dict) seed: Optional[int] = None + @staticmethod def from_dict(data: dict): From a2404851ba680c01fde47d418c76e3eb0a60e026 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:37:42 -0700 Subject: [PATCH 17/42] test push --- .github/workflows/nvidia_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 5cf7c0a3..258d7838 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -29,7 +29,7 @@ jobs: shell: bash run: | # install jq - apt update && apt install -y jq + # apt update && apt install -y jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 7f1e04a492ca77bfbf9196780c9fd40bc78bc3dc Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:43:44 -0700 Subject: [PATCH 18/42] test push --- .github/workflows/nvidia_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 258d7838..857331b8 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -29,7 +29,7 @@ jobs: shell: bash run: | # install jq - # apt update && apt install -y jq + snap install jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 104d1c96eeb9a6cafa57052a0bb3f8f9de20f8a7 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:47:51 -0700 Subject: [PATCH 
19/42] test push --- .github/workflows/nvidia_workflow.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 857331b8..7de49097 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -29,6 +29,8 @@ jobs: shell: bash run: | # install jq + apt update + apt install snapd snap install jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 1f54c968db3b4ae011d068b778848e285eb99fb6 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:50:47 -0700 Subject: [PATCH 20/42] test push --- .github/workflows/nvidia_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 7de49097..2113cf75 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -30,7 +30,7 @@ jobs: run: | # install jq apt update - apt install snapd + apt install -y snapd snap install jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 7f8cf648eb9e16dcf850b18a6fb16c5e6f08a7c3 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:54:42 -0700 Subject: [PATCH 21/42] test push --- .github/workflows/nvidia_workflow.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 2113cf75..340a8fb9 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -30,8 +30,9 @@ jobs: run: | # install jq apt update - apt install -y snapd - snap install jq + apt-get install -y jq + # apt install -y snapd + # snap install jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 79672f0071b7e81f1e84e68e04f5324f5f5f7d9a Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 22:02:06 -0700 Subject: [PATCH 22/42] test push --- .github/workflows/nvidia_workflow.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 340a8fb9..b50c01af 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -24,13 +24,19 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.10' - + + - name: 'Setup jq' + uses: dcarbone/install-jq-action@v3 + with: + version: '1.7.1' + force: true + - name: Create input files shell: bash run: | # install jq - apt update - apt-get install -y jq + # apt update + # apt install -y jq # apt install -y snapd # snap install jq # Extract the payload content without printing it From 419fb221dc4a07191fa50ae26982d7255f6f463d Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 22:10:31 -0700 Subject: [PATCH 23/42] test push --- .github/workflows/nvidia_workflow.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index b50c01af..9b1d8feb 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -25,12 +25,6 @@ jobs: with: python-version: '3.10' - - name: 'Setup jq' - uses: dcarbone/install-jq-action@v3 - with: - version: '1.7.1' - force: true - - name: Create input files shell: bash run: | From ea8812c3a1f34c2a5689dca6de3d5264fbcca34c 
Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 22:16:54 -0700 Subject: [PATCH 24/42] test push --- .github/workflows/nvidia_workflow.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 9b1d8feb..f2171f4f 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -29,8 +29,8 @@ jobs: shell: bash run: | # install jq - # apt update - # apt install -y jq + apt update + apt install -y jq # apt install -y snapd # snap install jq # Extract the payload content without printing it From 13b2aef2299de6eef4b9306befacbd18163f0cec Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 22:56:11 -0700 Subject: [PATCH 25/42] test push --- src/discord-cluster-manager/run_eval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 56850714..4789c638 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -306,10 +306,10 @@ def run_single_evaluation( else: bench_file.write(benchmarks) bench_file.flush() - return run_program(call + args + [mode, bench_file.name], seed=seed, timeout=timeout) + return run_program(call + [mode, bench_file.name], seed=seed, timeout=timeout) else: assert mode == "script" - return run_program(call + args, seed=seed, timeout=Timeout.SCRIPT) + return run_program(call, seed=seed, timeout=Timeout.SCRIPT) def make_system_info() -> SystemInfo: @@ -468,7 +468,7 @@ def run_pytorch_script( # noqa: C901 comp = None if not is_reference: try: - compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, From 3ae64d59ed97bbbc5bf12a65c78fae5a7e2156c2 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 23:09:26 -0700 Subject: [PATCH 26/42] test push --- .github/workflows/nvidia_workflow.yml | 32 ++++++++++++++------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index f2171f4f..21ed6999 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -26,21 +26,23 @@ jobs: python-version: '3.10' - name: Create input files - shell: bash - run: | - # install jq - apt update - apt install -y jq - # apt install -y snapd - # snap install jq - # Extract the payload content without printing it - PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) - - # Apply mask to the extracted content - echo "::add-mask::$PAYLOAD" - - # Now write to file (won't be logged since it's masked) - echo "$PAYLOAD" > payload.json + uses: nick-fields/retry@v3 + with: + timeout_minutes: 2 + max_attempts: 5 + shell: bash + command: | + # install jq + apt update + apt install -y jq + # Extract the payload content without printing it + PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) + + # Apply mask to the extracted content + echo "::add-mask::$PAYLOAD" + + # Now write to file (won't be logged since it's masked) + echo "$PAYLOAD" > payload.json - name: Install uv uses: astral-sh/setup-uv@v3 From f265a377403eb9289ea51c8347887762faafb6d4 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 23:18:17 -0700 Subject: 
[PATCH 27/42] test push --- examples/eval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/eval.py b/examples/eval.py index 5e747d31..24221afb 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -337,6 +337,7 @@ def main(): return 2 mode = sys.argv[1] + print(f"Running in mode {mode}") seed = os.getenv("POPCORN_SEED") os.unsetenv("POPCORN_SEED") seed = int(seed) if seed else None @@ -379,6 +380,7 @@ def main(): run_profiling(logger, tests) else: # TODO: Implement script mode + print(f"mode {mode} not implemented") return 2 From 5b8c698a93fddc19118eed32884c0c7b59cbb63e Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 23:30:11 -0700 Subject: [PATCH 28/42] test push --- examples/eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/eval.py b/examples/eval.py index 24221afb..be778b40 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -336,7 +336,7 @@ def main(): if len(sys.argv) < 3: return 2 - mode = sys.argv[1] + mode = sys.argv[1].strip() print(f"Running in mode {mode}") seed = os.getenv("POPCORN_SEED") os.unsetenv("POPCORN_SEED") @@ -345,6 +345,7 @@ def main(): tests = get_test_cases(sys.argv[2], seed) with PopcornOutput(int(fd)) as logger: + logger.log(f"running in mode {mode}") import multiprocessing mp_context = multiprocessing.get_context('spawn') with mp_context.Pool(1) as pool: From fdb1ed6e6663ab113f6d9abf171cd365b356fd5d Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 23:47:24 -0700 Subject: [PATCH 29/42] test push --- .github/workflows/nvidia_workflow.yml | 5 ++++- examples/eval.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 21ed6999..78d950da 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -18,7 +18,10 @@ jobs: container: image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - - uses: actions/checkout@v3 + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} - name: Setup Python uses: actions/setup-python@v5 diff --git a/examples/eval.py b/examples/eval.py index be778b40..fd643516 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -334,7 +334,7 @@ def main(): return 111 if len(sys.argv) < 3: - return 2 + return 222 mode = sys.argv[1].strip() print(f"Running in mode {mode}") @@ -381,8 +381,9 @@ def main(): run_profiling(logger, tests) else: # TODO: Implement script mode + logger.log(mode, "not implemented") print(f"mode {mode} not implemented") - return 2 + return 333 if __name__ == "__main__": From 2e489447f310603da91b49bd41c78f1776ee2303 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 10:28:08 -0700 Subject: [PATCH 30/42] test push --- examples/eval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index fd643516..e5f25576 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -345,7 +345,7 @@ def main(): tests = get_test_cases(sys.argv[2], seed) with PopcornOutput(int(fd)) as logger: - logger.log(f"running in mode {mode}") + logger.log("debug message", f"running in mode {mode}") import multiprocessing mp_context = multiprocessing.get_context('spawn') with mp_context.Pool(1) as pool: @@ -359,9 +359,9 @@ def main(): # warmup run_single_benchmark(pool, tests[0], False, 100, 1e7, is_reference_run) if is_reference_run: - logger.log("Running reference run") + logger.log("debug message", 
"Running reference run") else: - logger.log("Running leaderboard run") + logger.log("debug message", "Running leaderboard run") logger.log("benchmark-count", len(tests)) passed = True for i in range(len(tests)): From d78119caff5e82f3a10e2f5fd085ddf0b50a4deb Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 10:41:35 -0700 Subject: [PATCH 31/42] test push --- examples/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/eval.py b/examples/eval.py index e5f25576..aa77e3ae 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -241,7 +241,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t end = time.perf_counter_ns() if recheck: - good, message = check_implementation(check_copy, output) + good, message = wrap_check_implementation(check_copy, output) if not good: return message From 474d9c413979806b253b917dcfbc88052c1a71cc Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 11:54:12 -0700 Subject: [PATCH 32/42] test push --- src/discord-cluster-manager/cogs/leaderboard_cog.py | 5 +++-- src/discord-cluster-manager/cogs/submit_cog.py | 2 +- src/discord-cluster-manager/report.py | 13 +++++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index 73a5f285..4c2e00ac 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -182,7 +182,7 @@ async def on_submit_hook( # noqa: C901 with self.bot.leaderboard_db as db: db.mark_submission_done(sub_id) - if mode == SubmissionMode.LEADERBOARD: + if mode == SubmissionMode.LEADERBOARD or mode == SubmissionMode.REFERENCE: await self.post_submit_hook(interaction, sub_id) return sub_id @@ -493,7 +493,8 @@ async def _display_lb_submissions_helper( processed_submissions = [ { "Rank": submission["rank"], - "User": await get_user_from_id(self.bot, submission["user_id"]), + # "User": await get_user_from_id(self.bot, submission["user_id"]), + "User": submission["user_id"], "Score": f"{format_time(float(submission['submission_score']) * 1e9)}", "Submission Name": submission["submission_name"], } diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index 0657641f..03f51d32 100644 --- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -221,7 +221,7 @@ async def _handle_submission( await reporter.update_title(reporter.title + " ✅ success") short_report = make_short_report( - result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD] + result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD, SubmissionMode.REFERENCE] ) await reporter.push(short_report) if mode != SubmissionMode.PRIVATE: diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index 4e09c2c5..b7901214 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -160,6 +160,7 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n Creates a minimalistic report for `runs`, returned as a list of status strings """ + any_compile = False result = [] for r in runs.values(): @@ -218,6 +219,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n result.append("✅ Leaderboard run successful") elif full: result.append("❌ Leaderboard missing") + + if 
"reference" in runs: + ref_run = runs["reference"].run + if not ref_run.success: + result.append("❌ Running reference failed" + _short_fail_reason(ref_run)) + elif not ref_run.passed: + result.append("❌ Reference run failed") + else: + result.append("✅ Reference run successful") + elif full: + result.append("❌ Reference missing") + return result From 6fda9308127e127f65e252d23c60080518a9c385 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 14:02:55 -0700 Subject: [PATCH 33/42] it works now cleanup --- .../cogs/leaderboard_cog.py | 6 ++--- .../cogs/submit_cog.py | 24 ++++++++++++------- src/discord-cluster-manager/leaderboard_db.py | 5 ++++ src/discord-cluster-manager/report.py | 2 -- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index 4c2e00ac..df234134 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -221,16 +221,16 @@ def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): with self.bot.leaderboard_db as db: sub_data: SubmissionItem = db.get_submission_by_id(sub_id) - + print(f"sub_data: {sub_data}") result_lines = [] for run in sub_data["runs"]: if ( not run["secret"] - and run["mode"] == SubmissionMode.LEADERBOARD.value + and (run["mode"] == SubmissionMode.LEADERBOARD.value or run["mode"] == SubmissionMode.REFERENCE.value) and run["passed"] ): result_lines.append(self.generate_run_verdict(run, sub_data)) - + print(f"result_lines: {result_lines}") if len(result_lines) > 0: await send_discord_message( interaction, diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index 03f51d32..af26077b 100644 --- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -104,12 +104,21 @@ async def submit_leaderboard( # noqa: C901 if result.success: score = None if ( - "leaderboard" in result.runs + ("leaderboard" in result.runs and result.runs["leaderboard"].run.success - and result.runs["leaderboard"].run.passed + and result.runs["leaderboard"].run.passed) + or ("reference" in result.runs + and result.runs["reference"].run.success + and result.runs["reference"].run.passed) ): + if "leaderboard" in result.runs: + key = "leaderboard" + elif "reference" in result.runs: + key = "reference" + else: + raise KernelBotError("Leaderboard or reference run failed") score = 0.0 - num_benchmarks = int(result.runs["leaderboard"].run.result["benchmark-count"]) + num_benchmarks = int(result.runs[key].run.result["benchmark-count"]) if task.ranking_by == RankCriterion.LAST: if num_benchmarks != 1: logger.error( @@ -122,19 +131,18 @@ async def submit_leaderboard( # noqa: C901 f"Expected submission to have exactly one benchmark," f"got {num_benchmarks}." 
) - score = float(result.runs["leaderboard"].run.result["benchmark.0.mean"]) / 1e9 + score = float(result.runs[key].run.result["benchmark.0.mean"]) / 1e9 else: scores = [] for i in range(num_benchmarks): scores.append( - float(result.runs["leaderboard"].run.result[f"benchmark.{i}.mean"]) - / 1e9 + float(result.runs[key].run.result[f"benchmark.{i}.mean"]) / 1e9 ) if task.ranking_by == RankCriterion.MEAN: score = sum(scores) / len(scores) elif task.ranking_by == RankCriterion.GEOM: score = math.pow(math.prod(scores), 1.0 / num_benchmarks) - + print(f"\nScore: {score}\n") # verifyruns uses a fake submission id of -1 if submission_id != -1: with self.bot.leaderboard_db as db: @@ -145,7 +153,7 @@ async def submit_leaderboard( # noqa: C901 value.end, mode=key, runner=gpu_type.name, - score=None if key != "leaderboard" else score, + score=None if (key != "leaderboard" and key != "reference") else score, secret=mode == SubmissionMode.PRIVATE, compilation=value.compilation, result=value.run, diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index 3b979e69..e570beec 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -345,6 +345,11 @@ def create_submission_run( result: RunResult, system: SystemInfo, ): + print(f"\n\nCreating run for submission {submission} with mode {mode} and runner {runner}\n\n") + print(f"Result: {result}\n") + print(f"System: {system}\n") + print(f"Compilation: {compilation}\n") + print(f"Score: {score}\n") try: if compilation is not None: compilation = json.dumps(dataclasses.asdict(compilation)) diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index b7901214..5572c95c 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -228,8 +228,6 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n result.append("❌ Reference run failed") else: result.append("✅ Reference run successful") - elif full: - result.append("❌ Reference missing") return result From 89679562cf637397cfdbd15d8a0f96dcf052f8f3 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 16:04:13 -0700 Subject: [PATCH 34/42] cleanup --- examples/eval.py | 5 -- src/discord-cluster-manager/cogs/admin_cog.py | 64 ++++++++++++++++++- .../cogs/leaderboard_cog.py | 33 +--------- .../cogs/submit_cog.py | 1 - src/discord-cluster-manager/leaderboard_db.py | 18 ++---- src/discord-cluster-manager/utils.py | 1 - 6 files changed, 68 insertions(+), 54 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index aa77e3ae..a66fb45b 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -345,7 +345,6 @@ def main(): tests = get_test_cases(sys.argv[2], seed) with PopcornOutput(int(fd)) as logger: - logger.log("debug message", f"running in mode {mode}") import multiprocessing mp_context = multiprocessing.get_context('spawn') with mp_context.Pool(1) as pool: @@ -358,10 +357,6 @@ def main(): is_reference_run = mode == "reference" # warmup run_single_benchmark(pool, tests[0], False, 100, 1e7, is_reference_run) - if is_reference_run: - logger.log("debug message", "Running reference run") - else: - logger.log("debug message", "Running leaderboard run") logger.log("benchmark-count", len(tests)) passed = True for i in range(len(tests)): diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py index 8e39ee2f..7fc480bd 100644 --- 
a/src/discord-cluster-manager/cogs/admin_cog.py +++ b/src/discord-cluster-manager/cogs/admin_cog.py @@ -1,6 +1,7 @@ import json import subprocess import tempfile +import yaml from datetime import datetime, timedelta, timezone from decimal import Decimal from io import StringIO @@ -9,8 +10,7 @@ import discord import env -import yaml -from consts import GitHubGPU, ModalGPU +from consts import GitHubGPU, ModalGPU, SubmissionMode from discord import app_commands from discord.ext import commands, tasks from leaderboard_db import leaderboard_name_autocomplete @@ -24,6 +24,7 @@ setup_logging, with_error_handling, ) +from cogs.leaderboard_cog import LeaderboardSubmitCog if TYPE_CHECKING: from ..bot import ClusterBot @@ -120,6 +121,10 @@ def __init__(self, bot: "ClusterBot"): name="set-forum-ids", description="Sets forum IDs" )(self.set_forum_ids) + self.reference_run = bot.admin_group.command( + name="reference-run", description="Create a reference run for a leaderboard" + )(self.reference_run) + self._scheduled_cleanup_temp_users.start() # -------------------------------------------------------------------------- @@ -1025,3 +1030,58 @@ async def set_forum_ids(self, interaction: discord.Interaction): error_message = f"Error updating forum ids: {str(e)}" logger.error(error_message, exc_info=True) await send_discord_message(interaction, error_message, ephemeral=True) + + # ---------------------------------------------------------------------- + # Reference run submission (admin only) + # ---------------------------------------------------------------------- + @discord.app_commands.describe( + leaderboard_name="Name of the leaderboard to create a reference run for", + gpu="GPU(s) to use; leave empty for interactive selection", + force="Create another reference run even if one already exists.", + ) + @discord.app_commands.autocomplete( + leaderboard_name=leaderboard_name_autocomplete, + ) + @with_error_handling + async def reference_run( + self, + interaction: discord.Interaction, + leaderboard_name: str, + gpu: Optional[str] = None, + force: bool = False, + ): + """Admin command to create (or force-create) a reference run.""" + + # Ensure caller is admin + is_admin = await self.admin_check(interaction) + if not is_admin: + await send_discord_message( + interaction, + "You need Admin permissions to run this command.", + ephemeral=True, + ) + return + + # Check for existing reference run unless forcing + if not force: + with self.bot.leaderboard_db as db: + if db.has_reference_run(leaderboard_name): + await send_discord_message( + interaction, + ( + "A reference run already exists for this leaderboard. " + "Use the 'force' flag to create another." 
+ ), + ephemeral=True, + ) + return + + lb_cog = LeaderboardSubmitCog(self.bot) + + await lb_cog.submit( + interaction=interaction, + leaderboard_name=leaderboard_name, + script=None, + mode=SubmissionMode.REFERENCE, + gpu=gpu, + ) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index df234134..d4a9b49a 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -221,7 +221,6 @@ def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): with self.bot.leaderboard_db as db: sub_data: SubmissionItem = db.get_submission_by_id(sub_id) - print(f"sub_data: {sub_data}") result_lines = [] for run in sub_data["runs"]: if ( @@ -230,7 +229,6 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): and run["passed"] ): result_lines.append(self.generate_run_verdict(run, sub_data)) - print(f"result_lines: {result_lines}") if len(result_lines) > 0: await send_discord_message( interaction, @@ -355,34 +353,6 @@ async def submit_ranked( interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu ) - @app_commands.command(name="reference_run", description="Create a reference run for a leaderboard") - @app_commands.describe( - leaderboard_name="Name of the leaderboard to create a reference run for", - gpu="Select GPU. Leave empty for interactive selection.", - ) - @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) - @with_error_handling - async def submit_reference( - self, - interaction: discord.Interaction, - leaderboard_name: str, - gpu: Optional[str] = None, - ): - # Check if reference run already exists - # with self.bot.leaderboard_db as db: - # if db.has_reference_run(leaderboard_name): - # await send_discord_message( - # interaction, - # f"A reference run for leaderboard '{leaderboard_name}' already exists.", - # ephemeral=True, - # ) - # return - # Process as a special submission - return await self.submit( - interaction, leaderboard_name, None, mode=SubmissionMode.REFERENCE, gpu=gpu - ) - - async def lang_autocomplete( interaction: discord.Interaction, current: str, @@ -493,8 +463,7 @@ async def _display_lb_submissions_helper( processed_submissions = [ { "Rank": submission["rank"], - # "User": await get_user_from_id(self.bot, submission["user_id"]), - "User": submission["user_id"], + "User": await get_user_from_id(self.bot, submission["user_id"]), "Score": f"{format_time(float(submission['submission_score']) * 1e9)}", "Submission Name": submission["submission_name"], } diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index af26077b..d689fea0 100644 --- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -142,7 +142,6 @@ async def submit_leaderboard( # noqa: C901 score = sum(scores) / len(scores) elif task.ranking_by == RankCriterion.GEOM: score = math.pow(math.prod(scores), 1.0 / num_benchmarks) - print(f"\nScore: {score}\n") # verifyruns uses a fake submission id of -1 if submission_id != -1: with self.bot.leaderboard_db as db: diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index e570beec..d3f43b87 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -298,15 +298,12 @@ def 
has_reference_run(self, leaderboard_name: str) -> bool: try: self.cursor.execute( """ - SELECT COUNT(*) FROM leaderboard.runs - WHERE leaderboard.runs.leaderboard_id = ( - SELECT leaderboard.leaderboard.id - FROM leaderboard.leaderboard - WHERE leaderboard.leaderboard.name = %s - ) - AND leaderboard.runs.user_id = %s; + SELECT COUNT(*) FROM leaderboard.runs r + JOIN leaderboard.submission s ON r.submission_id = s.id + JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id + WHERE l.name = %s AND s.user_id = %s """, - (leaderboard_name, REFERENCE_USER_ID), + (leaderboard_name, str(REFERENCE_USER_ID)), ) return self.cursor.fetchone()[0] > 0 except psycopg2.Error as e: @@ -345,11 +342,6 @@ def create_submission_run( result: RunResult, system: SystemInfo, ): - print(f"\n\nCreating run for submission {submission} with mode {mode} and runner {runner}\n\n") - print(f"Result: {result}\n") - print(f"System: {system}\n") - print(f"Compilation: {compilation}\n") - print(f"Score: {score}\n") try: if compilation is not None: compilation = json.dumps(dataclasses.asdict(compilation)) diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 769debdb..71b4f079 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -263,7 +263,6 @@ def build_task_config( all_files[n] = submission_content else: all_files[n] = c - print(f"all_files: {all_files}") common = { "lang": task.lang.value, "arch": arch, From 4bc6e8d7bdbe00d5551e75f70e34f3348f4f6fe0 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 19:12:57 -0700 Subject: [PATCH 35/42] test push --- examples/eval.py | 20 +++---- src/discord-cluster-manager/cogs/admin_cog.py | 28 ++++----- .../cogs/leaderboard_cog.py | 60 +++++-------------- .../cogs/submit_cog.py | 16 ++--- src/discord-cluster-manager/consts.py | 17 ++++-- src/discord-cluster-manager/eval.py | 59 +++++------------- src/discord-cluster-manager/leaderboard_db.py | 8 +-- src/discord-cluster-manager/report.py | 10 ++-- src/discord-cluster-manager/run_eval.py | 30 ++++------ src/discord-cluster-manager/submission.py | 4 +- src/discord-cluster-manager/utils.py | 8 +-- 11 files changed, 99 insertions(+), 161 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index a66fb45b..f80bfcfa 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -21,9 +21,9 @@ # ----------------------------------------------------------------------------- -# Determine which kernel to use (reference or submission) +# Determine which kernel to use (baseline or submission) # ----------------------------------------------------------------------------- -MODE_REFERENCE_STRING = "reference" # Define the string to check for mode +MODE_BASELINE_STRING = "baseline" # Define the string to check for mode class PopcornOutput: def __init__(self, fd: int): self.file = os.fdopen(fd, 'w') @@ -202,11 +202,11 @@ def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[T return 112 -def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_reference_run: bool) -> Stats | Any: +def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_baseline_run: bool) -> Stats | Any: """ Runs one benchmark. Do not call directly. 
""" - if not is_reference_run: + if not is_baseline_run: # submission does not exist for a reference run from submission import custom_kernel @@ -214,7 +214,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t # generate input data once data = generate_input(**test.args) check_copy = _clone_data(data) - active_kernel = ref_kernel if is_reference_run else custom_kernel + active_kernel = ref_kernel if is_baseline_run else custom_kernel # first, one obligatory correctness check output = active_kernel(data) good, message = wrap_check_implementation(check_copy, output) @@ -256,7 +256,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t return calculate_stats(durations) -def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_reference_run: bool = False): +def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_baseline_run: bool = False): """ For a particular test case, check correctness (if applicable) and grab runtime results. @@ -267,7 +267,7 @@ def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bo @param max_time_ns: Timeout time in nanoseconds. @return: A Stats object for this particular benchmark case or an error if the test fails. """ - return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns, is_reference_run)) + return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns, is_baseline_run)) def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]): @@ -354,13 +354,13 @@ def main(): return run_benchmarking(logger, pool, tests) if mode == "leaderboard" or mode == "reference": - is_reference_run = mode == "reference" + is_baseline_run = mode == "reference" # warmup - run_single_benchmark(pool, tests[0], False, 100, 1e7, is_reference_run) + run_single_benchmark(pool, tests[0], False, 100, 1e7, is_baseline_run) logger.log("benchmark-count", len(tests)) passed = True for i in range(len(tests)): - result = run_single_benchmark(pool, tests[i], True, 100, 30e9, is_reference_run) + result = run_single_benchmark(pool, tests[i], True, 100, 30e9, is_baseline_run) logger.log(f"benchmark.{i}.spec", tests[i].spec) if isinstance(result, Stats): for field in dataclasses.fields(Stats): diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py index 7fc480bd..a20c9bb3 100644 --- a/src/discord-cluster-manager/cogs/admin_cog.py +++ b/src/discord-cluster-manager/cogs/admin_cog.py @@ -1,7 +1,6 @@ import json import subprocess import tempfile -import yaml from datetime import datetime, timedelta, timezone from decimal import Decimal from io import StringIO @@ -10,6 +9,8 @@ import discord import env +import yaml +from cogs.leaderboard_cog import LeaderboardSubmitCog from consts import GitHubGPU, ModalGPU, SubmissionMode from discord import app_commands from discord.ext import commands, tasks @@ -24,7 +25,6 @@ setup_logging, with_error_handling, ) -from cogs.leaderboard_cog import LeaderboardSubmitCog if TYPE_CHECKING: from ..bot import ClusterBot @@ -121,9 +121,9 @@ def __init__(self, bot: "ClusterBot"): name="set-forum-ids", description="Sets forum IDs" )(self.set_forum_ids) - self.reference_run = bot.admin_group.command( - name="reference-run", description="Create a reference run for a leaderboard" - )(self.reference_run) + self.baseline_run = 
bot.admin_group.command( + name="baseline-run", description="Create a baseline run for a leaderboard" + )(self.baseline_run) self._scheduled_cleanup_temp_users.start() @@ -1032,25 +1032,25 @@ async def set_forum_ids(self, interaction: discord.Interaction): await send_discord_message(interaction, error_message, ephemeral=True) # ---------------------------------------------------------------------- - # Reference run submission (admin only) + # Baseline run submission (admin only) # ---------------------------------------------------------------------- @discord.app_commands.describe( - leaderboard_name="Name of the leaderboard to create a reference run for", + leaderboard_name="Name of the leaderboard to create a baseline run for", gpu="GPU(s) to use; leave empty for interactive selection", - force="Create another reference run even if one already exists.", + force="Create another baseline run even if one already exists.", ) @discord.app_commands.autocomplete( leaderboard_name=leaderboard_name_autocomplete, ) @with_error_handling - async def reference_run( + async def baseline_run( self, interaction: discord.Interaction, leaderboard_name: str, gpu: Optional[str] = None, force: bool = False, ): - """Admin command to create (or force-create) a reference run.""" + """Admin command to create (or force-create) a baseline run.""" # Ensure caller is admin is_admin = await self.admin_check(interaction) @@ -1062,14 +1062,14 @@ async def reference_run( ) return - # Check for existing reference run unless forcing + # Check for existing baseline run unless forcing if not force: with self.bot.leaderboard_db as db: - if db.has_reference_run(leaderboard_name): + if db.has_baseline_run(leaderboard_name): await send_discord_message( interaction, ( - "A reference run already exists for this leaderboard. " + "A baseline run already exists for this leaderboard. " "Use the 'force' flag to create another." ), ephemeral=True, @@ -1082,6 +1082,6 @@ async def reference_run( interaction=interaction, leaderboard_name=leaderboard_name, script=None, - mode=SubmissionMode.REFERENCE, + mode=SubmissionMode.BASELINE, gpu=gpu, ) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index d4a9b49a..9171efc6 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -7,6 +7,8 @@ from consts import ( SubmissionMode, get_gpu_by_name, + BASELINE_USER_ID, + BASELINE_USER, ) from discord import app_commands from discord.ext import commands @@ -27,8 +29,6 @@ with_error_handling, ) -from consts import REFERENCE_USER_ID, REFERENCE_USER - if TYPE_CHECKING: from ..bot import ClusterBot @@ -74,10 +74,10 @@ async def on_submit_hook( # noqa: C901 """ if script is None: - if mode != SubmissionMode.REFERENCE: + if mode != SubmissionMode.BASELINE and not script: await send_discord_message( interaction, - "Script attachment is required for this unless submission mode is reference", + "Script attachment is required for this unless submission mode is baseline", ephemeral=True, ) return -1 @@ -94,12 +94,12 @@ async def on_submit_hook( # noqa: C901 interaction, "Could not decode your file. 
Is it UTF-8?", ephemeral=True ) return -1 - if mode == SubmissionMode.REFERENCE: - # create fake reference submission + if mode == SubmissionMode.BASELINE: + # create fake baseline submission file_name = None submission_content = None - user_id = REFERENCE_USER_ID - user_name = REFERENCE_USER + user_id = BASELINE_USER_ID + user_name = BASELINE_USER else: file_name = script.filename submission_content = submission_content @@ -140,8 +140,8 @@ async def on_submit_hook( # noqa: C901 time=datetime.now(), user_name=user_name, ) - if mode == SubmissionMode.REFERENCE: - run_msg = f"Submission **{sub_id}**: is a reference submission for `{req.leaderboard}`" + if mode == SubmissionMode.BASELINE: + run_msg = f"Submission **{sub_id}**: is a baseline submission for `{req.leaderboard}`" else: run_msg = f"Submission **{sub_id}**: `{file_name}` for `{req.leaderboard}`" @@ -182,42 +182,10 @@ async def on_submit_hook( # noqa: C901 with self.bot.leaderboard_db as db: db.mark_submission_done(sub_id) - if mode == SubmissionMode.LEADERBOARD or mode == SubmissionMode.REFERENCE: + if mode == SubmissionMode.LEADERBOARD or mode == SubmissionMode.BASELINE: await self.post_submit_hook(interaction, sub_id) return sub_id - def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): - medals = {1: "🥇 First", 2: "🥈 Second", 3: "🥉 Third"} - - # get the competition - with self.bot.leaderboard_db as db: - competition = db.get_leaderboard_submissions( - sub_data["leaderboard_name"], run["runner"] - ) - # compare against the competition - other_by_user = False - run_time = float(run["score"]) - score_text = format_time(run_time * 1e9) - - for entry in competition: - # can we find our own run? Only if it is the fastest submission by this user - if entry["submission_id"] == sub_data["submission_id"]: - rank = entry["rank"] - if 1 <= rank <= 3: - return f"> {medals[rank]} place on {run['runner']}: {score_text}" - elif rank <= 10: - return f"> {rank}th place on {run['runner']}: {score_text}" - else: - return f"> Personal best on {run['runner']}: {score_text}" - elif entry["user_id"] == sub_data["user_id"]: - other_by_user = True - if other_by_user: - # User already has a submission that is faster - return f"> Successful on {run['runner']}: {score_text}" - else: - # no submission by the user exists - return f"> 🍾 First successful submission on {run['runner']}: {score_text}" - async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): with self.bot.leaderboard_db as db: sub_data: SubmissionItem = db.get_submission_by_id(sub_id) @@ -225,7 +193,7 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): for run in sub_data["runs"]: if ( not run["secret"] - and (run["mode"] == SubmissionMode.LEADERBOARD.value or run["mode"] == SubmissionMode.REFERENCE.value) + and (run["mode"] == SubmissionMode.LEADERBOARD.value or run["mode"] == SubmissionMode.BASELINE.value) and run["passed"] ): result_lines.append(self.generate_run_verdict(run, sub_data)) @@ -254,10 +222,10 @@ async def submit( gpu: Optional[str], ): - if not mode == SubmissionMode.REFERENCE and not script: + if not mode == SubmissionMode.BASELINE and not script: await send_discord_message( interaction, - "Script attachment is required for this unless submission mode is reference", + "Script attachment is required for this unless submission mode is baseline", ephemeral=True, ) return diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index d689fea0..0ac10e2a 100644 
--- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -107,16 +107,16 @@ async def submit_leaderboard( # noqa: C901 ("leaderboard" in result.runs and result.runs["leaderboard"].run.success and result.runs["leaderboard"].run.passed) - or ("reference" in result.runs - and result.runs["reference"].run.success - and result.runs["reference"].run.passed) + or ("baseline" in result.runs + and result.runs["baseline"].run.success + and result.runs["baseline"].run.passed) ): if "leaderboard" in result.runs: key = "leaderboard" - elif "reference" in result.runs: - key = "reference" + elif "baseline" in result.runs: + key = "baseline" else: - raise KernelBotError("Leaderboard or reference run failed") + raise KernelBotError("Leaderboard or baseline run failed") score = 0.0 num_benchmarks = int(result.runs[key].run.result["benchmark-count"]) if task.ranking_by == RankCriterion.LAST: @@ -152,7 +152,7 @@ async def submit_leaderboard( # noqa: C901 value.end, mode=key, runner=gpu_type.name, - score=None if (key != "leaderboard" and key != "reference") else score, + score=None if (key != "leaderboard" and key != "baseline") else score, secret=mode == SubmissionMode.PRIVATE, compilation=value.compilation, result=value.run, @@ -228,7 +228,7 @@ async def _handle_submission( await reporter.update_title(reporter.title + " ✅ success") short_report = make_short_report( - result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD, SubmissionMode.REFERENCE] + result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD] ) await reporter.push(short_report) if mode != SubmissionMode.PRIVATE: diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index cf50fdf2..d0934dd9 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -80,7 +80,7 @@ class SubmissionMode(Enum): """ Different types of submission that can be made: Test: Run tests and give detailed results about passed/failed tests. These have short timeouts. - Benchmark: Run larger benchmarks. Each benchmark is tested once, and then run multiple times. + Benchmark: Run larger benchmarks. Each benchmark is tested once, then run multiple times. Profile: Gather profiling information. One selected benchmark is run under the profiler. No testing is performed in this mode (sometimes, you need to profile deliberately broken code) Leaderboard: Official submission to the leaderboard. 
This first runs public tests, then a @@ -97,7 +97,10 @@ class SubmissionMode(Enum): LEADERBOARD = "leaderboard" PRIVATE = "private" SCRIPT = "script" - REFERENCE = "reference" + BASELINE = "baseline" + + # Alias for backward compatibility; to be removed in future release + REFERENCE = "baseline" class Language(Enum): @@ -159,6 +162,10 @@ class RankCriterion(Enum): torch """ -REFERENCE_USER = "REFERENCE_USER" -REFERENCE_USER_ID = -123 -REFERENCE_TIMING_ARG = "--reference-timing" \ No newline at end of file +# Constants used for baseline runs +BASELINE_USER = "BASELINE_USER" +BASELINE_USER_ID = -123 + +# Aliases for backward compatibility (to be removed in future release) +REFERENCE_USER = BASELINE_USER +REFERENCE_USER_ID = BASELINE_USER_ID \ No newline at end of file diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index 2945ce5c..c36de918 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -1,4 +1,3 @@ -import argparse import math import os import sys @@ -6,7 +5,7 @@ import torch from reference import check_implementation, generate_input, ref_kernel -from consts import REFERENCE_TIMING_ARG +from submission import custom_kernel class PopcornLogger: @@ -18,7 +17,6 @@ def log(self, key: str, value): def correctness(rng: torch.Generator) -> bool: - from submission import custom_kernel for _ in range(10): # check multiple times inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) custom_output = custom_kernel(inputs) @@ -31,25 +29,15 @@ def correctness(rng: torch.Generator) -> bool: return True -def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: bool = False): - print("timing kernel") +def metric(logger: PopcornLogger, rng: torch.Generator): warmup_runs = 10 timed_runs = 100 - if time_reference_impl: - logger.log("Timing Reference Implementation") - else: - # in the case of a reference run we don't have a submission - logger.log("Timing Submitted Custom Implementation") - from submission import custom_kernel # Warmup Code print("warming up...") for _ in range(warmup_runs): inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) - if time_reference_impl: - _ = ref_kernel(inputs) - else: - _ = custom_kernel(inputs) + _ = custom_kernel(inputs) torch.cuda.synchronize() # Timing Code @@ -59,20 +47,16 @@ def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: boo inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) start_time = time.time() - if time_reference_impl: - ref_output = ref_kernel(inputs) - else: - custom_output = custom_kernel(inputs) + custom_output = custom_kernel(inputs) torch.cuda.synchronize() end_time = time.time() times.append(end_time - start_time) - if not time_reference_impl: - ref_output = ref_kernel(inputs) - torch.cuda.synchronize() - if not check_implementation(custom_output, ref_output): - logger.log("check", "fail") - exit(112) + ref_output = ref_kernel(inputs) + torch.cuda.synchronize() + if not check_implementation(custom_output, ref_output): + logger.log("check", "fail") + exit(112) total_time = sum(times) average_duration = total_time / timed_runs @@ -87,17 +71,10 @@ def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: boo logger.log("duration.best", min(times) * 1e9) logger.log("duration.worst", max(times) * 1e9) - kernel_name = "Reference" if time_reference_impl else "Submitted" - print(f"{kernel_name} kernel runtime: {average_duration:.4f} ± 
{standard_error:.4} seconds") + print(f"Submitted kernel runtime: {average_duration:.4f} ± {standard_error:.4} seconds") def main(): - parser = argparse.ArgumentParser(description='Evaluate kernel implementation.') - parser.add_argument( - REFERENCE_TIMING_ARG, action='store_true', help='Time ref kernel.' - ) - args = parser.parse_args() - print(f"starting script") try: logger = PopcornLogger(int(os.environ["POPCORN_FD"])) except Exception as e: @@ -107,16 +84,12 @@ def main(): seed = int(os.environ.get("POPCORN_FD", 42)) rng = torch.Generator() rng.manual_seed(seed) - print(f"seed: {seed}") - print(f"time ref: {args.time_ref}") - print(f"correctness: {not args.time_ref}") - if not args.time_ref: - if not correctness(rng): - logger.log("check", "fail") - exit(112) - - metric(logger, rng, time_reference_impl=args.time_ref) + + if not correctness(rng): + logger.log("check", "fail") + exit(112) + metric(logger, rng) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index d3f43b87..e327afab 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -27,7 +27,7 @@ setup_logging, ) -from consts import REFERENCE_USER_ID, REFERENCE_USER +from consts import BASELINE_USER_ID, BASELINE_USER leaderboard_name_cache = LRUCache(max_size=512) @@ -215,7 +215,7 @@ def create_submission( time: datetime.datetime, user_name: str = None, ) -> Optional[int]: - if user_id == REFERENCE_USER_ID and user_name == REFERENCE_USER: + if user_id == BASELINE_USER_ID and user_name == BASELINE_USER: # todo: add reference code to the database code = "" file_name = "reference.py" @@ -294,7 +294,7 @@ def create_submission( self.connection.rollback() # Ensure rollback if error occurs raise KernelBotError("Error during creation of submission") from e - def has_reference_run(self, leaderboard_name: str) -> bool: + def has_baseline_run(self, leaderboard_name: str) -> bool: try: self.cursor.execute( """ @@ -303,7 +303,7 @@ def has_reference_run(self, leaderboard_name: str) -> bool: JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id WHERE l.name = %s AND s.user_id = %s """, - (leaderboard_name, str(REFERENCE_USER_ID)), + (leaderboard_name, str(BASELINE_USER_ID)), ) return self.cursor.fetchone()[0] > 0 except psycopg2.Error as e: diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index 5572c95c..fb43d31f 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -220,14 +220,14 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n elif full: result.append("❌ Leaderboard missing") - if "reference" in runs: - ref_run = runs["reference"].run + if "baseline" in runs: + ref_run = runs["baseline"].run if not ref_run.success: - result.append("❌ Running reference failed" + _short_fail_reason(ref_run)) + result.append("❌ Running baseline failed" + _short_fail_reason(ref_run)) elif not ref_run.passed: - result.append("❌ Reference run failed") + result.append("❌ Baseline run failed") else: - result.append("✅ Reference run successful") + result.append("✅ Baseline run successful") return result diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 4789c638..7614d0bc 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -10,7 +10,7 @@ from types import 
NoneType from typing import Optional, Protocol, Union -from consts import CUDA_FLAGS, ExitCode, REFERENCE_TIMING_ARG, Timeout +from consts import CUDA_FLAGS, ExitCode, Timeout @dataclasses.dataclass @@ -218,8 +218,6 @@ def compile_cuda_script( # # noqa: C901 def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult: - print("[Running]") - print("Running with args: %s", args) # set up a pipe so the tester can communicate its verdict with us env = os.environ.copy() pipe_read, pipe_write = os.pipe() @@ -434,6 +432,7 @@ def run_cuda_script( # # noqa: C901 def run_pytorch_script( # noqa: C901 sources: dict[str, str], main: str, + is_baseline: bool = False, **kwargs, ) -> EvalResult: """ @@ -448,17 +447,6 @@ def run_pytorch_script( # noqa: C901 RunResult """ start = datetime.datetime.now() - args = kwargs.get("args", []) - # log everything that's going on - print("Running with kwargs: %s" % kwargs) - print("Running with args: %s" % args) - print("Running with sources: %s" % sources) - print("Running with main: %s" % main) - is_reference = False - if REFERENCE_TIMING_ARG in args: - # pluck out submission.py from sources as it is not needed for the run and is None normally - sources.pop("submission.py", None) - is_reference = True try: assert main in sources.keys() _create_files(sources) @@ -466,7 +454,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. comp = None - if not is_reference: + if not is_baseline: try: compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: @@ -522,7 +510,7 @@ def run_evaluation( require multiple runner calls. """ results: dict[str, EvalResult] = {} - if mode in ["test", "benchmark", "profile", "script", "reference"]: + if mode in ["test", "benchmark", "profile", "script", "baseline"]: results[mode] = call(mode=mode) elif mode in ["private", "leaderboard"]: # first, run the tests @@ -539,7 +527,7 @@ def run_evaluation( # if they pass, run the leaderboard validation results["leaderboard"] = call(mode="leaderboard") else: - raise AssertionError("Invalid mode") + raise AssertionError(f"Invalid mode: {mode}") return results @@ -555,6 +543,12 @@ def build_test_string(tests: list[dict]): def run_config(config: dict): + mode = config["mode"] + is_baseline = False + if mode == "baseline": + config["sources"].pop("submission.py", None) + is_baseline = True + common_args = { "tests": build_test_string(config.get("tests", [])), "benchmarks": build_test_string(config.get("benchmarks", [])), @@ -563,13 +557,13 @@ def run_config(config: dict): "ranked_timeout": config.get("ranked_timeout", Timeout.RANKED), "benchmark_timeout": config.get("benchmark_timeout", Timeout.BENCHMARK), "test_timeout": config.get("test_timeout", Timeout.TEST), - "args": config.get("args", []), } if config["lang"] == "py": runner = functools.partial( run_pytorch_script, sources=config["sources"], main=config["main"], + is_baseline=is_baseline, **common_args, ) elif config["lang"] == "cu": diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 19c0c6a1..090f7dee 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -29,10 +29,10 @@ class ProcessedSubmissionRequest(SubmissionRequest): def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB) -> ProcessedSubmissionRequest: # Detect 
reference submissions (no file name & no code provided) # A reference submission is identified by missing/empty code content (no user file) - is_reference_submission = not req.code + is_baseline_submission = not req.code # Perform filename/content related checks only for *non* reference submissions - if not is_reference_submission: + if not is_baseline_submission: if profanity.contains_profanity(req.file_name): raise KernelBotError("Please provide a non rude filename") diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 71b4f079..d63a44e1 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, List, NotRequired, Optional, TypedDict import discord -from consts import Language, SubmissionMode, REFERENCE_TIMING_ARG +from consts import Language, SubmissionMode if TYPE_CHECKING: from task import LeaderboardTask @@ -246,8 +246,7 @@ def build_task_config( if lang == "py": config["main"] = "eval.py" args = [] - if mode == SubmissionMode.REFERENCE: - args.append(REFERENCE_TIMING_ARG) + if mode == SubmissionMode.BASELINE: submission_content = "" config["args"] = args return { @@ -274,10 +273,7 @@ def build_task_config( "ranked_timeout": task.ranked_timeout, "ranking_by": task.ranking_by.value, "seed": task.seed, - "args": [], } - if mode == SubmissionMode.REFERENCE: - common["args"].append(REFERENCE_TIMING_ARG) if task.lang == Language.Python: return { From f1eb9a73a5aea8976d4f3940c37fba22333d8119 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 19:28:33 -0700 Subject: [PATCH 36/42] test push --- src/discord-cluster-manager/run_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 7614d0bc..2f6aaf1d 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -296,7 +296,7 @@ def run_single_evaluation( tests_file.write(tests) tests_file.flush() return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout) - elif mode in ["benchmark", "profile", "leaderboard", "reference"]: + elif mode in ["benchmark", "profile", "leaderboard", "baseline"]: timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout with tempfile.NamedTemporaryFile("w") as bench_file: if ranking_by == "last": From 748a5371cd023770c36bf180d4e78195580213dd Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 19:50:53 -0700 Subject: [PATCH 37/42] fix lint errors and cleanup --- src/discord-cluster-manager/cogs/leaderboard_cog.py | 8 ++++---- src/discord-cluster-manager/consts.py | 2 +- src/discord-cluster-manager/eval.py | 2 +- src/discord-cluster-manager/launchers/github.py | 1 - src/discord-cluster-manager/leaderboard_db.py | 3 +-- src/discord-cluster-manager/report.py | 4 ++-- src/discord-cluster-manager/run_eval.py | 6 ++++-- src/discord-cluster-manager/submission.py | 3 ++- src/discord-cluster-manager/task.py | 2 +- 9 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index 9171efc6..63af6adb 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -5,10 +5,10 @@ import discord from consts import ( + BASELINE_USER, + BASELINE_USER_ID, SubmissionMode, get_gpu_by_name, - BASELINE_USER_ID, - 
BASELINE_USER, ) from discord import app_commands from discord.ext import commands @@ -20,7 +20,6 @@ from utils import ( LeaderboardItem, LeaderboardRankedEntry, - RunItem, SubmissionItem, format_time, get_user_from_id, @@ -193,7 +192,8 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): for run in sub_data["runs"]: if ( not run["secret"] - and (run["mode"] == SubmissionMode.LEADERBOARD.value or run["mode"] == SubmissionMode.BASELINE.value) + and (run["mode"] == SubmissionMode.LEADERBOARD.value + or run["mode"] == SubmissionMode.BASELINE.value) and run["passed"] ): result_lines.append(self.generate_run_verdict(run, sub_data)) diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index d0934dd9..2018ac4a 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -168,4 +168,4 @@ class RankCriterion(Enum): # Aliases for backward compatibility (to be removed in future release) REFERENCE_USER = BASELINE_USER -REFERENCE_USER_ID = BASELINE_USER_ID \ No newline at end of file +REFERENCE_USER_ID = BASELINE_USER_ID diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index c36de918..d0f693e1 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -92,4 +92,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/discord-cluster-manager/launchers/github.py b/src/discord-cluster-manager/launchers/github.py index 7a5c13e8..10232ebd 100644 --- a/src/discord-cluster-manager/launchers/github.py +++ b/src/discord-cluster-manager/launchers/github.py @@ -44,7 +44,6 @@ async def run_submission( raise ValueError(f"Invalid GPU type: {gpu_type.value}") lang = config["lang"] - args = config.get("args", []) if lang == "cu" and gpu_vendor == "AMD": # TODO implement HIP raise NotImplementedError("Cannot use CUDA runs with AMD GPUs") diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index e327afab..69a08d7a 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -6,6 +6,7 @@ import discord import psycopg2 +from consts import BASELINE_USER, BASELINE_USER_ID from env import ( DATABASE_URL, DISABLE_SSL, @@ -27,8 +28,6 @@ setup_logging, ) -from consts import BASELINE_USER_ID, BASELINE_USER - leaderboard_name_cache = LRUCache(max_size=512) logger = setup_logging(__name__) diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index fb43d31f..07e3b517 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -160,7 +160,7 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n Creates a minimalistic report for `runs`, returned as a list of status strings """ - + any_compile = False result = [] for r in runs.values(): @@ -219,7 +219,7 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n result.append("✅ Leaderboard run successful") elif full: result.append("❌ Leaderboard missing") - + if "baseline" in runs: ref_run = runs["baseline"].run if not ref_run.success: diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 2f6aaf1d..9cc23bea 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -285,7 +285,6 @@ def run_single_evaluation( ranked_timeout: int = 
Timeout.RANKED, ranking_by: str = "last", seed: Optional[int] = None, - args: Optional[list[str]] = [], ) -> RunResult: """ A single runner run, either in the context of test files, or in the @@ -456,7 +455,10 @@ def run_pytorch_script( # noqa: C901 comp = None if not is_baseline: try: - compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python", + "submission.py"], + seed=1, + timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 090f7dee..37c14de6 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -48,7 +48,8 @@ def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB) -> Processe # Ensure leaderboard name is present (might have come from the command directly) if req.leaderboard is None: raise KernelBotError( - "Missing leaderboard name. Either supply one as a command argument or via ``#!POPCORN leaderboard `` directive.", + "Missing leaderboard name. Either supply one as a command \ + argument or via ``#!POPCORN leaderboard `` directive.", ) leaderboard = lookup_leaderboard(req.leaderboard, lb_db) diff --git a/src/discord-cluster-manager/task.py b/src/discord-cluster-manager/task.py index db3adbb8..8641d4f7 100644 --- a/src/discord-cluster-manager/task.py +++ b/src/discord-cluster-manager/task.py @@ -64,7 +64,7 @@ class LeaderboardTask: ranking_by: RankCriterion = RankCriterion.LAST templates: dict[str, str] = dataclasses.field(default_factory=dict) seed: Optional[int] = None - + @staticmethod def from_dict(data: dict): From 8224cb0fb2d4076d7c7b045264c16cea44ec7a1c Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 09:46:16 -0700 Subject: [PATCH 38/42] fix lint errors and cleanup --- examples/eval.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index f80bfcfa..e7b89104 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -20,10 +20,6 @@ from reference import check_implementation, generate_input, ref_kernel -# ----------------------------------------------------------------------------- -# Determine which kernel to use (baseline or submission) -# ----------------------------------------------------------------------------- -MODE_BASELINE_STRING = "baseline" # Define the string to check for mode class PopcornOutput: def __init__(self, fd: int): self.file = os.fdopen(fd, 'w') @@ -207,7 +203,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t Runs one benchmark. Do not call directly. 
""" if not is_baseline_run: - # submission does not exist for a reference run + # submission does not exist for a baseline run from submission import custom_kernel durations = [] @@ -337,7 +333,6 @@ def main(): return 222 mode = sys.argv[1].strip() - print(f"Running in mode {mode}") seed = os.getenv("POPCORN_SEED") os.unsetenv("POPCORN_SEED") seed = int(seed) if seed else None @@ -353,8 +348,8 @@ def main(): if mode == "benchmark": return run_benchmarking(logger, pool, tests) - if mode == "leaderboard" or mode == "reference": - is_baseline_run = mode == "reference" + if (mode == "leaderboard") or (mode == "baseline"): + is_baseline_run = mode == "baseline" # warmup run_single_benchmark(pool, tests[0], False, 100, 1e7, is_baseline_run) logger.log("benchmark-count", len(tests)) From 9ea4e778eea6829cc13bd2e203d94b1fcfb61567 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 10:10:01 -0700 Subject: [PATCH 39/42] final cleanup --- .github/workflows/nvidia_workflow.yml | 5 +-- .../cogs/leaderboard_cog.py | 33 +++++++++++++++++++ src/discord-cluster-manager/consts.py | 8 ----- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 78d950da..52eb4a81 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -18,10 +18,7 @@ jobs: container: image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v5 diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index 63af6adb..354752f7 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -20,6 +20,7 @@ from utils import ( LeaderboardItem, LeaderboardRankedEntry, + RunItem, SubmissionItem, format_time, get_user_from_id, @@ -185,6 +186,38 @@ async def on_submit_hook( # noqa: C901 await self.post_submit_hook(interaction, sub_id) return sub_id + def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): + medals = {1: "🥇 First", 2: "🥈 Second", 3: "🥉 Third"} + + # get the competition + with self.bot.leaderboard_db as db: + competition = db.get_leaderboard_submissions( + sub_data["leaderboard_name"], run["runner"] + ) + # compare against the competition + other_by_user = False + run_time = float(run["score"]) + score_text = format_time(run_time * 1e9) + + for entry in competition: + # can we find our own run? 
Only if it is the fastest submission by this user + if entry["submission_id"] == sub_data["submission_id"]: + rank = entry["rank"] + if 1 <= rank <= 3: + return f"> {medals[rank]} place on {run['runner']}: {score_text}" + elif rank <= 10: + return f"> {rank}th place on {run['runner']}: {score_text}" + else: + return f"> Personal best on {run['runner']}: {score_text}" + elif entry["user_id"] == sub_data["user_id"]: + other_by_user = True + if other_by_user: + # User already has a submission that is faster + return f"> Successful on {run['runner']}: {score_text}" + else: + # no submission by the user exists + return f"> 🍾 First successful submission on {run['runner']}: {score_text}" + async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): with self.bot.leaderboard_db as db: sub_data: SubmissionItem = db.get_submission_by_id(sub_id) diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 2018ac4a..03ada325 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -99,10 +99,6 @@ class SubmissionMode(Enum): SCRIPT = "script" BASELINE = "baseline" - # Alias for backward compatibility; to be removed in future release - REFERENCE = "baseline" - - class Language(Enum): Python = "py" CUDA = "cu" @@ -165,7 +161,3 @@ class RankCriterion(Enum): # Constants used for baseline runs BASELINE_USER = "BASELINE_USER" BASELINE_USER_ID = -123 - -# Aliases for backward compatibility (to be removed in future release) -REFERENCE_USER = BASELINE_USER -REFERENCE_USER_ID = BASELINE_USER_ID From d9b2d0245fc64a2983c197a14504acfc39875e81 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 10:57:12 -0700 Subject: [PATCH 40/42] get full error to fix bug in ci --- scripts/ci_test_python.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py index d36a7c9a..410a0a55 100644 --- a/scripts/ci_test_python.py +++ b/scripts/ci_test_python.py @@ -49,6 +49,7 @@ def custom_kernel(input): """ run = run_pytorch_helper({**files, "submission.py": sub}) + print(f"full run is: \n {run}") assert run.success is True assert run.passed is False assert "python eval.py test" in run.command From ad9e7582ba89d67e99731d97456bcd840ac8abbc Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 11:02:15 -0700 Subject: [PATCH 41/42] fix ci bug --- examples/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/eval.py b/examples/eval.py index e7b89104..37d3eef9 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -156,7 +156,7 @@ def _run_single_test(test: TestCase): from submission import custom_kernel data = generate_input(**test.args) torch.cuda.synchronize() - submission_output = active_kernel(_clone_data(data)) + submission_output = custom_kernel(_clone_data(data)) torch.cuda.synchronize() return wrap_check_implementation(data, submission_output) From 6a365ff82cfa6a2a1b304b2fd4bf64a94f8cd963 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 11:02:23 -0700 Subject: [PATCH 42/42] fix ci bug --- scripts/ci_test_python.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py index 410a0a55..d36a7c9a 100644 --- a/scripts/ci_test_python.py +++ b/scripts/ci_test_python.py @@ -49,7 +49,6 @@ def custom_kernel(input): """ run = run_pytorch_helper({**files, "submission.py": sub}) - print(f"full run is: \n {run}") assert run.success is True assert run.passed is False 
assert "python eval.py test" in run.command