From 77d04e1f610cf7987c3308365caef072801c9397 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 13:08:01 -0700 Subject: [PATCH 01/42] push to test workflow --- .github/workflows/nvidia_workflow.yml | 2 + .../cogs/leaderboard_cog.py | 101 ++++++++++++++---- src/discord-cluster-manager/consts.py | 4 + src/discord-cluster-manager/eval.py | 47 +++++--- .../launchers/github.py | 1 + src/discord-cluster-manager/leaderboard_db.py | 26 +++++ src/discord-cluster-manager/run_eval.py | 5 +- src/discord-cluster-manager/submission.py | 37 ++++--- src/discord-cluster-manager/utils.py | 5 +- 9 files changed, 176 insertions(+), 52 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 7852377d..5cf7c0a3 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -28,6 +28,8 @@ jobs: - name: Create input files shell: bash run: | + # install jq + apt update && apt install -y jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index fadc9c41..cc4366bf 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -27,6 +27,8 @@ with_error_handling, ) +from consts import REFERENCE_USER_ID, REFERENCE_USER + if TYPE_CHECKING: from ..bot import ClusterBot @@ -63,28 +65,51 @@ async def on_submit_hook( # noqa: C901 self, interaction: discord.Interaction, leaderboard_name: Optional[str], - script: discord.Attachment, + script: Optional[discord.Attachment], mode: SubmissionMode, cmd_gpus: Optional[List[str]], ) -> int: """ Called as the main body of a submission to route to the correct runner. """ - # Read the template file - submission_content = await script.read() - try: - submission_content = submission_content.decode() - except UnicodeError: - await send_discord_message( - interaction, "Could not decode your file. Is it UTF-8?", ephemeral=True - ) - return -1 + if script is None: + if mode != SubmissionMode.REFERENCE: + await send_discord_message( + interaction, + "Script attachment is required for this unless submission mode is reference", + ephemeral=True, + ) + return -1 + else: + submission_content = "" + else: + # Read the template file + submission_content = await script.read() + + try: + submission_content = submission_content.decode() + except UnicodeError: + await send_discord_message( + interaction, "Could not decode your file. 
Is it UTF-8?", ephemeral=True + ) + return -1 + if mode == SubmissionMode.REFERENCE: + # create fake reference submission + file_name = None + submission_content = None + user_id = REFERENCE_USER_ID + user_name = REFERENCE_USER + else: + file_name = script.filename + submission_content = submission_content + user_id = interaction.user.id + user_name = interaction.user.global_name or interaction.user.name req = SubmissionRequest( code=submission_content, - file_name=script.filename, - user_id=interaction.user.id, + file_name=file_name, + user_id=user_id, gpus=cmd_gpus, leaderboard=leaderboard_name, ) @@ -105,26 +130,28 @@ async def on_submit_hook( # noqa: C901 command = self.bot.get_cog("SubmitCog").submit_leaderboard - user_name = interaction.user.global_name or interaction.user.name # Create a submission entry in the database with self.bot.leaderboard_db as db: sub_id = db.create_submission( leaderboard=req.leaderboard, - file_name=script.filename, + file_name=file_name, code=submission_content, - user_id=interaction.user.id, + user_id=user_id, time=datetime.now(), user_name=user_name, ) + if mode == SubmissionMode.REFERENCE: + run_msg = f"Submission **{sub_id}**: is a reference submission for `{req.leaderboard}`" + else: + run_msg = f"Submission **{sub_id}**: `{file_name}` for `{req.leaderboard}`" - run_msg = f"Submission **{sub_id}**: `{script.filename}` for `{req.leaderboard}`" reporter = MultiProgressReporter(interaction, run_msg) try: tasks = [ command( sub_id, submission_content, - script.filename, + file_name, gpu, reporter.add_run(f"{gpu.name} on {gpu.runner}"), req.task, @@ -140,7 +167,7 @@ async def on_submit_hook( # noqa: C901 command( sub_id, submission_content, - script.filename, + file_name, gpu, reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"), req.task, @@ -224,10 +251,19 @@ async def submit( self, interaction: discord.Interaction, leaderboard_name: Optional[str], - script: discord.Attachment, + script: Optional[discord.Attachment], mode: SubmissionMode, gpu: Optional[str], ): + + if not mode == SubmissionMode.REFERENCE and not script: + await send_discord_message( + interaction, + "Script attachment is required for this unless submission mode is reference", + ephemeral=True, + ) + return + if not self.bot.accepts_jobs: await send_discord_message( interaction, @@ -319,6 +355,33 @@ async def submit_ranked( interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu ) + @app_commands.command(name="reference_run", description="Create a reference run for a leaderboard") + @app_commands.describe( + leaderboard_name="Name of the leaderboard to create a reference run for", + gpu="Select GPU. 
Leave empty for interactive selection.", + ) + @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) + @with_error_handling + async def submit_reference( + self, + interaction: discord.Interaction, + leaderboard_name: str, + gpu: Optional[str] = None, + ): + # Check if reference run already exists + with self.bot.leaderboard_db as db: + if db.has_reference_run(leaderboard_name): + await send_discord_message( + interaction, + f"A reference run for leaderboard '{leaderboard_name}' already exists.", + ephemeral=True, + ) + return + # Process as a special submission + return await self.submit( + interaction, leaderboard_name, None, mode=SubmissionMode.REFERENCE, gpu=gpu + ) + async def lang_autocomplete( interaction: discord.Interaction, diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 928f59d4..ac2ffaf2 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -97,6 +97,7 @@ class SubmissionMode(Enum): LEADERBOARD = "leaderboard" PRIVATE = "private" SCRIPT = "script" + REFERENCE = "reference" class Language(Enum): @@ -157,3 +158,6 @@ class RankCriterion(Enum): --index-url https://download.pytorch.org/whl/rocm6.2.4 torch """ + +REFERENCE_USER = "REFERENCE_USER" +REFERENCE_USER_ID = -123 \ No newline at end of file diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index d0f693e1..d27cfbff 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -1,3 +1,4 @@ +import argparse import math import os import sys @@ -29,15 +30,22 @@ def correctness(rng: torch.Generator) -> bool: return True -def metric(logger: PopcornLogger, rng: torch.Generator): +def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: bool = False): warmup_runs = 10 timed_runs = 100 + if time_reference_impl: + logger.log("Timing Reference Implementation") + else: + logger.log("Timing Submitted Custom Implementation") # Warmup Code print("warming up...") for _ in range(warmup_runs): inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) - _ = custom_kernel(inputs) + if time_reference_impl: + _ = ref_kernel(inputs) + else: + _ = custom_kernel(inputs) torch.cuda.synchronize() # Timing Code @@ -47,16 +55,20 @@ def metric(logger: PopcornLogger, rng: torch.Generator): inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) start_time = time.time() - custom_output = custom_kernel(inputs) + if time_reference_impl: + ref_output = ref_kernel(inputs) + else: + custom_output = custom_kernel(inputs) torch.cuda.synchronize() end_time = time.time() times.append(end_time - start_time) - ref_output = ref_kernel(inputs) - torch.cuda.synchronize() - if not check_implementation(custom_output, ref_output): - logger.log("check", "fail") - exit(112) + if not time_reference_impl: + ref_output = ref_kernel(inputs) + torch.cuda.synchronize() + if not check_implementation(custom_output, ref_output): + logger.log("check", "fail") + exit(112) total_time = sum(times) average_duration = total_time / timed_runs @@ -71,10 +83,17 @@ def metric(logger: PopcornLogger, rng: torch.Generator): logger.log("duration.best", min(times) * 1e9) logger.log("duration.worst", max(times) * 1e9) - print(f"Submitted kernel runtime: {average_duration:.4f} ± {standard_error:.4} seconds") + kernel_name = "Reference" if time_reference_impl else "Submitted" + print(f"{kernel_name} kernel runtime: {average_duration:.4f} ± 
{standard_error:.4} seconds") def main(): + parser = argparse.ArgumentParser(description='Evaluate kernel implementation.') + parser.add_argument( + '--time-ref', action='store_true', help='Time ref kernel.' + ) + args = parser.parse_args() + try: logger = PopcornLogger(int(os.environ["POPCORN_FD"])) except Exception as e: @@ -85,10 +104,12 @@ def main(): rng = torch.Generator() rng.manual_seed(seed) - if not correctness(rng): - logger.log("check", "fail") - exit(112) - metric(logger, rng) + if not args.time_ref: + if not correctness(rng): + logger.log("check", "fail") + exit(112) + + metric(logger, rng, time_reference_impl=args.time_ref) if __name__ == "__main__": diff --git a/src/discord-cluster-manager/launchers/github.py b/src/discord-cluster-manager/launchers/github.py index 10232ebd..7a5c13e8 100644 --- a/src/discord-cluster-manager/launchers/github.py +++ b/src/discord-cluster-manager/launchers/github.py @@ -44,6 +44,7 @@ async def run_submission( raise ValueError(f"Invalid GPU type: {gpu_type.value}") lang = config["lang"] + args = config.get("args", []) if lang == "cu" and gpu_vendor == "AMD": # TODO implement HIP raise NotImplementedError("Cannot use CUDA runs with AMD GPUs") diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index d48e8404..3b979e69 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -27,6 +27,8 @@ setup_logging, ) +from consts import REFERENCE_USER_ID, REFERENCE_USER + leaderboard_name_cache = LRUCache(max_size=512) logger = setup_logging(__name__) @@ -213,6 +215,11 @@ def create_submission( time: datetime.datetime, user_name: str = None, ) -> Optional[int]: + if user_id == REFERENCE_USER_ID and user_name == REFERENCE_USER: + # todo: add reference code to the database + code = "" + file_name = "reference.py" + try: # check if we already have the code self.cursor.execute( @@ -287,6 +294,25 @@ def create_submission( self.connection.rollback() # Ensure rollback if error occurs raise KernelBotError("Error during creation of submission") from e + def has_reference_run(self, leaderboard_name: str) -> bool: + try: + self.cursor.execute( + """ + SELECT COUNT(*) FROM leaderboard.runs + WHERE leaderboard.runs.leaderboard_id = ( + SELECT leaderboard.leaderboard.id + FROM leaderboard.leaderboard + WHERE leaderboard.leaderboard.name = %s + ) + AND leaderboard.runs.user_id = %s; + """, + (leaderboard_name, REFERENCE_USER_ID), + ) + return self.cursor.fetchone()[0] > 0 + except psycopg2.Error as e: + logger.error("Error checking for reference run", exc_info=e) + return False + def mark_submission_done( self, submission: int, diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 5e7ab046..8ab6a28f 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -455,7 +455,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. try: - compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python", "submission.py"].extend(kwargs.get("args", [])), seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, @@ -511,7 +511,7 @@ def run_evaluation( require multiple runner calls. 
""" results: dict[str, EvalResult] = {} - if mode in ["test", "benchmark", "profile", "script"]: + if mode in ["test", "benchmark", "profile", "script", "reference"]: results[mode] = call(mode=mode) elif mode in ["private", "leaderboard"]: # first, run the tests @@ -552,6 +552,7 @@ def run_config(config: dict): "ranked_timeout": config.get("ranked_timeout", Timeout.RANKED), "benchmark_timeout": config.get("benchmark_timeout", Timeout.BENCHMARK), "test_timeout": config.get("test_timeout", Timeout.TEST), + "args": config.get("args", []), } if config["lang"] == "py": runner = functools.partial( diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 2777b15f..19c0c6a1 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -27,19 +27,30 @@ class ProcessedSubmissionRequest(SubmissionRequest): def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB) -> ProcessedSubmissionRequest: - if profanity.contains_profanity(req.file_name): - raise KernelBotError("Please provide a non rude filename") + # Detect reference submissions (no file name & no code provided) + # A reference submission is identified by missing/empty code content (no user file) + is_reference_submission = not req.code - # check file extension - if not req.file_name.endswith((".py", ".cu", ".cuh", ".cpp")): + # Perform filename/content related checks only for *non* reference submissions + if not is_reference_submission: + if profanity.contains_profanity(req.file_name): + raise KernelBotError("Please provide a non rude filename") + + # check file extension (if filename provided) + if req.file_name and not req.file_name.endswith((".py", ".cu", ".cuh", ".cpp")): + raise KernelBotError( + "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file", + ) + + # process file directives (GPU selection / leaderboard name) + req = handle_popcorn_directives(req) + + # Ensure leaderboard name is present (might have come from the command directly) + if req.leaderboard is None: raise KernelBotError( - "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file", + "Missing leaderboard name. Either supply one as a command argument or via ``#!POPCORN leaderboard `` directive.", ) - # process file directives - req = handle_popcorn_directives(req) - assert req.leaderboard is not None - leaderboard = lookup_leaderboard(req.leaderboard, lb_db) check_deadline(leaderboard) @@ -117,14 +128,6 @@ def handle_popcorn_directives(req: SubmissionRequest) -> SubmissionRequest: else: req.leaderboard = info["leaderboard"] - if req.leaderboard is None: - raise KernelBotError( - "Missing leaderboard name. 
" - "Either supply one as an argument in the submit command, or " - "specify it in your submission script using the " - "`{#,//}!POPCORN leaderboard ` directive.", - ) - return req diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index c39192f7..86742561 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -245,8 +245,11 @@ def build_task_config( if lang == "py": config["main"] = "eval.py" - + args = [] + if mode == SubmissionMode.REFERENCE: + args.append("--time-ref") return { + "args": args, **config, "sources": { eval_name: submission_content, From f42f885bd991584af7552503728f3e95f74bd04a Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 14:02:10 -0700 Subject: [PATCH 02/42] test push --- src/discord-cluster-manager/consts.py | 3 ++- src/discord-cluster-manager/eval.py | 3 ++- src/discord-cluster-manager/run_eval.py | 14 +++++++++----- src/discord-cluster-manager/utils.py | 4 ++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index ac2ffaf2..cf50fdf2 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -160,4 +160,5 @@ class RankCriterion(Enum): """ REFERENCE_USER = "REFERENCE_USER" -REFERENCE_USER_ID = -123 \ No newline at end of file +REFERENCE_USER_ID = -123 +REFERENCE_TIMING_ARG = "--reference-timing" \ No newline at end of file diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index d27cfbff..6c7f81ef 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -7,6 +7,7 @@ import torch from reference import check_implementation, generate_input, ref_kernel from submission import custom_kernel +from consts import REFERENCE_TIMING_ARG class PopcornLogger: @@ -90,7 +91,7 @@ def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: boo def main(): parser = argparse.ArgumentParser(description='Evaluate kernel implementation.') parser.add_argument( - '--time-ref', action='store_true', help='Time ref kernel.' + REFERENCE_TIMING_ARG, action='store_true', help='Time ref kernel.' ) args = parser.parse_args() diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 8ab6a28f..2652bf62 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -10,7 +10,7 @@ from types import NoneType from typing import Optional, Protocol, Union -from consts import CUDA_FLAGS, ExitCode, Timeout +from consts import CUDA_FLAGS, ExitCode, REFERENCE_TIMING_ARG, Timeout @dataclasses.dataclass @@ -446,11 +446,15 @@ def run_pytorch_script( # noqa: C901 RunResult """ start = datetime.datetime.now() + args = kwargs.get("args", []) + # log everything that's going on + print("Running with args: %s", args) + print("Running with sources: %s", sources) + print("Running with main: %s", main) try: - assert main in sources.keys() - - # Write submission files to directory - _create_files(sources) + if REFERENCE_TIMING_ARG not in args: + assert main in sources.keys() + _create_files(sources) # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 86742561..695faab9 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, List, NotRequired, Optional, TypedDict import discord -from consts import Language, SubmissionMode +from consts import Language, SubmissionMode, REFERENCE_TIMING_ARG if TYPE_CHECKING: from task import LeaderboardTask @@ -247,7 +247,7 @@ def build_task_config( config["main"] = "eval.py" args = [] if mode == SubmissionMode.REFERENCE: - args.append("--time-ref") + args.append(REFERENCE_TIMING_ARG) return { "args": args, **config, From 1c5d1220d0bbaaed9a114b3fa9ba75e6d26b3525 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 14:08:39 -0700 Subject: [PATCH 03/42] test push --- .../cogs/leaderboard_cog.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index cc4366bf..73a5f285 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -369,14 +369,14 @@ async def submit_reference( gpu: Optional[str] = None, ): # Check if reference run already exists - with self.bot.leaderboard_db as db: - if db.has_reference_run(leaderboard_name): - await send_discord_message( - interaction, - f"A reference run for leaderboard '{leaderboard_name}' already exists.", - ephemeral=True, - ) - return + # with self.bot.leaderboard_db as db: + # if db.has_reference_run(leaderboard_name): + # await send_discord_message( + # interaction, + # f"A reference run for leaderboard '{leaderboard_name}' already exists.", + # ephemeral=True, + # ) + # return # Process as a special submission return await self.submit( interaction, leaderboard_name, None, mode=SubmissionMode.REFERENCE, gpu=gpu From f9b0ea392afdb18d825cb3d6630d5f96822f63c4 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:05:55 -0700 Subject: [PATCH 04/42] test push --- src/discord-cluster-manager/run_eval.py | 1 + src/discord-cluster-manager/utils.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 2652bf62..46281bc5 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -448,6 +448,7 @@ def run_pytorch_script( # noqa: C901 start = datetime.datetime.now() args = kwargs.get("args", []) # log everything that's going on + print("Running with kwargs: %s", kwargs) print("Running with args: %s", args) print("Running with sources: %s", sources) print("Running with main: %s", main) diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 695faab9..3ce66e7b 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -248,8 +248,8 @@ def build_task_config( args = [] if mode == SubmissionMode.REFERENCE: args.append(REFERENCE_TIMING_ARG) + config["args"] = args return { - "args": args, **config, "sources": { eval_name: submission_content, @@ -274,7 +274,10 @@ def build_task_config( "ranked_timeout": task.ranked_timeout, "ranking_by": task.ranking_by.value, "seed": task.seed, + "args": [], } + if mode == SubmissionMode.REFERENCE: + common["args"].append(REFERENCE_TIMING_ARG) if task.lang == Language.Python: return { From 
729045eb102d5dbea31e7b65ccbc6e19059801b3 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:12:23 -0700 Subject: [PATCH 05/42] test push --- src/discord-cluster-manager/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 3ce66e7b..5488e556 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -248,6 +248,7 @@ def build_task_config( args = [] if mode == SubmissionMode.REFERENCE: args.append(REFERENCE_TIMING_ARG) + submission_content = "" config["args"] = args return { **config, From 72ac6860e310ec973c26cc2745ba1f4ce12d9797 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:17:25 -0700 Subject: [PATCH 06/42] test push --- src/discord-cluster-manager/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 5488e556..769debdb 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -263,7 +263,7 @@ def build_task_config( all_files[n] = submission_content else: all_files[n] = c - + print(f"all_files: {all_files}") common = { "lang": task.lang.value, "arch": arch, From 54a2418abea70fec4dcf49d5ce99b058b7879c40 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:23:53 -0700 Subject: [PATCH 07/42] test push --- src/discord-cluster-manager/run_eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 46281bc5..b3d067a0 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -219,6 +219,7 @@ def compile_cuda_script( # # noqa: C901 def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult: print("[Running]") + print("Running with args: %s", args) # set up a pipe so the tester can communicate its verdict with us env = os.environ.copy() pipe_read, pipe_write = os.pipe() @@ -460,7 +461,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
try: - compile_run = run_program(["python", "submission.py"].extend(kwargs.get("args", [])), seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, From 089445b831ba7c3d35f55d1e5919c8e54959702f Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:31:01 -0700 Subject: [PATCH 08/42] test push --- src/discord-cluster-manager/run_eval.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index b3d067a0..bfb96936 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -287,6 +287,7 @@ def run_single_evaluation( ranked_timeout: int = Timeout.RANKED, ranking_by: str = "last", seed: Optional[int] = None, + args: Optional[list[str]] = [], ) -> RunResult: """ A single runner run, either in the context of test files, or in the @@ -296,8 +297,8 @@ def run_single_evaluation( with tempfile.NamedTemporaryFile("w") as tests_file: tests_file.write(tests) tests_file.flush() - return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout) - elif mode in ["benchmark", "profile", "leaderboard"]: + return run_program(call + [mode, tests_file.name] + args, seed=seed, timeout=test_timeout) + elif mode in ["benchmark", "profile", "leaderboard", "reference"]: timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout with tempfile.NamedTemporaryFile("w") as bench_file: if ranking_by == "last": @@ -305,10 +306,10 @@ def run_single_evaluation( else: bench_file.write(benchmarks) bench_file.flush() - return run_program(call + [mode, bench_file.name], seed=seed, timeout=timeout) + return run_program(call + [mode, bench_file.name] + args, seed=seed, timeout=timeout) else: assert mode == "script" - return run_program(call, seed=seed, timeout=Timeout.SCRIPT) + return run_program(call + args, seed=seed, timeout=Timeout.SCRIPT) def make_system_info() -> SystemInfo: From b1c11523f83ca0d2cae8257dd059d586589b7aab Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:45:46 -0700 Subject: [PATCH 09/42] test push --- src/discord-cluster-manager/run_eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index bfb96936..b8b0c4a2 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -457,7 +457,8 @@ def run_pytorch_script( # noqa: C901 try: if REFERENCE_TIMING_ARG not in args: assert main in sources.keys() - _create_files(sources) + # pluck out submission.py from sources as it is not needed for the run and is None + sources.pop("submission.py") # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
From e3b49dd705dff65beac12029b1ee350b0367afae Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 15:55:26 -0700 Subject: [PATCH 10/42] test push --- src/discord-cluster-manager/run_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index b8b0c4a2..036654c1 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -457,7 +457,7 @@ def run_pytorch_script( # noqa: C901 try: if REFERENCE_TIMING_ARG not in args: assert main in sources.keys() - # pluck out submission.py from sources as it is not needed for the run and is None + # pluck out submission.py from sources as it is not needed for the run and is None normally sources.pop("submission.py") # "compile" step: execute the script once. Will populate From 110aea68795dc5c8075a6b86a1e3f0970773f71e Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 16:01:25 -0700 Subject: [PATCH 11/42] test push --- src/discord-cluster-manager/run_eval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 036654c1..e236d340 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -455,10 +455,11 @@ def run_pytorch_script( # noqa: C901 print("Running with sources: %s", sources) print("Running with main: %s", main) try: - if REFERENCE_TIMING_ARG not in args: - assert main in sources.keys() + if REFERENCE_TIMING_ARG in args: # pluck out submission.py from sources as it is not needed for the run and is None normally sources.pop("submission.py") + assert main in sources.keys() + _create_files(sources) # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
From 371cad80dba16e9f0225e10949a33d408cabb599 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 16:46:08 -0700 Subject: [PATCH 12/42] test push --- src/discord-cluster-manager/eval.py | 4 +- src/discord-cluster-manager/run_eval.py | 56 ++++++++++++++----------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index 6c7f81ef..b87d5eef 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -6,7 +6,6 @@ import torch from reference import check_implementation, generate_input, ref_kernel -from submission import custom_kernel from consts import REFERENCE_TIMING_ARG @@ -19,6 +18,7 @@ def log(self, key: str, value): def correctness(rng: torch.Generator) -> bool: + from submission import custom_kernel for _ in range(10): # check multiple times inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) custom_output = custom_kernel(inputs) @@ -37,7 +37,9 @@ def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: boo if time_reference_impl: logger.log("Timing Reference Implementation") else: + # in the case of a reference run we don't have a submission logger.log("Timing Submitted Custom Implementation") + from submission import custom_kernel # Warmup Code print("warming up...") diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index e236d340..86ecde5b 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -454,41 +454,47 @@ def run_pytorch_script( # noqa: C901 print("Running with args: %s", args) print("Running with sources: %s", sources) print("Running with main: %s", main) + is_reference = False + if REFERENCE_TIMING_ARG in args: + # pluck out submission.py from sources as it is not needed for the run and is None normally + sources.pop("submission.py") + is_reference = True try: - if REFERENCE_TIMING_ARG in args: - # pluck out submission.py from sources as it is not needed for the run and is None normally - sources.pop("submission.py") + assert main in sources.keys() _create_files(sources) # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
try: - compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) - if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: + if not is_reference: + compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) + if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: + comp = CompileResult( + nvcc_found=True, + nvcc_version="", + success=True, + command=compile_run.command, + stdout=compile_run.stdout, + stderr=compile_run.stderr, + exit_code=compile_run.exit_code, + ) + else: + comp = None + except subprocess.CalledProcessError as e: + # This step is purely optional, so we just go on + # if it fails comp = CompileResult( - nvcc_found=True, + nvcc_found=False, nvcc_version="", - success=True, - command=compile_run.command, - stdout=compile_run.stdout, - stderr=compile_run.stderr, - exit_code=compile_run.exit_code, + success=False, + command="python submission.py", + stdout=e.stdout, + stderr=e.stderr, + exit_code=e.returncode, ) - else: - comp = None - except subprocess.CalledProcessError as e: - # This step is purely optional, so we just go on - # if it fails - comp = CompileResult( - nvcc_found=False, - nvcc_version="", - success=False, - command="python submission.py", - stdout=e.stdout, - stderr=e.stderr, - exit_code=e.returncode, - ) + else: + comp = None run = run_single_evaluation(["python", main], **kwargs) From cb819115736148fbc2840509c1974bb86f669583 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 16:47:41 -0700 Subject: [PATCH 13/42] test push --- src/discord-cluster-manager/run_eval.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 86ecde5b..e06161a3 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -297,7 +297,7 @@ def run_single_evaluation( with tempfile.NamedTemporaryFile("w") as tests_file: tests_file.write(tests) tests_file.flush() - return run_program(call + [mode, tests_file.name] + args, seed=seed, timeout=test_timeout) + return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout) elif mode in ["benchmark", "profile", "leaderboard", "reference"]: timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout with tempfile.NamedTemporaryFile("w") as bench_file: @@ -450,24 +450,24 @@ def run_pytorch_script( # noqa: C901 start = datetime.datetime.now() args = kwargs.get("args", []) # log everything that's going on - print("Running with kwargs: %s", kwargs) - print("Running with args: %s", args) - print("Running with sources: %s", sources) - print("Running with main: %s", main) + print("Running with kwargs: %s" % kwargs) + print("Running with args: %s" % args) + print("Running with sources: %s" % sources) + print("Running with main: %s" % main) is_reference = False if REFERENCE_TIMING_ARG in args: # pluck out submission.py from sources as it is not needed for the run and is None normally - sources.pop("submission.py") + sources.pop("submission.py", None) is_reference = True try: - assert main in sources.keys() _create_files(sources) # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. 
- try: - if not is_reference: + comp = None + if not is_reference: + try: compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( @@ -479,8 +479,6 @@ def run_pytorch_script( # noqa: C901 stderr=compile_run.stderr, exit_code=compile_run.exit_code, ) - else: - comp = None except subprocess.CalledProcessError as e: # This step is purely optional, so we just go on # if it fails @@ -493,8 +491,6 @@ def run_pytorch_script( # noqa: C901 stderr=e.stderr, exit_code=e.returncode, ) - else: - comp = None run = run_single_evaluation(["python", main], **kwargs) From dc17b308aeae14e25609c8bdcf2124236ff4b83d Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 17:02:50 -0700 Subject: [PATCH 14/42] test push --- src/discord-cluster-manager/eval.py | 9 ++++++--- src/discord-cluster-manager/run_eval.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index b87d5eef..2945ce5c 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -32,6 +32,7 @@ def correctness(rng: torch.Generator) -> bool: def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: bool = False): + print("timing kernel") warmup_runs = 10 timed_runs = 100 if time_reference_impl: @@ -96,7 +97,7 @@ def main(): REFERENCE_TIMING_ARG, action='store_true', help='Time ref kernel.' ) args = parser.parse_args() - + print(f"starting script") try: logger = PopcornLogger(int(os.environ["POPCORN_FD"])) except Exception as e: @@ -106,12 +107,14 @@ def main(): seed = int(os.environ.get("POPCORN_FD", 42)) rng = torch.Generator() rng.manual_seed(seed) - + print(f"seed: {seed}") + print(f"time ref: {args.time_ref}") + print(f"correctness: {not args.time_ref}") if not args.time_ref: if not correctness(rng): logger.log("check", "fail") exit(112) - + metric(logger, rng, time_reference_impl=args.time_ref) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index e06161a3..56850714 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -306,7 +306,7 @@ def run_single_evaluation( else: bench_file.write(benchmarks) bench_file.flush() - return run_program(call + [mode, bench_file.name] + args, seed=seed, timeout=timeout) + return run_program(call + args + [mode, bench_file.name], seed=seed, timeout=timeout) else: assert mode == "script" return run_program(call + args, seed=seed, timeout=Timeout.SCRIPT) From c2a536716fba5d296038a3b91dee977d91a336e1 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 17:43:06 -0700 Subject: [PATCH 15/42] test push --- examples/eval.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index e414a580..5e747d31 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -17,9 +17,13 @@ except ImportError: TestSpec = dict -from reference import check_implementation, generate_input +from reference import check_implementation, generate_input, ref_kernel +# ----------------------------------------------------------------------------- +# Determine which kernel to use (reference or submission) +# ----------------------------------------------------------------------------- +MODE_REFERENCE_STRING = "reference" # Define the string to check for mode class PopcornOutput: def 
__init__(self, fd: int): self.file = os.fdopen(fd, 'w') @@ -156,7 +160,7 @@ def _run_single_test(test: TestCase): from submission import custom_kernel data = generate_input(**test.args) torch.cuda.synchronize() - submission_output = custom_kernel(_clone_data(data)) + submission_output = active_kernel(_clone_data(data)) torch.cuda.synchronize() return wrap_check_implementation(data, submission_output) @@ -198,18 +202,21 @@ def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[T return 112 -def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any: +def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_reference_run: bool) -> Stats | Any: """ Runs one benchmark. Do not call directly. """ - from submission import custom_kernel + if not is_reference_run: + # submission does not exist for a reference run + from submission import custom_kernel durations = [] # generate input data once data = generate_input(**test.args) check_copy = _clone_data(data) + active_kernel = ref_kernel if is_reference_run else custom_kernel # first, one obligatory correctness check - output = custom_kernel(data) + output = active_kernel(data) good, message = wrap_check_implementation(check_copy, output) if not good: return message @@ -229,7 +236,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t check_copy = _clone_data(data) torch.cuda.synchronize() start = time.perf_counter_ns() - output = custom_kernel(data) + output = active_kernel(data) torch.cuda.synchronize() end = time.perf_counter_ns() @@ -249,7 +256,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t return calculate_stats(durations) -def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float): +def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_reference_run: bool = False): """ For a particular test case, check correctness (if applicable) and grab runtime results. @@ -260,7 +267,7 @@ def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bo @param max_time_ns: Timeout time in nanoseconds. @return: A Stats object for this particular benchmark case or an error if the test fails. """ - return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns)) + return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns, is_reference_run)) def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]): @@ -300,13 +307,13 @@ def run_single_profile(test: TestCase) -> str: """ Runs a single test case. 
Do not call directly """ - from submission import custom_kernel from torch.profiler import profile, record_function, ProfilerActivity + from submission import custom_kernel data = generate_input(**test.args) torch.cuda.synchronize() with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: - submission_output = custom_kernel(_clone_data(data)) + submission_output = active_kernel(_clone_data(data)) torch.cuda.synchronize() return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20) @@ -345,13 +352,18 @@ def main(): if mode == "benchmark": return run_benchmarking(logger, pool, tests) - if mode == "leaderboard": + if mode == "leaderboard" or mode == "reference": + is_reference_run = mode == "reference" # warmup - run_single_benchmark(pool, tests[0], False, 100, 1e7) + run_single_benchmark(pool, tests[0], False, 100, 1e7, is_reference_run) + if is_reference_run: + logger.log("Running reference run") + else: + logger.log("Running leaderboard run") logger.log("benchmark-count", len(tests)) passed = True for i in range(len(tests)): - result = run_single_benchmark(pool, tests[i], True, 100, 30e9) + result = run_single_benchmark(pool, tests[i], True, 100, 30e9, is_reference_run) logger.log(f"benchmark.{i}.spec", tests[i].spec) if isinstance(result, Stats): for field in dataclasses.fields(Stats): From 19647d27ba91e0d40307916947d8aef6b4f9f06d Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:02:00 -0700 Subject: [PATCH 16/42] test push --- src/discord-cluster-manager/task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/discord-cluster-manager/task.py b/src/discord-cluster-manager/task.py index 3a14bc51..db3adbb8 100644 --- a/src/discord-cluster-manager/task.py +++ b/src/discord-cluster-manager/task.py @@ -64,6 +64,7 @@ class LeaderboardTask: ranking_by: RankCriterion = RankCriterion.LAST templates: dict[str, str] = dataclasses.field(default_factory=dict) seed: Optional[int] = None + @staticmethod def from_dict(data: dict): From a2404851ba680c01fde47d418c76e3eb0a60e026 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:37:42 -0700 Subject: [PATCH 17/42] test push --- .github/workflows/nvidia_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 5cf7c0a3..258d7838 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -29,7 +29,7 @@ jobs: shell: bash run: | # install jq - apt update && apt install -y jq + # apt update && apt install -y jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 7f1e04a492ca77bfbf9196780c9fd40bc78bc3dc Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:43:44 -0700 Subject: [PATCH 18/42] test push --- .github/workflows/nvidia_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 258d7838..857331b8 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -29,7 +29,7 @@ jobs: shell: bash run: | # install jq - # apt update && apt install -y jq + snap install jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 104d1c96eeb9a6cafa57052a0bb3f8f9de20f8a7 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:47:51 -0700 Subject: [PATCH 
19/42] test push --- .github/workflows/nvidia_workflow.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 857331b8..7de49097 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -29,6 +29,8 @@ jobs: shell: bash run: | # install jq + apt update + apt install snapd snap install jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 1f54c968db3b4ae011d068b778848e285eb99fb6 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:50:47 -0700 Subject: [PATCH 20/42] test push --- .github/workflows/nvidia_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 7de49097..2113cf75 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -30,7 +30,7 @@ jobs: run: | # install jq apt update - apt install snapd + apt install -y snapd snap install jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 7f8cf648eb9e16dcf850b18a6fb16c5e6f08a7c3 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 21:54:42 -0700 Subject: [PATCH 21/42] test push --- .github/workflows/nvidia_workflow.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 2113cf75..340a8fb9 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -30,8 +30,9 @@ jobs: run: | # install jq apt update - apt install -y snapd - snap install jq + apt-get install -y jq + # apt install -y snapd + # snap install jq # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 79672f0071b7e81f1e84e68e04f5324f5f5f7d9a Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 22:02:06 -0700 Subject: [PATCH 22/42] test push --- .github/workflows/nvidia_workflow.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 340a8fb9..b50c01af 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -24,13 +24,19 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.10' - + + - name: 'Setup jq' + uses: dcarbone/install-jq-action@v3 + with: + version: '1.7.1' + force: true + - name: Create input files shell: bash run: | # install jq - apt update - apt-get install -y jq + # apt update + # apt install -y jq # apt install -y snapd # snap install jq # Extract the payload content without printing it From 419fb221dc4a07191fa50ae26982d7255f6f463d Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 22:10:31 -0700 Subject: [PATCH 23/42] test push --- .github/workflows/nvidia_workflow.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index b50c01af..9b1d8feb 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -25,12 +25,6 @@ jobs: with: python-version: '3.10' - - name: 'Setup jq' - uses: dcarbone/install-jq-action@v3 - with: - version: '1.7.1' - force: true - - name: Create input files shell: bash run: | From ea8812c3a1f34c2a5689dca6de3d5264fbcca34c 
Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 22:16:54 -0700 Subject: [PATCH 24/42] test push --- .github/workflows/nvidia_workflow.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 9b1d8feb..f2171f4f 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -29,8 +29,8 @@ jobs: shell: bash run: | # install jq - # apt update - # apt install -y jq + apt update + apt install -y jq # apt install -y snapd # snap install jq # Extract the payload content without printing it From 13b2aef2299de6eef4b9306befacbd18163f0cec Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 22:56:11 -0700 Subject: [PATCH 25/42] test push --- src/discord-cluster-manager/run_eval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 56850714..4789c638 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -306,10 +306,10 @@ def run_single_evaluation( else: bench_file.write(benchmarks) bench_file.flush() - return run_program(call + args + [mode, bench_file.name], seed=seed, timeout=timeout) + return run_program(call + [mode, bench_file.name], seed=seed, timeout=timeout) else: assert mode == "script" - return run_program(call + args, seed=seed, timeout=Timeout.SCRIPT) + return run_program(call, seed=seed, timeout=Timeout.SCRIPT) def make_system_info() -> SystemInfo: @@ -468,7 +468,7 @@ def run_pytorch_script( # noqa: C901 comp = None if not is_reference: try: - compile_run = run_program(["python", "submission.py"] + args, seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, From 3ae64d59ed97bbbc5bf12a65c78fae5a7e2156c2 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 23:09:26 -0700 Subject: [PATCH 26/42] test push --- .github/workflows/nvidia_workflow.yml | 32 ++++++++++++++------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index f2171f4f..21ed6999 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -26,21 +26,23 @@ jobs: python-version: '3.10' - name: Create input files - shell: bash - run: | - # install jq - apt update - apt install -y jq - # apt install -y snapd - # snap install jq - # Extract the payload content without printing it - PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) - - # Apply mask to the extracted content - echo "::add-mask::$PAYLOAD" - - # Now write to file (won't be logged since it's masked) - echo "$PAYLOAD" > payload.json + uses: nick-fields/retry@v3 + with: + timeout_minutes: 2 + max_attempts: 5 + shell: bash + command: | + # install jq + apt update + apt install -y jq + # Extract the payload content without printing it + PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) + + # Apply mask to the extracted content + echo "::add-mask::$PAYLOAD" + + # Now write to file (won't be logged since it's masked) + echo "$PAYLOAD" > payload.json - name: Install uv uses: astral-sh/setup-uv@v3 From f265a377403eb9289ea51c8347887762faafb6d4 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 23:18:17 -0700 Subject: 
[PATCH 27/42] test push --- examples/eval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/eval.py b/examples/eval.py index 5e747d31..24221afb 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -337,6 +337,7 @@ def main(): return 2 mode = sys.argv[1] + print(f"Running in mode {mode}") seed = os.getenv("POPCORN_SEED") os.unsetenv("POPCORN_SEED") seed = int(seed) if seed else None @@ -379,6 +380,7 @@ def main(): run_profiling(logger, tests) else: # TODO: Implement script mode + print(f"mode {mode} not implemented") return 2 From 5b8c698a93fddc19118eed32884c0c7b59cbb63e Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 23:30:11 -0700 Subject: [PATCH 28/42] test push --- examples/eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/eval.py b/examples/eval.py index 24221afb..be778b40 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -336,7 +336,7 @@ def main(): if len(sys.argv) < 3: return 2 - mode = sys.argv[1] + mode = sys.argv[1].strip() print(f"Running in mode {mode}") seed = os.getenv("POPCORN_SEED") os.unsetenv("POPCORN_SEED") @@ -345,6 +345,7 @@ def main(): tests = get_test_cases(sys.argv[2], seed) with PopcornOutput(int(fd)) as logger: + logger.log(f"running in mode {mode}") import multiprocessing mp_context = multiprocessing.get_context('spawn') with mp_context.Pool(1) as pool: From fdb1ed6e6663ab113f6d9abf171cd365b356fd5d Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 28 May 2025 23:47:24 -0700 Subject: [PATCH 29/42] test push --- .github/workflows/nvidia_workflow.yml | 5 ++++- examples/eval.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 21ed6999..78d950da 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -18,7 +18,10 @@ jobs: container: image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - - uses: actions/checkout@v3 + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} - name: Setup Python uses: actions/setup-python@v5 diff --git a/examples/eval.py b/examples/eval.py index be778b40..fd643516 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -334,7 +334,7 @@ def main(): return 111 if len(sys.argv) < 3: - return 2 + return 222 mode = sys.argv[1].strip() print(f"Running in mode {mode}") @@ -381,8 +381,9 @@ def main(): run_profiling(logger, tests) else: # TODO: Implement script mode + logger.log(mode, "not implemented") print(f"mode {mode} not implemented") - return 2 + return 333 if __name__ == "__main__": From 2e489447f310603da91b49bd41c78f1776ee2303 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 10:28:08 -0700 Subject: [PATCH 30/42] test push --- examples/eval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index fd643516..e5f25576 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -345,7 +345,7 @@ def main(): tests = get_test_cases(sys.argv[2], seed) with PopcornOutput(int(fd)) as logger: - logger.log(f"running in mode {mode}") + logger.log("debug message", f"running in mode {mode}") import multiprocessing mp_context = multiprocessing.get_context('spawn') with mp_context.Pool(1) as pool: @@ -359,9 +359,9 @@ def main(): # warmup run_single_benchmark(pool, tests[0], False, 100, 1e7, is_reference_run) if is_reference_run: - logger.log("Running reference run") + logger.log("debug message", 
"Running reference run") else: - logger.log("Running leaderboard run") + logger.log("debug message", "Running leaderboard run") logger.log("benchmark-count", len(tests)) passed = True for i in range(len(tests)): From d78119caff5e82f3a10e2f5fd085ddf0b50a4deb Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 10:41:35 -0700 Subject: [PATCH 31/42] test push --- examples/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/eval.py b/examples/eval.py index e5f25576..aa77e3ae 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -241,7 +241,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t end = time.perf_counter_ns() if recheck: - good, message = check_implementation(check_copy, output) + good, message = wrap_check_implementation(check_copy, output) if not good: return message From 474d9c413979806b253b917dcfbc88052c1a71cc Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 11:54:12 -0700 Subject: [PATCH 32/42] test push --- src/discord-cluster-manager/cogs/leaderboard_cog.py | 5 +++-- src/discord-cluster-manager/cogs/submit_cog.py | 2 +- src/discord-cluster-manager/report.py | 13 +++++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index 73a5f285..4c2e00ac 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -182,7 +182,7 @@ async def on_submit_hook( # noqa: C901 with self.bot.leaderboard_db as db: db.mark_submission_done(sub_id) - if mode == SubmissionMode.LEADERBOARD: + if mode == SubmissionMode.LEADERBOARD or mode == SubmissionMode.REFERENCE: await self.post_submit_hook(interaction, sub_id) return sub_id @@ -493,7 +493,8 @@ async def _display_lb_submissions_helper( processed_submissions = [ { "Rank": submission["rank"], - "User": await get_user_from_id(self.bot, submission["user_id"]), + # "User": await get_user_from_id(self.bot, submission["user_id"]), + "User": submission["user_id"], "Score": f"{format_time(float(submission['submission_score']) * 1e9)}", "Submission Name": submission["submission_name"], } diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index 0657641f..03f51d32 100644 --- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -221,7 +221,7 @@ async def _handle_submission( await reporter.update_title(reporter.title + " ✅ success") short_report = make_short_report( - result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD] + result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD, SubmissionMode.REFERENCE] ) await reporter.push(short_report) if mode != SubmissionMode.PRIVATE: diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index 4e09c2c5..b7901214 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -160,6 +160,7 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n Creates a minimalistic report for `runs`, returned as a list of status strings """ + any_compile = False result = [] for r in runs.values(): @@ -218,6 +219,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n result.append("✅ Leaderboard run successful") elif full: result.append("❌ Leaderboard missing") + + if 
"reference" in runs: + ref_run = runs["reference"].run + if not ref_run.success: + result.append("❌ Running reference failed" + _short_fail_reason(ref_run)) + elif not ref_run.passed: + result.append("❌ Reference run failed") + else: + result.append("✅ Reference run successful") + elif full: + result.append("❌ Reference missing") + return result From 6fda9308127e127f65e252d23c60080518a9c385 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 14:02:55 -0700 Subject: [PATCH 33/42] it works now cleanup --- .../cogs/leaderboard_cog.py | 6 ++--- .../cogs/submit_cog.py | 24 ++++++++++++------- src/discord-cluster-manager/leaderboard_db.py | 5 ++++ src/discord-cluster-manager/report.py | 2 -- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index 4c2e00ac..df234134 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -221,16 +221,16 @@ def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): with self.bot.leaderboard_db as db: sub_data: SubmissionItem = db.get_submission_by_id(sub_id) - + print(f"sub_data: {sub_data}") result_lines = [] for run in sub_data["runs"]: if ( not run["secret"] - and run["mode"] == SubmissionMode.LEADERBOARD.value + and (run["mode"] == SubmissionMode.LEADERBOARD.value or run["mode"] == SubmissionMode.REFERENCE.value) and run["passed"] ): result_lines.append(self.generate_run_verdict(run, sub_data)) - + print(f"result_lines: {result_lines}") if len(result_lines) > 0: await send_discord_message( interaction, diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index 03f51d32..af26077b 100644 --- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -104,12 +104,21 @@ async def submit_leaderboard( # noqa: C901 if result.success: score = None if ( - "leaderboard" in result.runs + ("leaderboard" in result.runs and result.runs["leaderboard"].run.success - and result.runs["leaderboard"].run.passed + and result.runs["leaderboard"].run.passed) + or ("reference" in result.runs + and result.runs["reference"].run.success + and result.runs["reference"].run.passed) ): + if "leaderboard" in result.runs: + key = "leaderboard" + elif "reference" in result.runs: + key = "reference" + else: + raise KernelBotError("Leaderboard or reference run failed") score = 0.0 - num_benchmarks = int(result.runs["leaderboard"].run.result["benchmark-count"]) + num_benchmarks = int(result.runs[key].run.result["benchmark-count"]) if task.ranking_by == RankCriterion.LAST: if num_benchmarks != 1: logger.error( @@ -122,19 +131,18 @@ async def submit_leaderboard( # noqa: C901 f"Expected submission to have exactly one benchmark," f"got {num_benchmarks}." 
) - score = float(result.runs["leaderboard"].run.result["benchmark.0.mean"]) / 1e9 + score = float(result.runs[key].run.result["benchmark.0.mean"]) / 1e9 else: scores = [] for i in range(num_benchmarks): scores.append( - float(result.runs["leaderboard"].run.result[f"benchmark.{i}.mean"]) - / 1e9 + float(result.runs[key].run.result[f"benchmark.{i}.mean"]) / 1e9 ) if task.ranking_by == RankCriterion.MEAN: score = sum(scores) / len(scores) elif task.ranking_by == RankCriterion.GEOM: score = math.pow(math.prod(scores), 1.0 / num_benchmarks) - + print(f"\nScore: {score}\n") # verifyruns uses a fake submission id of -1 if submission_id != -1: with self.bot.leaderboard_db as db: @@ -145,7 +153,7 @@ async def submit_leaderboard( # noqa: C901 value.end, mode=key, runner=gpu_type.name, - score=None if key != "leaderboard" else score, + score=None if (key != "leaderboard" and key != "reference") else score, secret=mode == SubmissionMode.PRIVATE, compilation=value.compilation, result=value.run, diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index 3b979e69..e570beec 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -345,6 +345,11 @@ def create_submission_run( result: RunResult, system: SystemInfo, ): + print(f"\n\nCreating run for submission {submission} with mode {mode} and runner {runner}\n\n") + print(f"Result: {result}\n") + print(f"System: {system}\n") + print(f"Compilation: {compilation}\n") + print(f"Score: {score}\n") try: if compilation is not None: compilation = json.dumps(dataclasses.asdict(compilation)) diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index b7901214..5572c95c 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -228,8 +228,6 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n result.append("❌ Reference run failed") else: result.append("✅ Reference run successful") - elif full: - result.append("❌ Reference missing") return result From 89679562cf637397cfdbd15d8a0f96dcf052f8f3 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 16:04:13 -0700 Subject: [PATCH 34/42] cleanup --- examples/eval.py | 5 -- src/discord-cluster-manager/cogs/admin_cog.py | 64 ++++++++++++++++++- .../cogs/leaderboard_cog.py | 33 +--------- .../cogs/submit_cog.py | 1 - src/discord-cluster-manager/leaderboard_db.py | 18 ++---- src/discord-cluster-manager/utils.py | 1 - 6 files changed, 68 insertions(+), 54 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index aa77e3ae..a66fb45b 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -345,7 +345,6 @@ def main(): tests = get_test_cases(sys.argv[2], seed) with PopcornOutput(int(fd)) as logger: - logger.log("debug message", f"running in mode {mode}") import multiprocessing mp_context = multiprocessing.get_context('spawn') with mp_context.Pool(1) as pool: @@ -358,10 +357,6 @@ def main(): is_reference_run = mode == "reference" # warmup run_single_benchmark(pool, tests[0], False, 100, 1e7, is_reference_run) - if is_reference_run: - logger.log("debug message", "Running reference run") - else: - logger.log("debug message", "Running leaderboard run") logger.log("benchmark-count", len(tests)) passed = True for i in range(len(tests)): diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py index 8e39ee2f..7fc480bd 100644 --- 
a/src/discord-cluster-manager/cogs/admin_cog.py +++ b/src/discord-cluster-manager/cogs/admin_cog.py @@ -1,6 +1,7 @@ import json import subprocess import tempfile +import yaml from datetime import datetime, timedelta, timezone from decimal import Decimal from io import StringIO @@ -9,8 +10,7 @@ import discord import env -import yaml -from consts import GitHubGPU, ModalGPU +from consts import GitHubGPU, ModalGPU, SubmissionMode from discord import app_commands from discord.ext import commands, tasks from leaderboard_db import leaderboard_name_autocomplete @@ -24,6 +24,7 @@ setup_logging, with_error_handling, ) +from cogs.leaderboard_cog import LeaderboardSubmitCog if TYPE_CHECKING: from ..bot import ClusterBot @@ -120,6 +121,10 @@ def __init__(self, bot: "ClusterBot"): name="set-forum-ids", description="Sets forum IDs" )(self.set_forum_ids) + self.reference_run = bot.admin_group.command( + name="reference-run", description="Create a reference run for a leaderboard" + )(self.reference_run) + self._scheduled_cleanup_temp_users.start() # -------------------------------------------------------------------------- @@ -1025,3 +1030,58 @@ async def set_forum_ids(self, interaction: discord.Interaction): error_message = f"Error updating forum ids: {str(e)}" logger.error(error_message, exc_info=True) await send_discord_message(interaction, error_message, ephemeral=True) + + # ---------------------------------------------------------------------- + # Reference run submission (admin only) + # ---------------------------------------------------------------------- + @discord.app_commands.describe( + leaderboard_name="Name of the leaderboard to create a reference run for", + gpu="GPU(s) to use; leave empty for interactive selection", + force="Create another reference run even if one already exists.", + ) + @discord.app_commands.autocomplete( + leaderboard_name=leaderboard_name_autocomplete, + ) + @with_error_handling + async def reference_run( + self, + interaction: discord.Interaction, + leaderboard_name: str, + gpu: Optional[str] = None, + force: bool = False, + ): + """Admin command to create (or force-create) a reference run.""" + + # Ensure caller is admin + is_admin = await self.admin_check(interaction) + if not is_admin: + await send_discord_message( + interaction, + "You need Admin permissions to run this command.", + ephemeral=True, + ) + return + + # Check for existing reference run unless forcing + if not force: + with self.bot.leaderboard_db as db: + if db.has_reference_run(leaderboard_name): + await send_discord_message( + interaction, + ( + "A reference run already exists for this leaderboard. " + "Use the 'force' flag to create another." 
+ ), + ephemeral=True, + ) + return + + lb_cog = LeaderboardSubmitCog(self.bot) + + await lb_cog.submit( + interaction=interaction, + leaderboard_name=leaderboard_name, + script=None, + mode=SubmissionMode.REFERENCE, + gpu=gpu, + ) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index df234134..d4a9b49a 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -221,7 +221,6 @@ def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): with self.bot.leaderboard_db as db: sub_data: SubmissionItem = db.get_submission_by_id(sub_id) - print(f"sub_data: {sub_data}") result_lines = [] for run in sub_data["runs"]: if ( @@ -230,7 +229,6 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): and run["passed"] ): result_lines.append(self.generate_run_verdict(run, sub_data)) - print(f"result_lines: {result_lines}") if len(result_lines) > 0: await send_discord_message( interaction, @@ -355,34 +353,6 @@ async def submit_ranked( interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu ) - @app_commands.command(name="reference_run", description="Create a reference run for a leaderboard") - @app_commands.describe( - leaderboard_name="Name of the leaderboard to create a reference run for", - gpu="Select GPU. Leave empty for interactive selection.", - ) - @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) - @with_error_handling - async def submit_reference( - self, - interaction: discord.Interaction, - leaderboard_name: str, - gpu: Optional[str] = None, - ): - # Check if reference run already exists - # with self.bot.leaderboard_db as db: - # if db.has_reference_run(leaderboard_name): - # await send_discord_message( - # interaction, - # f"A reference run for leaderboard '{leaderboard_name}' already exists.", - # ephemeral=True, - # ) - # return - # Process as a special submission - return await self.submit( - interaction, leaderboard_name, None, mode=SubmissionMode.REFERENCE, gpu=gpu - ) - - async def lang_autocomplete( interaction: discord.Interaction, current: str, @@ -493,8 +463,7 @@ async def _display_lb_submissions_helper( processed_submissions = [ { "Rank": submission["rank"], - # "User": await get_user_from_id(self.bot, submission["user_id"]), - "User": submission["user_id"], + "User": await get_user_from_id(self.bot, submission["user_id"]), "Score": f"{format_time(float(submission['submission_score']) * 1e9)}", "Submission Name": submission["submission_name"], } diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index af26077b..d689fea0 100644 --- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -142,7 +142,6 @@ async def submit_leaderboard( # noqa: C901 score = sum(scores) / len(scores) elif task.ranking_by == RankCriterion.GEOM: score = math.pow(math.prod(scores), 1.0 / num_benchmarks) - print(f"\nScore: {score}\n") # verifyruns uses a fake submission id of -1 if submission_id != -1: with self.bot.leaderboard_db as db: diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index e570beec..d3f43b87 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -298,15 +298,12 @@ def 
has_reference_run(self, leaderboard_name: str) -> bool: try: self.cursor.execute( """ - SELECT COUNT(*) FROM leaderboard.runs - WHERE leaderboard.runs.leaderboard_id = ( - SELECT leaderboard.leaderboard.id - FROM leaderboard.leaderboard - WHERE leaderboard.leaderboard.name = %s - ) - AND leaderboard.runs.user_id = %s; + SELECT COUNT(*) FROM leaderboard.runs r + JOIN leaderboard.submission s ON r.submission_id = s.id + JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id + WHERE l.name = %s AND s.user_id = %s """, - (leaderboard_name, REFERENCE_USER_ID), + (leaderboard_name, str(REFERENCE_USER_ID)), ) return self.cursor.fetchone()[0] > 0 except psycopg2.Error as e: @@ -345,11 +342,6 @@ def create_submission_run( result: RunResult, system: SystemInfo, ): - print(f"\n\nCreating run for submission {submission} with mode {mode} and runner {runner}\n\n") - print(f"Result: {result}\n") - print(f"System: {system}\n") - print(f"Compilation: {compilation}\n") - print(f"Score: {score}\n") try: if compilation is not None: compilation = json.dumps(dataclasses.asdict(compilation)) diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 769debdb..71b4f079 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -263,7 +263,6 @@ def build_task_config( all_files[n] = submission_content else: all_files[n] = c - print(f"all_files: {all_files}") common = { "lang": task.lang.value, "arch": arch, From 4bc6e8d7bdbe00d5551e75f70e34f3348f4f6fe0 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 19:12:57 -0700 Subject: [PATCH 35/42] test push --- examples/eval.py | 20 +++---- src/discord-cluster-manager/cogs/admin_cog.py | 28 ++++----- .../cogs/leaderboard_cog.py | 60 +++++-------------- .../cogs/submit_cog.py | 16 ++--- src/discord-cluster-manager/consts.py | 17 ++++-- src/discord-cluster-manager/eval.py | 59 +++++------------- src/discord-cluster-manager/leaderboard_db.py | 8 +-- src/discord-cluster-manager/report.py | 10 ++-- src/discord-cluster-manager/run_eval.py | 30 ++++------ src/discord-cluster-manager/submission.py | 4 +- src/discord-cluster-manager/utils.py | 8 +-- 11 files changed, 99 insertions(+), 161 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index a66fb45b..f80bfcfa 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -21,9 +21,9 @@ # ----------------------------------------------------------------------------- -# Determine which kernel to use (reference or submission) +# Determine which kernel to use (baseline or submission) # ----------------------------------------------------------------------------- -MODE_REFERENCE_STRING = "reference" # Define the string to check for mode +MODE_BASELINE_STRING = "baseline" # Define the string to check for mode class PopcornOutput: def __init__(self, fd: int): self.file = os.fdopen(fd, 'w') @@ -202,11 +202,11 @@ def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[T return 112 -def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_reference_run: bool) -> Stats | Any: +def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_baseline_run: bool) -> Stats | Any: """ Runs one benchmark. Do not call directly. 
""" - if not is_reference_run: + if not is_baseline_run: # submission does not exist for a reference run from submission import custom_kernel @@ -214,7 +214,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t # generate input data once data = generate_input(**test.args) check_copy = _clone_data(data) - active_kernel = ref_kernel if is_reference_run else custom_kernel + active_kernel = ref_kernel if is_baseline_run else custom_kernel # first, one obligatory correctness check output = active_kernel(data) good, message = wrap_check_implementation(check_copy, output) @@ -256,7 +256,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t return calculate_stats(durations) -def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_reference_run: bool = False): +def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_baseline_run: bool = False): """ For a particular test case, check correctness (if applicable) and grab runtime results. @@ -267,7 +267,7 @@ def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bo @param max_time_ns: Timeout time in nanoseconds. @return: A Stats object for this particular benchmark case or an error if the test fails. """ - return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns, is_reference_run)) + return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns, is_baseline_run)) def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]): @@ -354,13 +354,13 @@ def main(): return run_benchmarking(logger, pool, tests) if mode == "leaderboard" or mode == "reference": - is_reference_run = mode == "reference" + is_baseline_run = mode == "reference" # warmup - run_single_benchmark(pool, tests[0], False, 100, 1e7, is_reference_run) + run_single_benchmark(pool, tests[0], False, 100, 1e7, is_baseline_run) logger.log("benchmark-count", len(tests)) passed = True for i in range(len(tests)): - result = run_single_benchmark(pool, tests[i], True, 100, 30e9, is_reference_run) + result = run_single_benchmark(pool, tests[i], True, 100, 30e9, is_baseline_run) logger.log(f"benchmark.{i}.spec", tests[i].spec) if isinstance(result, Stats): for field in dataclasses.fields(Stats): diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py index 7fc480bd..a20c9bb3 100644 --- a/src/discord-cluster-manager/cogs/admin_cog.py +++ b/src/discord-cluster-manager/cogs/admin_cog.py @@ -1,7 +1,6 @@ import json import subprocess import tempfile -import yaml from datetime import datetime, timedelta, timezone from decimal import Decimal from io import StringIO @@ -10,6 +9,8 @@ import discord import env +import yaml +from cogs.leaderboard_cog import LeaderboardSubmitCog from consts import GitHubGPU, ModalGPU, SubmissionMode from discord import app_commands from discord.ext import commands, tasks @@ -24,7 +25,6 @@ setup_logging, with_error_handling, ) -from cogs.leaderboard_cog import LeaderboardSubmitCog if TYPE_CHECKING: from ..bot import ClusterBot @@ -121,9 +121,9 @@ def __init__(self, bot: "ClusterBot"): name="set-forum-ids", description="Sets forum IDs" )(self.set_forum_ids) - self.reference_run = bot.admin_group.command( - name="reference-run", description="Create a reference run for a leaderboard" - )(self.reference_run) + self.baseline_run = 
bot.admin_group.command( + name="baseline-run", description="Create a baseline run for a leaderboard" + )(self.baseline_run) self._scheduled_cleanup_temp_users.start() @@ -1032,25 +1032,25 @@ async def set_forum_ids(self, interaction: discord.Interaction): await send_discord_message(interaction, error_message, ephemeral=True) # ---------------------------------------------------------------------- - # Reference run submission (admin only) + # Baseline run submission (admin only) # ---------------------------------------------------------------------- @discord.app_commands.describe( - leaderboard_name="Name of the leaderboard to create a reference run for", + leaderboard_name="Name of the leaderboard to create a baseline run for", gpu="GPU(s) to use; leave empty for interactive selection", - force="Create another reference run even if one already exists.", + force="Create another baseline run even if one already exists.", ) @discord.app_commands.autocomplete( leaderboard_name=leaderboard_name_autocomplete, ) @with_error_handling - async def reference_run( + async def baseline_run( self, interaction: discord.Interaction, leaderboard_name: str, gpu: Optional[str] = None, force: bool = False, ): - """Admin command to create (or force-create) a reference run.""" + """Admin command to create (or force-create) a baseline run.""" # Ensure caller is admin is_admin = await self.admin_check(interaction) @@ -1062,14 +1062,14 @@ async def reference_run( ) return - # Check for existing reference run unless forcing + # Check for existing baseline run unless forcing if not force: with self.bot.leaderboard_db as db: - if db.has_reference_run(leaderboard_name): + if db.has_baseline_run(leaderboard_name): await send_discord_message( interaction, ( - "A reference run already exists for this leaderboard. " + "A baseline run already exists for this leaderboard. " "Use the 'force' flag to create another." ), ephemeral=True, @@ -1082,6 +1082,6 @@ async def reference_run( interaction=interaction, leaderboard_name=leaderboard_name, script=None, - mode=SubmissionMode.REFERENCE, + mode=SubmissionMode.BASELINE, gpu=gpu, ) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index d4a9b49a..9171efc6 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -7,6 +7,8 @@ from consts import ( SubmissionMode, get_gpu_by_name, + BASELINE_USER_ID, + BASELINE_USER, ) from discord import app_commands from discord.ext import commands @@ -27,8 +29,6 @@ with_error_handling, ) -from consts import REFERENCE_USER_ID, REFERENCE_USER - if TYPE_CHECKING: from ..bot import ClusterBot @@ -74,10 +74,10 @@ async def on_submit_hook( # noqa: C901 """ if script is None: - if mode != SubmissionMode.REFERENCE: + if mode != SubmissionMode.BASELINE and not script: await send_discord_message( interaction, - "Script attachment is required for this unless submission mode is reference", + "Script attachment is required for this unless submission mode is baseline", ephemeral=True, ) return -1 @@ -94,12 +94,12 @@ async def on_submit_hook( # noqa: C901 interaction, "Could not decode your file. 
Is it UTF-8?", ephemeral=True ) return -1 - if mode == SubmissionMode.REFERENCE: - # create fake reference submission + if mode == SubmissionMode.BASELINE: + # create fake baseline submission file_name = None submission_content = None - user_id = REFERENCE_USER_ID - user_name = REFERENCE_USER + user_id = BASELINE_USER_ID + user_name = BASELINE_USER else: file_name = script.filename submission_content = submission_content @@ -140,8 +140,8 @@ async def on_submit_hook( # noqa: C901 time=datetime.now(), user_name=user_name, ) - if mode == SubmissionMode.REFERENCE: - run_msg = f"Submission **{sub_id}**: is a reference submission for `{req.leaderboard}`" + if mode == SubmissionMode.BASELINE: + run_msg = f"Submission **{sub_id}**: is a baseline submission for `{req.leaderboard}`" else: run_msg = f"Submission **{sub_id}**: `{file_name}` for `{req.leaderboard}`" @@ -182,42 +182,10 @@ async def on_submit_hook( # noqa: C901 with self.bot.leaderboard_db as db: db.mark_submission_done(sub_id) - if mode == SubmissionMode.LEADERBOARD or mode == SubmissionMode.REFERENCE: + if mode == SubmissionMode.LEADERBOARD or mode == SubmissionMode.BASELINE: await self.post_submit_hook(interaction, sub_id) return sub_id - def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): - medals = {1: "🥇 First", 2: "🥈 Second", 3: "🥉 Third"} - - # get the competition - with self.bot.leaderboard_db as db: - competition = db.get_leaderboard_submissions( - sub_data["leaderboard_name"], run["runner"] - ) - # compare against the competition - other_by_user = False - run_time = float(run["score"]) - score_text = format_time(run_time * 1e9) - - for entry in competition: - # can we find our own run? Only if it is the fastest submission by this user - if entry["submission_id"] == sub_data["submission_id"]: - rank = entry["rank"] - if 1 <= rank <= 3: - return f"> {medals[rank]} place on {run['runner']}: {score_text}" - elif rank <= 10: - return f"> {rank}th place on {run['runner']}: {score_text}" - else: - return f"> Personal best on {run['runner']}: {score_text}" - elif entry["user_id"] == sub_data["user_id"]: - other_by_user = True - if other_by_user: - # User already has a submission that is faster - return f"> Successful on {run['runner']}: {score_text}" - else: - # no submission by the user exists - return f"> 🍾 First successful submission on {run['runner']}: {score_text}" - async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): with self.bot.leaderboard_db as db: sub_data: SubmissionItem = db.get_submission_by_id(sub_id) @@ -225,7 +193,7 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): for run in sub_data["runs"]: if ( not run["secret"] - and (run["mode"] == SubmissionMode.LEADERBOARD.value or run["mode"] == SubmissionMode.REFERENCE.value) + and (run["mode"] == SubmissionMode.LEADERBOARD.value or run["mode"] == SubmissionMode.BASELINE.value) and run["passed"] ): result_lines.append(self.generate_run_verdict(run, sub_data)) @@ -254,10 +222,10 @@ async def submit( gpu: Optional[str], ): - if not mode == SubmissionMode.REFERENCE and not script: + if not mode == SubmissionMode.BASELINE and not script: await send_discord_message( interaction, - "Script attachment is required for this unless submission mode is reference", + "Script attachment is required for this unless submission mode is baseline", ephemeral=True, ) return diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index d689fea0..0ac10e2a 100644 
--- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -107,16 +107,16 @@ async def submit_leaderboard( # noqa: C901 ("leaderboard" in result.runs and result.runs["leaderboard"].run.success and result.runs["leaderboard"].run.passed) - or ("reference" in result.runs - and result.runs["reference"].run.success - and result.runs["reference"].run.passed) + or ("baseline" in result.runs + and result.runs["baseline"].run.success + and result.runs["baseline"].run.passed) ): if "leaderboard" in result.runs: key = "leaderboard" - elif "reference" in result.runs: - key = "reference" + elif "baseline" in result.runs: + key = "baseline" else: - raise KernelBotError("Leaderboard or reference run failed") + raise KernelBotError("Leaderboard or baseline run failed") score = 0.0 num_benchmarks = int(result.runs[key].run.result["benchmark-count"]) if task.ranking_by == RankCriterion.LAST: @@ -152,7 +152,7 @@ async def submit_leaderboard( # noqa: C901 value.end, mode=key, runner=gpu_type.name, - score=None if (key != "leaderboard" and key != "reference") else score, + score=None if (key != "leaderboard" and key != "baseline") else score, secret=mode == SubmissionMode.PRIVATE, compilation=value.compilation, result=value.run, @@ -228,7 +228,7 @@ async def _handle_submission( await reporter.update_title(reporter.title + " ✅ success") short_report = make_short_report( - result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD, SubmissionMode.REFERENCE] + result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD] ) await reporter.push(short_report) if mode != SubmissionMode.PRIVATE: diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index cf50fdf2..d0934dd9 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -80,7 +80,7 @@ class SubmissionMode(Enum): """ Different types of submission that can be made: Test: Run tests and give detailed results about passed/failed tests. These have short timeouts. - Benchmark: Run larger benchmarks. Each benchmark is tested once, and then run multiple times. + Benchmark: Run larger benchmarks. Each benchmark is tested once, then run multiple times. Profile: Gather profiling information. One selected benchmark is run under the profiler. No testing is performed in this mode (sometimes, you need to profile deliberately broken code) Leaderboard: Official submission to the leaderboard. 
This first runs public tests, then a @@ -97,7 +97,10 @@ class SubmissionMode(Enum): LEADERBOARD = "leaderboard" PRIVATE = "private" SCRIPT = "script" - REFERENCE = "reference" + BASELINE = "baseline" + + # Alias for backward compatibility; to be removed in future release + REFERENCE = "baseline" class Language(Enum): @@ -159,6 +162,10 @@ class RankCriterion(Enum): torch """ -REFERENCE_USER = "REFERENCE_USER" -REFERENCE_USER_ID = -123 -REFERENCE_TIMING_ARG = "--reference-timing" \ No newline at end of file +# Constants used for baseline runs +BASELINE_USER = "BASELINE_USER" +BASELINE_USER_ID = -123 + +# Aliases for backward compatibility (to be removed in future release) +REFERENCE_USER = BASELINE_USER +REFERENCE_USER_ID = BASELINE_USER_ID \ No newline at end of file diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index 2945ce5c..c36de918 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -1,4 +1,3 @@ -import argparse import math import os import sys @@ -6,7 +5,7 @@ import torch from reference import check_implementation, generate_input, ref_kernel -from consts import REFERENCE_TIMING_ARG +from submission import custom_kernel class PopcornLogger: @@ -18,7 +17,6 @@ def log(self, key: str, value): def correctness(rng: torch.Generator) -> bool: - from submission import custom_kernel for _ in range(10): # check multiple times inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) custom_output = custom_kernel(inputs) @@ -31,25 +29,15 @@ def correctness(rng: torch.Generator) -> bool: return True -def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: bool = False): - print("timing kernel") +def metric(logger: PopcornLogger, rng: torch.Generator): warmup_runs = 10 timed_runs = 100 - if time_reference_impl: - logger.log("Timing Reference Implementation") - else: - # in the case of a reference run we don't have a submission - logger.log("Timing Submitted Custom Implementation") - from submission import custom_kernel # Warmup Code print("warming up...") for _ in range(warmup_runs): inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) - if time_reference_impl: - _ = ref_kernel(inputs) - else: - _ = custom_kernel(inputs) + _ = custom_kernel(inputs) torch.cuda.synchronize() # Timing Code @@ -59,20 +47,16 @@ def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: boo inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item()) start_time = time.time() - if time_reference_impl: - ref_output = ref_kernel(inputs) - else: - custom_output = custom_kernel(inputs) + custom_output = custom_kernel(inputs) torch.cuda.synchronize() end_time = time.time() times.append(end_time - start_time) - if not time_reference_impl: - ref_output = ref_kernel(inputs) - torch.cuda.synchronize() - if not check_implementation(custom_output, ref_output): - logger.log("check", "fail") - exit(112) + ref_output = ref_kernel(inputs) + torch.cuda.synchronize() + if not check_implementation(custom_output, ref_output): + logger.log("check", "fail") + exit(112) total_time = sum(times) average_duration = total_time / timed_runs @@ -87,17 +71,10 @@ def metric(logger: PopcornLogger, rng: torch.Generator, time_reference_impl: boo logger.log("duration.best", min(times) * 1e9) logger.log("duration.worst", max(times) * 1e9) - kernel_name = "Reference" if time_reference_impl else "Submitted" - print(f"{kernel_name} kernel runtime: {average_duration:.4f} ± 
{standard_error:.4} seconds") + print(f"Submitted kernel runtime: {average_duration:.4f} ± {standard_error:.4} seconds") def main(): - parser = argparse.ArgumentParser(description='Evaluate kernel implementation.') - parser.add_argument( - REFERENCE_TIMING_ARG, action='store_true', help='Time ref kernel.' - ) - args = parser.parse_args() - print(f"starting script") try: logger = PopcornLogger(int(os.environ["POPCORN_FD"])) except Exception as e: @@ -107,16 +84,12 @@ def main(): seed = int(os.environ.get("POPCORN_FD", 42)) rng = torch.Generator() rng.manual_seed(seed) - print(f"seed: {seed}") - print(f"time ref: {args.time_ref}") - print(f"correctness: {not args.time_ref}") - if not args.time_ref: - if not correctness(rng): - logger.log("check", "fail") - exit(112) - - metric(logger, rng, time_reference_impl=args.time_ref) + + if not correctness(rng): + logger.log("check", "fail") + exit(112) + metric(logger, rng) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index d3f43b87..e327afab 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -27,7 +27,7 @@ setup_logging, ) -from consts import REFERENCE_USER_ID, REFERENCE_USER +from consts import BASELINE_USER_ID, BASELINE_USER leaderboard_name_cache = LRUCache(max_size=512) @@ -215,7 +215,7 @@ def create_submission( time: datetime.datetime, user_name: str = None, ) -> Optional[int]: - if user_id == REFERENCE_USER_ID and user_name == REFERENCE_USER: + if user_id == BASELINE_USER_ID and user_name == BASELINE_USER: # todo: add reference code to the database code = "" file_name = "reference.py" @@ -294,7 +294,7 @@ def create_submission( self.connection.rollback() # Ensure rollback if error occurs raise KernelBotError("Error during creation of submission") from e - def has_reference_run(self, leaderboard_name: str) -> bool: + def has_baseline_run(self, leaderboard_name: str) -> bool: try: self.cursor.execute( """ @@ -303,7 +303,7 @@ def has_reference_run(self, leaderboard_name: str) -> bool: JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id WHERE l.name = %s AND s.user_id = %s """, - (leaderboard_name, str(REFERENCE_USER_ID)), + (leaderboard_name, str(BASELINE_USER_ID)), ) return self.cursor.fetchone()[0] > 0 except psycopg2.Error as e: diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index 5572c95c..fb43d31f 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -220,14 +220,14 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n elif full: result.append("❌ Leaderboard missing") - if "reference" in runs: - ref_run = runs["reference"].run + if "baseline" in runs: + ref_run = runs["baseline"].run if not ref_run.success: - result.append("❌ Running reference failed" + _short_fail_reason(ref_run)) + result.append("❌ Running baseline failed" + _short_fail_reason(ref_run)) elif not ref_run.passed: - result.append("❌ Reference run failed") + result.append("❌ Baseline run failed") else: - result.append("✅ Reference run successful") + result.append("✅ Baseline run successful") return result diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 4789c638..7614d0bc 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -10,7 +10,7 @@ from types import 
NoneType from typing import Optional, Protocol, Union -from consts import CUDA_FLAGS, ExitCode, REFERENCE_TIMING_ARG, Timeout +from consts import CUDA_FLAGS, ExitCode, Timeout @dataclasses.dataclass @@ -218,8 +218,6 @@ def compile_cuda_script( # # noqa: C901 def run_program(args: list[str], seed: Optional[int], timeout: int) -> RunResult: - print("[Running]") - print("Running with args: %s", args) # set up a pipe so the tester can communicate its verdict with us env = os.environ.copy() pipe_read, pipe_write = os.pipe() @@ -434,6 +432,7 @@ def run_cuda_script( # # noqa: C901 def run_pytorch_script( # noqa: C901 sources: dict[str, str], main: str, + is_baseline: bool = False, **kwargs, ) -> EvalResult: """ @@ -448,17 +447,6 @@ def run_pytorch_script( # noqa: C901 RunResult """ start = datetime.datetime.now() - args = kwargs.get("args", []) - # log everything that's going on - print("Running with kwargs: %s" % kwargs) - print("Running with args: %s" % args) - print("Running with sources: %s" % sources) - print("Running with main: %s" % main) - is_reference = False - if REFERENCE_TIMING_ARG in args: - # pluck out submission.py from sources as it is not needed for the run and is None normally - sources.pop("submission.py", None) - is_reference = True try: assert main in sources.keys() _create_files(sources) @@ -466,7 +454,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. comp = None - if not is_reference: + if not is_baseline: try: compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: @@ -522,7 +510,7 @@ def run_evaluation( require multiple runner calls. """ results: dict[str, EvalResult] = {} - if mode in ["test", "benchmark", "profile", "script", "reference"]: + if mode in ["test", "benchmark", "profile", "script", "baseline"]: results[mode] = call(mode=mode) elif mode in ["private", "leaderboard"]: # first, run the tests @@ -539,7 +527,7 @@ def run_evaluation( # if they pass, run the leaderboard validation results["leaderboard"] = call(mode="leaderboard") else: - raise AssertionError("Invalid mode") + raise AssertionError(f"Invalid mode: {mode}") return results @@ -555,6 +543,12 @@ def build_test_string(tests: list[dict]): def run_config(config: dict): + mode = config["mode"] + is_baseline = False + if mode == "baseline": + config["sources"].pop("submission.py", None) + is_baseline = True + common_args = { "tests": build_test_string(config.get("tests", [])), "benchmarks": build_test_string(config.get("benchmarks", [])), @@ -563,13 +557,13 @@ def run_config(config: dict): "ranked_timeout": config.get("ranked_timeout", Timeout.RANKED), "benchmark_timeout": config.get("benchmark_timeout", Timeout.BENCHMARK), "test_timeout": config.get("test_timeout", Timeout.TEST), - "args": config.get("args", []), } if config["lang"] == "py": runner = functools.partial( run_pytorch_script, sources=config["sources"], main=config["main"], + is_baseline=is_baseline, **common_args, ) elif config["lang"] == "cu": diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 19c0c6a1..090f7dee 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -29,10 +29,10 @@ class ProcessedSubmissionRequest(SubmissionRequest): def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB) -> ProcessedSubmissionRequest: # Detect 
reference submissions (no file name & no code provided) # A reference submission is identified by missing/empty code content (no user file) - is_reference_submission = not req.code + is_baseline_submission = not req.code # Perform filename/content related checks only for *non* reference submissions - if not is_reference_submission: + if not is_baseline_submission: if profanity.contains_profanity(req.file_name): raise KernelBotError("Please provide a non rude filename") diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index 71b4f079..d63a44e1 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, List, NotRequired, Optional, TypedDict import discord -from consts import Language, SubmissionMode, REFERENCE_TIMING_ARG +from consts import Language, SubmissionMode if TYPE_CHECKING: from task import LeaderboardTask @@ -246,8 +246,7 @@ def build_task_config( if lang == "py": config["main"] = "eval.py" args = [] - if mode == SubmissionMode.REFERENCE: - args.append(REFERENCE_TIMING_ARG) + if mode == SubmissionMode.BASELINE: submission_content = "" config["args"] = args return { @@ -274,10 +273,7 @@ def build_task_config( "ranked_timeout": task.ranked_timeout, "ranking_by": task.ranking_by.value, "seed": task.seed, - "args": [], } - if mode == SubmissionMode.REFERENCE: - common["args"].append(REFERENCE_TIMING_ARG) if task.lang == Language.Python: return { From f1eb9a73a5aea8976d4f3940c37fba22333d8119 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 19:28:33 -0700 Subject: [PATCH 36/42] test push --- src/discord-cluster-manager/run_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 7614d0bc..2f6aaf1d 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -296,7 +296,7 @@ def run_single_evaluation( tests_file.write(tests) tests_file.flush() return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout) - elif mode in ["benchmark", "profile", "leaderboard", "reference"]: + elif mode in ["benchmark", "profile", "leaderboard", "baseline"]: timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout with tempfile.NamedTemporaryFile("w") as bench_file: if ranking_by == "last": From 748a5371cd023770c36bf180d4e78195580213dd Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 29 May 2025 19:50:53 -0700 Subject: [PATCH 37/42] fix lint errors and cleanup --- src/discord-cluster-manager/cogs/leaderboard_cog.py | 8 ++++---- src/discord-cluster-manager/consts.py | 2 +- src/discord-cluster-manager/eval.py | 2 +- src/discord-cluster-manager/launchers/github.py | 1 - src/discord-cluster-manager/leaderboard_db.py | 3 +-- src/discord-cluster-manager/report.py | 4 ++-- src/discord-cluster-manager/run_eval.py | 6 ++++-- src/discord-cluster-manager/submission.py | 3 ++- src/discord-cluster-manager/task.py | 2 +- 9 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index 9171efc6..63af6adb 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -5,10 +5,10 @@ import discord from consts import ( + BASELINE_USER, + BASELINE_USER_ID, SubmissionMode, get_gpu_by_name, - BASELINE_USER_ID, - 
BASELINE_USER, ) from discord import app_commands from discord.ext import commands @@ -20,7 +20,6 @@ from utils import ( LeaderboardItem, LeaderboardRankedEntry, - RunItem, SubmissionItem, format_time, get_user_from_id, @@ -193,7 +192,8 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): for run in sub_data["runs"]: if ( not run["secret"] - and (run["mode"] == SubmissionMode.LEADERBOARD.value or run["mode"] == SubmissionMode.BASELINE.value) + and (run["mode"] == SubmissionMode.LEADERBOARD.value + or run["mode"] == SubmissionMode.BASELINE.value) and run["passed"] ): result_lines.append(self.generate_run_verdict(run, sub_data)) diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index d0934dd9..2018ac4a 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -168,4 +168,4 @@ class RankCriterion(Enum): # Aliases for backward compatibility (to be removed in future release) REFERENCE_USER = BASELINE_USER -REFERENCE_USER_ID = BASELINE_USER_ID \ No newline at end of file +REFERENCE_USER_ID = BASELINE_USER_ID diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py index c36de918..d0f693e1 100644 --- a/src/discord-cluster-manager/eval.py +++ b/src/discord-cluster-manager/eval.py @@ -92,4 +92,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/discord-cluster-manager/launchers/github.py b/src/discord-cluster-manager/launchers/github.py index 7a5c13e8..10232ebd 100644 --- a/src/discord-cluster-manager/launchers/github.py +++ b/src/discord-cluster-manager/launchers/github.py @@ -44,7 +44,6 @@ async def run_submission( raise ValueError(f"Invalid GPU type: {gpu_type.value}") lang = config["lang"] - args = config.get("args", []) if lang == "cu" and gpu_vendor == "AMD": # TODO implement HIP raise NotImplementedError("Cannot use CUDA runs with AMD GPUs") diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index e327afab..69a08d7a 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -6,6 +6,7 @@ import discord import psycopg2 +from consts import BASELINE_USER, BASELINE_USER_ID from env import ( DATABASE_URL, DISABLE_SSL, @@ -27,8 +28,6 @@ setup_logging, ) -from consts import BASELINE_USER_ID, BASELINE_USER - leaderboard_name_cache = LRUCache(max_size=512) logger = setup_logging(__name__) diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index fb43d31f..07e3b517 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -160,7 +160,7 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n Creates a minimalistic report for `runs`, returned as a list of status strings """ - + any_compile = False result = [] for r in runs.values(): @@ -219,7 +219,7 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n result.append("✅ Leaderboard run successful") elif full: result.append("❌ Leaderboard missing") - + if "baseline" in runs: ref_run = runs["baseline"].run if not ref_run.success: diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 2f6aaf1d..9cc23bea 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -285,7 +285,6 @@ def run_single_evaluation( ranked_timeout: int = 
Timeout.RANKED, ranking_by: str = "last", seed: Optional[int] = None, - args: Optional[list[str]] = [], ) -> RunResult: """ A single runner run, either in the context of test files, or in the @@ -456,7 +455,10 @@ def run_pytorch_script( # noqa: C901 comp = None if not is_baseline: try: - compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python", + "submission.py"], + seed=1, + timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 090f7dee..37c14de6 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -48,7 +48,8 @@ def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB) -> Processe # Ensure leaderboard name is present (might have come from the command directly) if req.leaderboard is None: raise KernelBotError( - "Missing leaderboard name. Either supply one as a command argument or via ``#!POPCORN leaderboard `` directive.", + "Missing leaderboard name. Either supply one as a command \ + argument or via ``#!POPCORN leaderboard `` directive.", ) leaderboard = lookup_leaderboard(req.leaderboard, lb_db) diff --git a/src/discord-cluster-manager/task.py b/src/discord-cluster-manager/task.py index db3adbb8..8641d4f7 100644 --- a/src/discord-cluster-manager/task.py +++ b/src/discord-cluster-manager/task.py @@ -64,7 +64,7 @@ class LeaderboardTask: ranking_by: RankCriterion = RankCriterion.LAST templates: dict[str, str] = dataclasses.field(default_factory=dict) seed: Optional[int] = None - + @staticmethod def from_dict(data: dict): From 8224cb0fb2d4076d7c7b045264c16cea44ec7a1c Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 09:46:16 -0700 Subject: [PATCH 38/42] fix lint errors and cleanup --- examples/eval.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index f80bfcfa..e7b89104 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -20,10 +20,6 @@ from reference import check_implementation, generate_input, ref_kernel -# ----------------------------------------------------------------------------- -# Determine which kernel to use (baseline or submission) -# ----------------------------------------------------------------------------- -MODE_BASELINE_STRING = "baseline" # Define the string to check for mode class PopcornOutput: def __init__(self, fd: int): self.file = os.fdopen(fd, 'w') @@ -207,7 +203,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t Runs one benchmark. Do not call directly. 
""" if not is_baseline_run: - # submission does not exist for a reference run + # submission does not exist for a baseline run from submission import custom_kernel durations = [] @@ -337,7 +333,6 @@ def main(): return 222 mode = sys.argv[1].strip() - print(f"Running in mode {mode}") seed = os.getenv("POPCORN_SEED") os.unsetenv("POPCORN_SEED") seed = int(seed) if seed else None @@ -353,8 +348,8 @@ def main(): if mode == "benchmark": return run_benchmarking(logger, pool, tests) - if mode == "leaderboard" or mode == "reference": - is_baseline_run = mode == "reference" + if (mode == "leaderboard") or (mode == "baseline"): + is_baseline_run = mode == "baseline" # warmup run_single_benchmark(pool, tests[0], False, 100, 1e7, is_baseline_run) logger.log("benchmark-count", len(tests)) From 9ea4e778eea6829cc13bd2e203d94b1fcfb61567 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 10:10:01 -0700 Subject: [PATCH 39/42] final cleanup --- .github/workflows/nvidia_workflow.yml | 5 +-- .../cogs/leaderboard_cog.py | 33 +++++++++++++++++++ src/discord-cluster-manager/consts.py | 8 ----- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 78d950da..52eb4a81 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -18,10 +18,7 @@ jobs: container: image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v5 diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index 63af6adb..354752f7 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -20,6 +20,7 @@ from utils import ( LeaderboardItem, LeaderboardRankedEntry, + RunItem, SubmissionItem, format_time, get_user_from_id, @@ -185,6 +186,38 @@ async def on_submit_hook( # noqa: C901 await self.post_submit_hook(interaction, sub_id) return sub_id + def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): + medals = {1: "🥇 First", 2: "🥈 Second", 3: "🥉 Third"} + + # get the competition + with self.bot.leaderboard_db as db: + competition = db.get_leaderboard_submissions( + sub_data["leaderboard_name"], run["runner"] + ) + # compare against the competition + other_by_user = False + run_time = float(run["score"]) + score_text = format_time(run_time * 1e9) + + for entry in competition: + # can we find our own run? 
Only if it is the fastest submission by this user + if entry["submission_id"] == sub_data["submission_id"]: + rank = entry["rank"] + if 1 <= rank <= 3: + return f"> {medals[rank]} place on {run['runner']}: {score_text}" + elif rank <= 10: + return f"> {rank}th place on {run['runner']}: {score_text}" + else: + return f"> Personal best on {run['runner']}: {score_text}" + elif entry["user_id"] == sub_data["user_id"]: + other_by_user = True + if other_by_user: + # User already has a submission that is faster + return f"> Successful on {run['runner']}: {score_text}" + else: + # no submission by the user exists + return f"> 🍾 First successful submission on {run['runner']}: {score_text}" + async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): with self.bot.leaderboard_db as db: sub_data: SubmissionItem = db.get_submission_by_id(sub_id) diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 2018ac4a..03ada325 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -99,10 +99,6 @@ class SubmissionMode(Enum): SCRIPT = "script" BASELINE = "baseline" - # Alias for backward compatibility; to be removed in future release - REFERENCE = "baseline" - - class Language(Enum): Python = "py" CUDA = "cu" @@ -165,7 +161,3 @@ class RankCriterion(Enum): # Constants used for baseline runs BASELINE_USER = "BASELINE_USER" BASELINE_USER_ID = -123 - -# Aliases for backward compatibility (to be removed in future release) -REFERENCE_USER = BASELINE_USER -REFERENCE_USER_ID = BASELINE_USER_ID From d9b2d0245fc64a2983c197a14504acfc39875e81 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 10:57:12 -0700 Subject: [PATCH 40/42] get full error to fix bug in ci --- scripts/ci_test_python.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py index d36a7c9a..410a0a55 100644 --- a/scripts/ci_test_python.py +++ b/scripts/ci_test_python.py @@ -49,6 +49,7 @@ def custom_kernel(input): """ run = run_pytorch_helper({**files, "submission.py": sub}) + print(f"full run is: \n {run}") assert run.success is True assert run.passed is False assert "python eval.py test" in run.command From ad9e7582ba89d67e99731d97456bcd840ac8abbc Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 11:02:15 -0700 Subject: [PATCH 41/42] fix ci bug --- examples/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/eval.py b/examples/eval.py index e7b89104..37d3eef9 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -156,7 +156,7 @@ def _run_single_test(test: TestCase): from submission import custom_kernel data = generate_input(**test.args) torch.cuda.synchronize() - submission_output = active_kernel(_clone_data(data)) + submission_output = custom_kernel(_clone_data(data)) torch.cuda.synchronize() return wrap_check_implementation(data, submission_output) From 6a365ff82cfa6a2a1b304b2fd4bf64a94f8cd963 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 30 May 2025 11:02:23 -0700 Subject: [PATCH 42/42] fix ci bug --- scripts/ci_test_python.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py index 410a0a55..d36a7c9a 100644 --- a/scripts/ci_test_python.py +++ b/scripts/ci_test_python.py @@ -49,7 +49,6 @@ def custom_kernel(input): """ run = run_pytorch_helper({**files, "submission.py": sub}) - print(f"full run is: \n {run}") assert run.success is True assert run.passed is False 
assert "python eval.py test" in run.command