From 059ea0fec4fc29574777dacb1060508b966f6e48 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 31 May 2025 00:03:00 +0200 Subject: [PATCH 1/2] =?UTF-8?q?Feat:=20enable=20new=20modal=20gpus=20?= =?UTF-8?q?=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/discord-cluster-manager/consts.py | 2 ++ src/discord-cluster-manager/modal_runner_archs.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 928f59d4..bf0cec35 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -29,6 +29,7 @@ class ModalGPU(Enum): A100 = "A100" H100 = "H100" B200 = "B200" + H200 = "H200" @dataclasses.dataclass @@ -115,6 +116,7 @@ class RankCriterion(Enum): "L4": "80", "A100": "80", "H100": "90a", + "H200": "90a", "B200": "100", "NVIDIA": None, "MI300": None, diff --git a/src/discord-cluster-manager/modal_runner_archs.py b/src/discord-cluster-manager/modal_runner_archs.py index 75cd45bf..1c19fc06 100644 --- a/src/discord-cluster-manager/modal_runner_archs.py +++ b/src/discord-cluster-manager/modal_runner_archs.py @@ -4,7 +4,7 @@ from modal_utils import deserialize_full_result from run_eval import FullResult, SystemInfo -gpus = ["T4", "L4", "A100-80GB", "H100!"] +gpus = ["T4", "L4", "A100-80GB", "H100!", "B200", "H200"] for gpu in gpus: gpu_slug = gpu.lower().split("-")[0].strip("!") app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True)( From 562ff9ea77221bc311e0c138656ea7a8b1664c01 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 31 May 2025 02:46:29 +0200 Subject: [PATCH 2/2] Feat: enable B200 --- src/discord-cluster-manager/modal_runner.py | 9 +++++ .../modal_runner_archs.py | 34 ++----------------- 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/src/discord-cluster-manager/modal_runner.py b/src/discord-cluster-manager/modal_runner.py index 5766e8a2..d4f70feb 100644 --- a/src/discord-cluster-manager/modal_runner.py +++ b/src/discord-cluster-manager/modal_runner.py @@ -19,6 +19,8 @@ tag = f"{cuda_version}-{flavor}-{operating_sys}" # Move this to another file later: + + cuda_image = ( Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11") .apt_install( @@ -50,6 +52,7 @@ .pip_install("requests") ) + cuda_image = cuda_image.add_local_python_source( "consts", "modal_runner", @@ -57,6 +60,12 @@ "run_eval", ) +cuda_image_b200 = ( + Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu24.04", add_python="3.11") + .pip_install("ninja", "packaging", "requests") + .pip_install("torch==2.7.0", extra_index_url="https://download.pytorch.org/whl/cu128") +) + class TimeoutException(Exception): pass diff --git a/src/discord-cluster-manager/modal_runner_archs.py b/src/discord-cluster-manager/modal_runner_archs.py index 1c19fc06..725e63e6 100644 --- a/src/discord-cluster-manager/modal_runner_archs.py +++ b/src/discord-cluster-manager/modal_runner_archs.py @@ -1,8 +1,6 @@ # This file contains wrapper functions for running # Modal apps on specific devices. We will fix this later. -from modal_runner import app, cuda_image, modal_run_config -from modal_utils import deserialize_full_result -from run_eval import FullResult, SystemInfo +from modal_runner import app, cuda_image, cuda_image_b200, modal_run_config gpus = ["T4", "L4", "A100-80GB", "H100!", "B200", "H200"] for gpu in gpus: @@ -10,33 +8,7 @@ app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True)( modal_run_config ) - app.function(gpu=gpu, image=cuda_image, name=f"run_pytorch_script_{gpu_slug}", serialized=True)( + img = cuda_image if gpu != "B200" else cuda_image_b200 + app.function(gpu=gpu, image=img, name=f"run_pytorch_script_{gpu_slug}", serialized=True)( modal_run_config ) - - -@app.function(image=cuda_image, max_containers=1, timeout=600) -def run_pytorch_script_b200(config: dict, timeout: int = 300): - """Send a config and timeout to the server and return the response.""" - import requests - - ip_addr = "34.59.196.5" - port = "33001" - - payload = {"config": config, "timeout": timeout} - - try: - response = requests.post(f"http://{ip_addr}:{port}", json=payload, timeout=timeout + 5) - response.raise_for_status() - print("ORIGINAL", response.json()) - - print("DESERIALIZED", deserialize_full_result(response.json())) - return deserialize_full_result(response.json()) - except requests.RequestException as e: - return FullResult(success=False, error=str(e), runs={}, system=SystemInfo()) - - -@app.local_entrypoint() -def test_b200(timeout: int = 300): - config = {} - run_pytorch_script_b200.remote(config, timeout)