From 059ea0fec4fc29574777dacb1060508b966f6e48 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 31 May 2025 00:03:00 +0200
Subject: [PATCH 1/2] =?UTF-8?q?Feat:=20enable=20new=20modal=20gpus=20?=
 =?UTF-8?q?=F0=9F=9A=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/discord-cluster-manager/consts.py             | 2 ++
 src/discord-cluster-manager/modal_runner_archs.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py
index 928f59d4..bf0cec35 100644
--- a/src/discord-cluster-manager/consts.py
+++ b/src/discord-cluster-manager/consts.py
@@ -29,6 +29,7 @@ class ModalGPU(Enum):
     A100 = "A100"
     H100 = "H100"
     B200 = "B200"
+    H200 = "H200"
 
 
 @dataclasses.dataclass
@@ -115,6 +116,7 @@ class RankCriterion(Enum):
     "L4": "80",
     "A100": "80",
     "H100": "90a",
+    "H200": "90a",
     "B200": "100",
     "NVIDIA": None,
     "MI300": None,
diff --git a/src/discord-cluster-manager/modal_runner_archs.py b/src/discord-cluster-manager/modal_runner_archs.py
index 75cd45bf..1c19fc06 100644
--- a/src/discord-cluster-manager/modal_runner_archs.py
+++ b/src/discord-cluster-manager/modal_runner_archs.py
@@ -4,7 +4,7 @@
 from modal_utils import deserialize_full_result
 from run_eval import FullResult, SystemInfo
 
-gpus = ["T4", "L4", "A100-80GB", "H100!"]
+gpus = ["T4", "L4", "A100-80GB", "H100!", "B200", "H200"]
 for gpu in gpus:
     gpu_slug = gpu.lower().split("-")[0].strip("!")
     app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True)(

From 562ff9ea77221bc311e0c138656ea7a8b1664c01 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 31 May 2025 02:46:29 +0200
Subject: [PATCH 2/2] Feat: enable B200 on Modal via CUDA 12.8 image, drop HTTP proxy

---
 src/discord-cluster-manager/modal_runner.py   |  9 +++++
 .../modal_runner_archs.py                     | 34 ++-----------------
 2 files changed, 12 insertions(+), 31 deletions(-)

diff --git a/src/discord-cluster-manager/modal_runner.py b/src/discord-cluster-manager/modal_runner.py
index 5766e8a2..d4f70feb 100644
--- a/src/discord-cluster-manager/modal_runner.py
+++ b/src/discord-cluster-manager/modal_runner.py
@@ -19,6 +19,8 @@
 tag = f"{cuda_version}-{flavor}-{operating_sys}"
 
 # Move this to another file later:
+
+
 cuda_image = (
     Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11")
     .apt_install(
@@ -50,6 +52,7 @@
     .pip_install("requests")
 )
 
+
 cuda_image = cuda_image.add_local_python_source(
     "consts",
     "modal_runner",
@@ -57,6 +60,12 @@
     "run_eval",
 )
 
+cuda_image_b200 = (
+    Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu24.04", add_python="3.11")
+    .pip_install("ninja", "packaging", "requests")
+    .pip_install("torch==2.7.0", extra_index_url="https://download.pytorch.org/whl/cu128")
+)
+
 
 class TimeoutException(Exception):
     pass
diff --git a/src/discord-cluster-manager/modal_runner_archs.py b/src/discord-cluster-manager/modal_runner_archs.py
index 1c19fc06..725e63e6 100644
--- a/src/discord-cluster-manager/modal_runner_archs.py
+++ b/src/discord-cluster-manager/modal_runner_archs.py
@@ -1,8 +1,6 @@
 # This file contains wrapper functions for running
 # Modal apps on specific devices. We will fix this later.
-from modal_runner import app, cuda_image, modal_run_config
-from modal_utils import deserialize_full_result
-from run_eval import FullResult, SystemInfo
+from modal_runner import app, cuda_image, cuda_image_b200, modal_run_config
 
 gpus = ["T4", "L4", "A100-80GB", "H100!", "B200", "H200"]
 for gpu in gpus:
@@ -10,33 +8,7 @@
     app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True)(
         modal_run_config
     )
-    app.function(gpu=gpu, image=cuda_image, name=f"run_pytorch_script_{gpu_slug}", serialized=True)(
+    img = cuda_image if gpu != "B200" else cuda_image_b200
+    app.function(gpu=gpu, image=img, name=f"run_pytorch_script_{gpu_slug}", serialized=True)(
         modal_run_config
     )
-
-
-@app.function(image=cuda_image, max_containers=1, timeout=600)
-def run_pytorch_script_b200(config: dict, timeout: int = 300):
-    """Send a config and timeout to the server and return the response."""
-    import requests
-
-    ip_addr = "34.59.196.5"
-    port = "33001"
-
-    payload = {"config": config, "timeout": timeout}
-
-    try:
-        response = requests.post(f"http://{ip_addr}:{port}", json=payload, timeout=timeout + 5)
-        response.raise_for_status()
-        print("ORIGINAL", response.json())
-
-        print("DESERIALIZED", deserialize_full_result(response.json()))
-        return deserialize_full_result(response.json())
-    except requests.RequestException as e:
-        return FullResult(success=False, error=str(e), runs={}, system=SystemInfo())
-
-
-@app.local_entrypoint()
-def test_b200(timeout: int = 300):
-    config = {}
-    run_pytorch_script_b200.remote(config, timeout)