2 changes: 2 additions & 0 deletions src/discord-cluster-manager/consts.py
@@ -29,6 +29,7 @@ class ModalGPU(Enum):
     A100 = "A100"
     H100 = "H100"
     B200 = "B200"
+    H200 = "H200"
 
 
 @dataclasses.dataclass
@@ -115,6 +116,7 @@ class RankCriterion(Enum):
     "L4": "80",
     "A100": "80",
     "H100": "90a",
+    "H200": "90a",
Member:
should we enable H200 tho?

Collaborator (Author):

Don't have a strong opinion; it's probably not a very interesting GPU, so I don't care if we remove it.

Member:
yeah, I'd rather we remove it

"B200": "100",
"NVIDIA": None,
"MI300": None,
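For context, the string values in this mapping read as CUDA compute-capability codes (sm_90a for Hopper, sm_100 for Blackwell), and a plausible consumer turns them into nvcc architecture flags. Below is a minimal sketch of that idea using a hypothetical helper name and the standard -gencode flag format; none of it is code from this PR:

```python
# Hypothetical sketch: map a GPU name to nvcc architecture flags using the
# compute-capability strings from the table above. None means "no explicit arch"
# (e.g. the MI300 is an AMD GPU and takes no CUDA arch flag).
GPU_TO_SM = {"A100": "80", "H100": "90a", "H200": "90a", "B200": "100", "MI300": None}

def nvcc_arch_flags(gpu: str) -> list[str]:
    sm = GPU_TO_SM.get(gpu)
    if sm is None:
        return []  # let the compiler fall back to its default target
    return [f"-gencode=arch=compute_{sm},code=sm_{sm}"]

# Example: nvcc_arch_flags("H200") -> ["-gencode=arch=compute_90a,code=sm_90a"]
```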
9 changes: 9 additions & 0 deletions src/discord-cluster-manager/modal_runner.py
@@ -19,6 +19,8 @@
 tag = f"{cuda_version}-{flavor}-{operating_sys}"
 
 # Move this to another file later:
+
+
 cuda_image = (
     Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11")
     .apt_install(
@@ -50,13 +52,20 @@
     .pip_install("requests")
 )
 
+
 cuda_image = cuda_image.add_local_python_source(
     "consts",
     "modal_runner",
     "modal_runner_archs",
     "run_eval",
 )
 
+cuda_image_b200 = (
+    Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu24.04", add_python="3.11")
+    .pip_install("ninja", "packaging", "requests")
+    .pip_install("torch==2.7.0", extra_index_url="https://download.pytorch.org/whl/cu128")
+)
+
 
 class TimeoutException(Exception):
     pass
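Side note on the dedicated B200 image: Blackwell (sm_100) kernels need CUDA 12.8 and the cu128 PyTorch wheels, which is presumably why cuda_image_b200 is built from nvidia/cuda:12.8.0 rather than from the shared tag. A minimal, self-contained sketch of how this image could be smoke-tested on Modal; the app name and function below are illustrative, not part of this PR:

```python
import modal

app = modal.App("b200-smoke-test")  # illustrative app name

cuda_image_b200 = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu24.04", add_python="3.11")
    .pip_install("ninja", "packaging", "requests")
    .pip_install("torch==2.7.0", extra_index_url="https://download.pytorch.org/whl/cu128")
)

@app.function(gpu="B200", image=cuda_image_b200)
def check_device() -> str:
    import torch
    # On sm_100 hardware, get_device_capability() should report (10, 0).
    return f"{torch.cuda.get_device_name(0)} capability={torch.cuda.get_device_capability(0)}"

@app.local_entrypoint()
def main():
    print(check_device.remote())
```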
36 changes: 4 additions & 32 deletions src/discord-cluster-manager/modal_runner_archs.py
@@ -1,42 +1,14 @@
 # This file contains wrapper functions for running
 # Modal apps on specific devices. We will fix this later.
-from modal_runner import app, cuda_image, modal_run_config
-from modal_utils import deserialize_full_result
-from run_eval import FullResult, SystemInfo
+from modal_runner import app, cuda_image, cuda_image_b200, modal_run_config
 
-gpus = ["T4", "L4", "A100-80GB", "H100!"]
+gpus = ["T4", "L4", "A100-80GB", "H100!", "B200", "H200"]
 for gpu in gpus:
     gpu_slug = gpu.lower().split("-")[0].strip("!")
     app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True)(
         modal_run_config
     )
-    app.function(gpu=gpu, image=cuda_image, name=f"run_pytorch_script_{gpu_slug}", serialized=True)(
+    img = cuda_image if gpu != "B200" else cuda_image_b200
+    app.function(gpu=gpu, image=img, name=f"run_pytorch_script_{gpu_slug}", serialized=True)(
         modal_run_config
     )
-
-
-@app.function(image=cuda_image, max_containers=1, timeout=600)
-def run_pytorch_script_b200(config: dict, timeout: int = 300):
-    """Send a config and timeout to the server and return the response."""
-    import requests
-
-    ip_addr = "34.59.196.5"
-    port = "33001"
-
-    payload = {"config": config, "timeout": timeout}
-
-    try:
-        response = requests.post(f"http://{ip_addr}:{port}", json=payload, timeout=timeout + 5)
-        response.raise_for_status()
-        print("ORIGINAL", response.json())
-
-        print("DESERIALIZED", deserialize_full_result(response.json()))
-        return deserialize_full_result(response.json())
-    except requests.RequestException as e:
-        return FullResult(success=False, error=str(e), runs={}, system=SystemInfo())
-
-
-@app.local_entrypoint()
-def test_b200(timeout: int = 300):
-    config = {}
-    run_pytorch_script_b200.remote(config, timeout)
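A note on the registration loop above: calling app.function(...) directly (rather than using it as a decorator) registers modal_run_config once per GPU under names like run_pytorch_script_b200, so callers can target a specific GPU by function name. A hedged sketch of the lookup side, assuming the deployed app is named "discord-cluster-manager" (the real app name may differ):

```python
import modal

# Look up one of the per-GPU functions registered by the loop and invoke it
# remotely. modal_run_config's exact signature is defined in modal_runner.py;
# a plain config dict is assumed here for illustration.
run_b200 = modal.Function.from_name("discord-cluster-manager", "run_pytorch_script_b200")
result = run_b200.remote({"lang": "py"})  # illustrative config payload
```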