Skip to content

Commit 5cdeb14

Browse files
authored
Allow amd-smi to fail when rocm-smi succeeds (#14)
We've seen a case where amd-smi fails with: ``` AttributeError: module 'amdsmi.amdsmi_interface' has no attribute 'amdsmi_get_gpu_kfd_info'. Did you mean: 'amdsmi_get_gpu_asic_info'? ``` while rocm-smi succeeds. We assume this is caused by some local installation state, but we can be more forgiving about this case for now anyway.
1 parent 3903ba0 commit 5cdeb14

File tree

1 file changed

+33
-21
lines changed

1 file changed

+33
-21
lines changed

mojo/mojo_host_platform.bzl

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,34 @@ def _get_nvidia_constraint(rctx, lines, gpu_mapping):
5454
_fail(rctx, "Unrecognized nvidia-smi output, please add it to your gpu_mapping in the MODULE.bazel file: {}".format(lines))
5555
return None
5656

57+
def _get_amd_constraints_with_rocm_smi(rctx, rocm_smi, gpu_mapping):
    """Builds the AMD GPU constraint list from rocm-smi output.

    Runs `<rocm_smi> --json --showproductname`, logs the result, and turns the
    decoded JSON object into constraint labels. The number of top-level keys in
    that object is used as the GPU count.

    Args:
        rctx: Repository context used to execute rocm-smi.
        rocm_smi: The rocm-smi executable to run; a falsy value skips detection.
        gpu_mapping: Forwarded unchanged to _get_rocm_constraint.

    Returns:
        A list of constraint labels. The list is empty when rocm_smi is falsy
        or when the command exits with a non-zero return code.
    """
    if not rocm_smi:
        return []

    smi_result = rctx.execute([rocm_smi, "--json", "--showproductname"])
    _log_result(rctx, rocm_smi, smi_result)
    if smi_result.return_code != 0:
        # rocm-smi itself failed: report no AMD constraints.
        return []

    gpus = json.decode(smi_result.stdout)
    gpu_count = len(gpus.keys())
    if gpu_count == 0:
        fail("rocm-smi succeeded but didn't actually have any GPUs, please report this issue")

    detected = []

    # The AMD / has_gpu labels are only added when a GPU-specific constraint
    # was resolved.
    gpu_constraint = _get_rocm_constraint(rctx, gpus, gpu_mapping)
    if gpu_constraint:
        detected.append(gpu_constraint)
        detected.append("@mojo_gpu_toolchains//:amd_gpu")
        detected.append("@mojo_gpu_toolchains//:has_gpu")

    # The count-based labels depend only on how many entries rocm-smi reported.
    if gpu_count > 1:
        detected.append("@mojo_gpu_toolchains//:has_multi_gpu")
    if gpu_count >= 4:
        detected.append("@mojo_gpu_toolchains//:has_4_gpus")

    return detected
84+
5785
def _impl(rctx):
5886
constraints = []
5987

@@ -111,28 +139,12 @@ def _impl(rctx):
111139
constraints.append("@mojo_gpu_toolchains//:has_multi_gpu")
112140
if len(blob) >= 4:
113141
constraints.append("@mojo_gpu_toolchains//:has_4_gpus")
142+
else:
143+
# amd-smi can fail when rocm-smi succeeds, fallback accordingly
144+
constraints.extend(_get_amd_constraints_with_rocm_smi(rctx, rocm_smi, rctx.attr.gpu_mapping))
114145

115-
elif rocm_smi:
116-
result = rctx.execute([rocm_smi, "--json", "--showproductname"])
117-
_log_result(rctx, rocm_smi, result)
118-
119-
if result.return_code == 0:
120-
blob = json.decode(result.stdout)
121-
if len(blob.keys()) == 0:
122-
fail("rocm-smi succeeded but didn't actually have any GPUs, please report this issue")
123-
124-
rocm_constraint = _get_rocm_constraint(rctx, blob, rctx.attr.gpu_mapping)
125-
if rocm_constraint:
126-
constraints.extend([
127-
rocm_constraint,
128-
"@mojo_gpu_toolchains//:amd_gpu",
129-
"@mojo_gpu_toolchains//:has_gpu",
130-
])
131-
132-
if len(blob.keys()) > 1:
133-
constraints.append("@mojo_gpu_toolchains//:has_multi_gpu")
134-
if len(blob.keys()) >= 4:
135-
constraints.append("@mojo_gpu_toolchains//:has_4_gpus")
146+
else:
147+
constraints.extend(_get_amd_constraints_with_rocm_smi(rctx, rocm_smi, rctx.attr.gpu_mapping))
136148

137149
rctx.file("WORKSPACE.bazel", "workspace(name = {})".format(rctx.attr.name))
138150
rctx.file("BUILD.bazel", """

0 commit comments

Comments
 (0)