Skip to content

Commit 13270d4

Browse files
authored
Merge pull request #4623 from Neelabh94/feature/dcgm-dev-package
Add "datacenter-gpu-manager-4-dev" as an additional package installed by the A* blueprint YAML files.
2 parents abd3535 + 046d02d commit 13270d4

File tree

7 files changed

+8
-1
lines changed

examples/hypercompute_clusters/a3u-slurm-ubuntu-gcs/a3u-slurm-ubuntu-gcs.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,7 @@ deployment_groups:
474 474
nvidia_packages:
475 475
- cuda-toolkit-12-8
476 476
- datacenter-gpu-manager
477+
- datacenter-gpu-manager-4-dev
477 478
- libnvidia-cfg1-570-server
478 479
- libnvidia-nscq-570
479 480
- nvidia-compute-utils-570-server

examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ deployment_groups:
186 186
apt install -y cuda-toolkit-12-8
187 187
apt install -y nvidia-container-toolkit
188 188
apt install -y datacenter-gpu-manager-4-cuda12
189+
apt install -y datacenter-gpu-manager-4-dev
189 190
# this duplicates the ulimits configuration of the HPC VM Image
190 191
- type: data
191 192
destination: /etc/security/limits.d/99-unlimited.conf

examples/machine-learning/a3-megagpu-8g/a3mega-slurm-gcsfuse-lssd-blueprint.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,8 @@ deployment_groups:
478 478
enable_nvidia_dcgm: $(vars.enable_nvidia_dcgm)
479 479
nvidia_packages:
480 480
- cuda-toolkit-12-8
481-
- datacenter-gpu-manager
481+
- datacenter-gpu-manager-4-cuda12
482+
- datacenter-gpu-manager-4-dev
482 483
- libnvidia-cfg1-570-server
483 484
- libnvidia-nscq-570
484 485
- nvidia-compute-utils-570-server

examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ deployment_groups:
144 144
add-nvidia-repositories -y
145 145
apt install -y cuda-toolkit-12-8
146 146
apt install -y datacenter-gpu-manager-4-cuda12
147+
apt install -y datacenter-gpu-manager-4-dev
147 148
- type: ansible-local
148 149
destination: settings_nvidia_dcgm.yml
149 150
content: |

examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ deployment_groups:
146 146
apt update -y
147 147
apt install -y cuda-toolkit-12-8
148 148
apt install -y datacenter-gpu-manager-4-cuda12
149+
apt install -y datacenter-gpu-manager-4-dev
149 150
- type: ansible-local
150 151
destination: configure_cuda_dcgm.yml
151 152
content: |

examples/machine-learning/a4x-highgpu-4g/a4xhigh-slurm-blueprint.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ deployment_groups:
162 162
nvidia_packages:
163 163
- cuda-toolkit-12-8
164 164
- datacenter-gpu-manager-4-cuda12
165+
- datacenter-gpu-manager-4-dev
165 166
tasks:
166 167
- name: Download NVIDIA repository package
167 168
ansible.builtin.get_url:

examples/machine-learning/build-service-images/common/blueprint.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ deployment_groups:
101 101
nvidia_packages:
102 102
- cuda-toolkit-12-8
103 103
- datacenter-gpu-manager-4-cuda12
104+
- datacenter-gpu-manager-4-dev
104 105
tasks:
105 106
- name: Download NVIDIA repository package
106 107
ansible.builtin.get_url:

0 commit comments

Comments
 (0)