Skip to content

Commit 5be94fd

Browse files
authored
[Feature] GPUOP-309 Make in-cluster driver image build base image configurable (#766) (#234)
1 parent 1cfbf05 commit 5be94fd

20 files changed

+264
-15
lines changed

api/v1alpha1/deviceconfig_types.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,11 @@ type DriverSpec struct {
140140
// +optional
141141
ImageSign ImageSignSpec `json:"imageSign,omitempty"`
142142

143+
// image build configs
144+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ImageBuild",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:imageBuild"}
145+
// +optional
146+
ImageBuild ImageBuildSpec `json:"imageBuild,omitempty"`
147+
143148
// policy to upgrade the drivers
144149
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="UpgradePolicy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:upgradePolicy"}
145150
// +optional
@@ -337,6 +342,17 @@ type ImageSignSpec struct {
337342
CertSecret *v1.LocalObjectReference `json:"certSecret,omitempty"`
338343
}
339344

345+
type ImageBuildSpec struct {
346+
// image registry from which the base image for building the driver image is fetched, default value is docker.io. The builder will search the given registry for the OS base image that matches the worker node
347+
// e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04
348+
// NOTE: this field does not apply to OpenShift, since OpenShift uses its own DriverToolKit image to build the driver image
349+
// +kubebuilder:default=docker.io
350+
BaseImageRegistry string `json:"baseImageRegistry,omitempty"`
351+
352+
// TLS settings for fetching base image
353+
BaseImageRegistryTLS RegistryTLS `json:"baseImageRegistryTLS,omitempty"`
354+
}
355+
340356
// ServiceType string describes ingress methods for a service
341357
type ServiceType string
342358

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ metadata:
3232
capabilities: Seamless Upgrades
3333
categories: AI/Machine Learning,Monitoring
3434
containerImage: docker.io/rocm/gpu-operator:v1.2.0
35-
createdAt: "2025-06-11T22:50:16Z"
35+
createdAt: "2025-06-12T00:51:00Z"
3636
description: |-
3737
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
3838
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -285,6 +285,22 @@ spec:
285285
path: driver.image
286286
x-descriptors:
287287
- urn:alm:descriptor:com.amd.deviceconfigs:image
288+
- description: image build configs
289+
displayName: ImageBuild
290+
path: driver.imageBuild
291+
x-descriptors:
292+
- urn:alm:descriptor:com.amd.deviceconfigs:imageBuild
293+
- description: If true, check if the container image already exists using plain
294+
HTTP.
295+
displayName: Insecure
296+
path: driver.imageBuild.baseImageRegistryTLS.insecure
297+
x-descriptors:
298+
- urn:alm:descriptor:com.amd.deviceconfigs:insecure
299+
- description: If true, skip any TLS server certificate validation
300+
displayName: InsecureSkipTLSVerify
301+
path: driver.imageBuild.baseImageRegistryTLS.insecureSkipTLSVerify
302+
x-descriptors:
303+
- urn:alm:descriptor:com.amd.deviceconfigs:insecureSkipTLSVerify
288304
- description: secrets used for pull/push images from/to private registry specified
289305
in driversImage
290306
displayName: ImageRegistrySecret

bundle/manifests/amd.com_deviceconfigs.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,29 @@ spec:
384384
NOTE: Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository
385385
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
386386
type: string
387+
imageBuild:
388+
description: image build configs
389+
properties:
390+
baseImageRegistry:
391+
default: docker.io
392+
description: |-
393+
image registry from which the base image for building the driver image is fetched, default value is docker.io. The builder will search the given registry for the OS base image that matches the worker node
394+
e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04
395+
NOTE: this field does not apply to OpenShift, since OpenShift uses its own DriverToolKit image to build the driver image
396+
type: string
397+
baseImageRegistryTLS:
398+
description: TLS settings for fetching base image
399+
properties:
400+
insecure:
401+
description: If true, check if the container image already
402+
exists using plain HTTP.
403+
type: boolean
404+
insecureSkipTLSVerify:
405+
description: If true, skip any TLS server certificate
406+
validation
407+
type: boolean
408+
type: object
409+
type: object
387410
imageRegistrySecret:
388411
description: secrets used for pull/push images from/to private
389412
registry specified in driversImage

config/crd/bases/amd.com_deviceconfigs.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,29 @@ spec:
380380
NOTE: Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository
381381
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
382382
type: string
383+
imageBuild:
384+
description: image build configs
385+
properties:
386+
baseImageRegistry:
387+
default: docker.io
388+
description: |-
389+
image registry from which the base image for building the driver image is fetched, default value is docker.io. The builder will search the given registry for the OS base image that matches the worker node
390+
e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04
391+
NOTE: this field does not apply to OpenShift, since OpenShift uses its own DriverToolKit image to build the driver image
392+
type: string
393+
baseImageRegistryTLS:
394+
description: TLS settings for fetching base image
395+
properties:
396+
insecure:
397+
description: If true, check if the container image already
398+
exists using plain HTTP.
399+
type: boolean
400+
insecureSkipTLSVerify:
401+
description: If true, skip any TLS server certificate
402+
validation
403+
type: boolean
404+
type: object
405+
type: object
383406
imageRegistrySecret:
384407
description: secrets used for pull/push images from/to private
385408
registry specified in driversImage

config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,22 @@ spec:
256256
path: driver.image
257257
x-descriptors:
258258
- urn:alm:descriptor:com.amd.deviceconfigs:image
259+
- description: image build configs
260+
displayName: ImageBuild
261+
path: driver.imageBuild
262+
x-descriptors:
263+
- urn:alm:descriptor:com.amd.deviceconfigs:imageBuild
264+
- description: If true, check if the container image already exists using plain
265+
HTTP.
266+
displayName: Insecure
267+
path: driver.imageBuild.baseImageRegistryTLS.insecure
268+
x-descriptors:
269+
- urn:alm:descriptor:com.amd.deviceconfigs:insecure
270+
- description: If true, skip any TLS server certificate validation
271+
displayName: InsecureSkipTLSVerify
272+
path: driver.imageBuild.baseImageRegistryTLS.insecureSkipTLSVerify
273+
x-descriptors:
274+
- urn:alm:descriptor:com.amd.deviceconfigs:insecureSkipTLSVerify
259275
- description: secrets used for pull/push images from/to private registry specified
260276
in driversImage
261277
displayName: ImageRegistrySecret

docs/dcm/device-config-manager-configmap.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ The Device Config Manager (DCM) job is to monitor for and apply different config
44

55
## ConfigMap
66

7-
As mentioned, the `config.json` data specifies different GPU partitioning profiles that can be set on the GPU nodes in your cluster. Below is an example Device Config Manager ConfigMap. This example ConfigMap is also available in the GPU Operator repo here: [_example/configmap.yaml_](https://github.com/pensando/gpu-operator/blob/main/example/configManager/configmap.yaml)
7+
As mentioned, the `config.json` data specifies different GPU partitioning profiles that can be set on the GPU nodes in your cluster. Below is an example Device Config Manager ConfigMap. This example ConfigMap is also available in the GPU Operator repo here: [_example/configmap.yaml_](https://github.com/ROCm/gpu-operator/blob/main/example/configManager/configmap.yaml)
88

99
```yaml
1010
apiVersion: v1

docs/fulldeviceconfig.rst

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
4343
# Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
4444
# Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
4545
blacklist: false
46+
version: "6.4" # Specify the driver version you would like to be installed that coincides with a ROCm version number
4647
# Specify your repository to host driver image
4748
# Note:
4849
# 1. DO NOT include the image tag as AMD GPU Operator will automatically manage the image tag for you
@@ -53,14 +54,36 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
5354
# kubectl create secret docker-registry mysecret -n kmm-namespace --docker-username=xxx --docker-password=xxx
5455
# Make sure you created the secret within the namespace that KMM operator is running
5556
imageRegistrySecret:
56-
name: mysecret
57-
imageRegistryTLS:
58-
insecure: false # If true, check for the container image using plain HTTP
59-
insecureSkipTLSVerify: false # If true, skip any TLS server certificate validation (useful for self-signed certificates)
60-
version: "6.3" # Specify the driver version you would like to be installed that coincides with a ROCm version number
61-
upgradePolicy:
62-
enable: true
63-
maxParallelUpgrades: 3 # (Optional) Number of nodes that will be upgraded in parallel. Default is 1
57+
name: my-image-secret
58+
imageRegistryTLS:
59+
insecure: False # If True, check for the container image using plain HTTP
60+
insecureSkipTLSVerify: False # If True, skip any TLS server certificate validation (useful for self-signed certificates)
61+
upgradePolicy:
62+
enable: true # (Optional) set to true to enable auto driver upgrade, set to false to manage driver upgrade manually
63+
maxParallelUpgrades: 3 # (Optional) Number of nodes that will be upgraded in parallel. Default is 1
64+
# (Optional) specify the secrets that store the private and public keys used to sign the built driver
65+
# nodes with secure boot enabled require the kernel module to be signed before it can be loaded
66+
# you need to register the public key in the system's Machine Owner Key (MOK) database
67+
imageSign:
68+
keySecret:
69+
name: image-sign-private-key-secret
70+
certSecret:
71+
name: image-sign-public-key-secret
72+
# (Optional) configure the driver image build within the cluster
73+
imageBuild:
74+
# configure the registry to search for base image for building driver
75+
# e.g. if you are using worker node with ubuntu 22.04 and baseImageRegistry is docker.io
76+
# image builder will use docker.io/ubuntu:22.04 as base image
77+
baseImageRegistry: docker.io
78+
baseImageRegistryTLS:
79+
insecure: False # If True, check for the container image using plain HTTP
80+
insecureSkipTLSVerify: False # If True, skip any TLS server certificate validation (useful for self-signed certificates)
81+
# (Optional) specify driver toleration so operator can manage out-of-tree drivers on tainted nodes
82+
tolerations:
83+
- key: "example-key"
84+
operator: "Equal"
85+
value: "example-value"
86+
effect: "NoSchedule"
6487
## AMD K8s Device Plugin Configuration ##
6588
commonConfig:
6689
# (Optional) Specify common values used by all components.

hack/k8s-patch/metadata-patch/values.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ deviceConfig:
5454
version: "6.4"
5555
# -- specify the secrets to sign the out-of-tree kernel module inside driver image for secure boot, e.g. input private / public key secret {"keySecret":{"name":"privateKeySecret"},"certSecret":{"name":"publicKeySecret"}}
5656
imageSign: {}
57+
# -- configure the out-of-tree driver image build within the cluster, e.g. {"baseImageRegistry":"docker.io","baseImageRegistryTLS":{"insecure":false,"insecureSkipTLSVerify":false}}
58+
imageBuild: {}
59+
# -- configure driver tolerations so that operator can manage out-of-tree drivers on tainted nodes
60+
tolerations: []
5761
upgradePolicy:
5862
# -- enable/disable automatic driver upgrade feature
5963
enable: true

hack/k8s-patch/template-patch/default-deviceconfig.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,16 @@ spec:
4848
{{- toYaml . | nindent 6 }}
4949
{{- end }}
5050

51+
{{- with .imageBuild }}
52+
imageBuild:
53+
{{- toYaml . | nindent 6 }}
54+
{{- end }}
55+
56+
{{- with .tolerations }}
57+
tolerations:
58+
{{- toYaml . | nindent 6 }}
59+
{{- end }}
60+
5161
{{- with .upgradePolicy }}
5262
upgradePolicy:
5363
{{- toYaml . | nindent 6 }}

0 commit comments

Comments
 (0)