Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .markdownlint-cli2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ignores:
- "vendor/**/*.md"
5 changes: 2 additions & 3 deletions docs/.markdownlint.yaml → .markdownlint.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
ignores:
- CHANGELOG.md
- "vendor/**"
default: true
MD013: false
MD024:
Expand All @@ -12,3 +9,5 @@ MD029:
MD033: false
MD034: false
MD041: false
ignores:
- "vendor/**/*.md"
11 changes: 6 additions & 5 deletions .spellcheck.yml → .spellcheck.local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@
matrix:
- name: Markdown
sources:
- ['docs/**/*.md', '!docs/doxygen/mainpage.md', '!docs/contributing/documentation-standards.md']
- ['tools/autotag/templates/**/*.md', '!tools/autotag/templates/**/5*.md', '!tools/autotag/templates/**/6.0*.md', '!tools/autotag/templates/**/6.1*.md']
expect_match: false
- '!vendor/**|docs/**/*.md|!docs/doxygen/mainpage.md|!docs/contributing/documentation-standards.md'
expect_match: true
jobs: 4
aspell:
lang: en
dictionary:
Expand Down Expand Up @@ -115,8 +115,9 @@ matrix:
- pyspelling.filters.url:
- name: reST
sources:
- 'docs/**/*.rst'
expect_match: false
- '!vendor/**|docs/**/*.rst'
expect_match: true
jobs: 4
aspell:
lang: en
dictionary:
Expand Down
11 changes: 5 additions & 6 deletions api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ type DriverSpec struct {
// example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"}
// +optional
// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
Image string `json:"image,omitempty"`

// driver image registry TLS setting for the container image
Expand Down Expand Up @@ -251,12 +251,11 @@ type DevicePluginSpec struct {
// +optional
DevicePluginTolerations []v1.Toleration `json:"devicePluginTolerations,omitempty"`

// resource naming strategy for device plugin
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ResourceNamingStrategy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy"}
// +kubebuilder:validation:Enum=single;mixed
// +kubebuilder:default:="single"
// device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
// supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="DevicePluginArguments",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments"}
// +optional
ResourceNamingStrategy string `json:"resourceNamingStrategy,omitempty"`
DevicePluginArguments map[string]string `json:"devicePluginArguments,omitempty"`

// node labeller image
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeLabellerImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerImage"}
Expand Down
7 changes: 7 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 8 additions & 6 deletions bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ metadata:
}
]
capabilities: Basic Install
createdAt: "2025-03-20T06:06:57Z"
createdAt: "2025-03-25T06:19:27Z"
operatorframework.io/suggested-namespace: openshift-amd-gpu
operators.operatorframework.io/builder: operator-sdk-v1.32.0
operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
Expand Down Expand Up @@ -152,6 +152,13 @@ spec:
path: devicePlugin
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin
- description: 'device plugin arguments is used to pass supported flags and
their values while starting device plugin daemonset supported flag values:
{"resource_naming_strategy": {"single", "mixed"}}'
displayName: DevicePluginArguments
path: devicePlugin.devicePluginArguments
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments
- description: device plugin image
displayName: DevicePluginImage
path: devicePlugin.devicePluginImage
Expand Down Expand Up @@ -192,11 +199,6 @@ spec:
path: devicePlugin.nodeLabellerTolerations
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations
- description: resource naming strategy for device plugin
displayName: ResourceNamingStrategy
path: devicePlugin.resourceNamingStrategy
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy
- description: upgrade policy for device plugin and node labeller daemons
displayName: UpgradePolicy
path: devicePlugin.upgradePolicy
Expand Down
16 changes: 8 additions & 8 deletions bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,13 @@ spec:
devicePlugin:
description: device plugin
properties:
devicePluginArguments:
additionalProperties:
type: string
description: |-
device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
type: object
devicePluginImage:
description: device plugin image
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
Expand Down Expand Up @@ -306,13 +313,6 @@ spec:
type: string
type: object
type: array
resourceNamingStrategy:
default: single
description: resource naming strategy for device plugin
enum:
- single
- mixed
type: string
upgradePolicy:
description: upgrade policy for device plugin and node labeller
daemons
Expand Down Expand Up @@ -357,7 +357,7 @@ spec:
for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imageRegistrySecret:
description: secrets used for pull/push images from/to private
Expand Down
12 changes: 7 additions & 5 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@ package main
import (
"flag"

"github.com/ROCm/gpu-operator/internal/configmanager"
"github.com/ROCm/gpu-operator/internal/metricsexporter"
"github.com/ROCm/gpu-operator/internal/testrunner"
kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
Expand All @@ -51,11 +48,15 @@ import (
_ "k8s.io/client-go/plugin/pkg/client/auth"

gpuev1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1"
utils "github.com/ROCm/gpu-operator/internal"
"github.com/ROCm/gpu-operator/internal/cmd"
"github.com/ROCm/gpu-operator/internal/config"
"github.com/ROCm/gpu-operator/internal/configmanager"
"github.com/ROCm/gpu-operator/internal/controllers"
"github.com/ROCm/gpu-operator/internal/kmmmodule"
"github.com/ROCm/gpu-operator/internal/metricsexporter"
"github.com/ROCm/gpu-operator/internal/nodelabeller"
"github.com/ROCm/gpu-operator/internal/testrunner"
//+kubebuilder:scaffold:imports
)

Expand Down Expand Up @@ -107,8 +108,9 @@ func main() {
}

client := mgr.GetClient()
kmmHandler := kmmmodule.NewKMMModule(client, scheme)
nlHandler := nodelabeller.NewNodeLabeller(scheme)
isOpenShift := utils.IsOpenShift(setupLogger)
kmmHandler := kmmmodule.NewKMMModule(client, scheme, isOpenShift)
nlHandler := nodelabeller.NewNodeLabeller(scheme, isOpenShift)
metricsHandler := metricsexporter.NewMetricsExporter(scheme)
testrunnerHandler := testrunner.NewTestRunner(scheme)
configmanagerHandler := configmanager.NewConfigManager(scheme)
Expand Down
16 changes: 8 additions & 8 deletions config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,13 @@ spec:
devicePlugin:
description: device plugin
properties:
devicePluginArguments:
additionalProperties:
type: string
description: |-
device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
type: object
devicePluginImage:
description: device plugin image
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
Expand Down Expand Up @@ -302,13 +309,6 @@ spec:
type: string
type: object
type: array
resourceNamingStrategy:
default: single
description: resource naming strategy for device plugin
enum:
- single
- mixed
type: string
upgradePolicy:
description: upgrade policy for device plugin and node labeller
daemons
Expand Down Expand Up @@ -353,7 +353,7 @@ spec:
for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imageRegistrySecret:
description: secrets used for pull/push images from/to private
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,13 @@ spec:
path: devicePlugin
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin
- description: 'device plugin arguments is used to pass supported flags and
their values while starting device plugin daemonset supported flag values:
{"resource_naming_strategy": {"single", "mixed"}}'
displayName: DevicePluginArguments
path: devicePlugin.devicePluginArguments
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments
- description: device plugin image
displayName: DevicePluginImage
path: devicePlugin.devicePluginImage
Expand Down Expand Up @@ -163,11 +170,6 @@ spec:
path: devicePlugin.nodeLabellerTolerations
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations
- description: resource naming strategy for device plugin
displayName: ResourceNamingStrategy
path: devicePlugin.resourceNamingStrategy
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy
- description: upgrade policy for device plugin and node labeller daemons
displayName: UpgradePolicy
path: devicePlugin.upgradePolicy
Expand Down
3 changes: 0 additions & 3 deletions docs/.markdownlint-cli2.yaml

This file was deleted.

8 changes: 8 additions & 0 deletions docs/installation/openshift-olm.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,14 @@ spec:
"feature.node.kubernetes.io/amd-gpu": "true"
```

Things to note:

1. By default, there is no need to specify the image field in the CR for OpenShift. The default image will be used, which is: `image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod`

2. If users specify an image, `$MOD_NAMESPACE` can be used as a placeholder; the KMM Operator can automatically translate it to the namespace.

3. The OpenShift internal registry restricts the image URL format: OpenShift users cannot use an image like `<registry URL>/<repo name>`; the image URL must be `<registry URL>/<project name or namespace>/<repo name>`. However, if any other registry is being used, the image URL can be of either form.

The operator will:

1. Collect worker node system specifications
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-k8s/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ dependencies:
repository: file://./charts/kmm
version: v1.0.0
digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597
generated: "2025-03-20T06:06:33.9562362Z"
generated: "2025-03-25T06:19:17.248998622Z"
16 changes: 8 additions & 8 deletions helm-charts-k8s/crds/deviceconfig-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,13 @@ spec:
devicePlugin:
description: device plugin
properties:
devicePluginArguments:
additionalProperties:
type: string
description: |-
device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
type: object
devicePluginImage:
description: device plugin image
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
Expand Down Expand Up @@ -310,13 +317,6 @@ spec:
type: string
type: object
type: array
resourceNamingStrategy:
default: single
description: resource naming strategy for device plugin
enum:
- single
- mixed
type: string
upgradePolicy:
description: upgrade policy for device plugin and node labeller
daemons
Expand Down Expand Up @@ -361,7 +361,7 @@ spec:
for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imageRegistrySecret:
description: secrets used for pull/push images from/to private registry
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-openshift/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ dependencies:
repository: file://./charts/kmm
version: v1.0.0
digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
generated: "2025-03-20T06:06:55.80187139Z"
generated: "2025-03-25T06:19:26.060856628Z"
16 changes: 8 additions & 8 deletions helm-charts-openshift/crds/deviceconfig-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,13 @@ spec:
devicePlugin:
description: device plugin
properties:
devicePluginArguments:
additionalProperties:
type: string
description: |-
device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
type: object
devicePluginImage:
description: device plugin image
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
Expand Down Expand Up @@ -310,13 +317,6 @@ spec:
type: string
type: object
type: array
resourceNamingStrategy:
default: single
description: resource naming strategy for device plugin
enum:
- single
- mixed
type: string
upgradePolicy:
description: upgrade policy for device plugin and node labeller
daemons
Expand Down Expand Up @@ -361,7 +361,7 @@ spec:
for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imageRegistrySecret:
description: secrets used for pull/push images from/to private registry
Expand Down
10 changes: 5 additions & 5 deletions internal/controllers/upgrademgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,6 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha
// 1. Set init status for unprocessed nodes
n.helper.handleInitStatus(ctx, &nodeList.Items[i])

if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) {
res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20}
continue
}

// 2. Handle failed nodes
if n.helper.isNodeStateUpgradeFailed(ctx, &nodeList.Items[i], deviceConfig) {
n.helper.clearUpgradeStartTime(nodeList.Items[i].Name)
Expand Down Expand Up @@ -193,6 +188,11 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha
continue
}

if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) {
res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20}
continue
}

//This node is a candidate for selection
candidateNodes = append(candidateNodes, nodeList.Items[i])
}
Expand Down
Loading
Loading