diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml new file mode 100644 index 00000000..9d2459dc --- /dev/null +++ b/.markdownlint-cli2.yaml @@ -0,0 +1,2 @@ +ignores: + - "vendor/**/*.md" \ No newline at end of file diff --git a/docs/.markdownlint.yaml b/.markdownlint.yaml similarity index 82% rename from docs/.markdownlint.yaml rename to .markdownlint.yaml index 21ea9629..3ffda0de 100644 --- a/docs/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -1,6 +1,3 @@ -ignores: - - CHANGELOG.md - - "vendor/**" default: true MD013: false MD024: @@ -12,3 +9,5 @@ MD029: MD033: false MD034: false MD041: false +ignores: + - "vendor/**/*.md" \ No newline at end of file diff --git a/.spellcheck.yml b/.spellcheck.local.yaml similarity index 95% rename from .spellcheck.yml rename to .spellcheck.local.yaml index 9291279a..da2912e5 100644 --- a/.spellcheck.yml +++ b/.spellcheck.local.yaml @@ -33,9 +33,9 @@ matrix: - name: Markdown sources: - - ['docs/**/*.md', '!docs/doxygen/mainpage.md', '!docs/contributing/documentation-standards.md'] - - ['tools/autotag/templates/**/*.md', '!tools/autotag/templates/**/5*.md', '!tools/autotag/templates/**/6.0*.md', '!tools/autotag/templates/**/6.1*.md'] - expect_match: false + - '!vendor/**|docs/**/*.md|!docs/doxygen/mainpage.md|!docs/contributing/documentation-standards.md' + expect_match: true + jobs: 4 aspell: lang: en dictionary: @@ -115,8 +115,9 @@ matrix: - pyspelling.filters.url: - name: reST sources: - - 'docs/**/*.rst' - expect_match: false + - '!vendor/**|docs/**/*.rst' + expect_match: true + jobs: 4 aspell: lang: en dictionary: diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 94a4d6a2..503c0939 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -117,7 +117,7 @@ type DriverSpec struct { // example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"} // +optional - // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` + // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` Image string `json:"image,omitempty"` // driver image registry TLS setting for the container image @@ -251,12 +251,11 @@ type DevicePluginSpec struct { // +optional DevicePluginTolerations []v1.Toleration `json:"devicePluginTolerations,omitempty"` - // resource naming strategy for device plugin - //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ResourceNamingStrategy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy"} - // +kubebuilder:validation:Enum=single;mixed - // +kubebuilder:default:="single" + // device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + // supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="DevicePluginArguments",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments"} // +optional - ResourceNamingStrategy string `json:"resourceNamingStrategy,omitempty"` + DevicePluginArguments map[string]string `json:"devicePluginArguments,omitempty"` // node labeller image 
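An aside on the relaxed `spec.driver.image` pattern above: the change exists so that the OpenShift default image, which embeds `$MOD_NAMESPACE`, passes CRD validation. A standalone check of the old and new patterns (a sketch assuming nothing beyond Go's standard `regexp` package; the two expressions are copied verbatim from the diff):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Old and new spec.driver.image validation patterns, copied from the CRD change above.
	oldPattern := regexp.MustCompile(`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`)
	newPattern := regexp.MustCompile(`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`)

	// The OpenShift default image embeds the $MOD_NAMESPACE placeholder, which the old
	// pattern rejected because '$', upper-case letters and '_' were not allowed in
	// repository path components.
	img := "image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod"
	fmt.Println(oldPattern.MatchString(img)) // false
	fmt.Println(newPattern.MatchString(img)) // true
}
```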
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeLabellerImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerImage"} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index bbbd03c0..c2be36c9 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -267,6 +267,13 @@ func (in *DevicePluginSpec) DeepCopyInto(out *DevicePluginSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DevicePluginArguments != nil { + in, out := &in.DevicePluginArguments, &out.DevicePluginArguments + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } if in.NodeLabellerTolerations != nil { in, out := &in.NodeLabellerTolerations, &out.NodeLabellerTolerations *out = make([]v1.Toleration, len(*in)) diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 495f3670..45078acb 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -30,7 +30,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2025-03-20T06:06:57Z" + createdAt: "2025-03-25T06:19:27Z" operatorframework.io/suggested-namespace: openshift-amd-gpu operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 @@ -152,6 +152,13 @@ spec: path: devicePlugin x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin + - description: 'device plugin arguments is used to pass supported flags and + their values while starting device plugin daemonset supported flag values: + {"resource_naming_strategy": {"single", "mixed"}}' + displayName: DevicePluginArguments + path: devicePlugin.devicePluginArguments + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments - description: device plugin image displayName: DevicePluginImage path: devicePlugin.devicePluginImage @@ -192,11 +199,6 @@ spec: path: devicePlugin.nodeLabellerTolerations x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations - - description: resource naming strategy for device plugin - displayName: ResourceNamingStrategy - path: devicePlugin.resourceNamingStrategy - x-descriptors: - - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy - description: upgrade policy for device plugin and node labeller daemons displayName: UpgradePolicy path: devicePlugin.upgradePolicy diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index a0476c71..c9123ffe 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -190,6 +190,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -306,13 +313,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - 
- mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons @@ -357,7 +357,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/cmd/main.go b/cmd/main.go index b3c985ff..168a730d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -35,9 +35,6 @@ package main import ( "flag" - "github.com/ROCm/gpu-operator/internal/configmanager" - "github.com/ROCm/gpu-operator/internal/metricsexporter" - "github.com/ROCm/gpu-operator/internal/testrunner" kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -51,11 +48,15 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" gpuev1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + utils "github.com/ROCm/gpu-operator/internal" "github.com/ROCm/gpu-operator/internal/cmd" "github.com/ROCm/gpu-operator/internal/config" + "github.com/ROCm/gpu-operator/internal/configmanager" "github.com/ROCm/gpu-operator/internal/controllers" "github.com/ROCm/gpu-operator/internal/kmmmodule" + "github.com/ROCm/gpu-operator/internal/metricsexporter" "github.com/ROCm/gpu-operator/internal/nodelabeller" + "github.com/ROCm/gpu-operator/internal/testrunner" //+kubebuilder:scaffold:imports ) @@ -107,8 +108,9 @@ func main() { } client := mgr.GetClient() - kmmHandler := kmmmodule.NewKMMModule(client, scheme) - nlHandler := nodelabeller.NewNodeLabeller(scheme) + isOpenShift := utils.IsOpenShift(setupLogger) + kmmHandler := kmmmodule.NewKMMModule(client, scheme, isOpenShift) + nlHandler := nodelabeller.NewNodeLabeller(scheme, isOpenShift) metricsHandler := metricsexporter.NewMetricsExporter(scheme) testrunnerHandler := testrunner.NewTestRunner(scheme) configmanagerHandler := configmanager.NewConfigManager(scheme) diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 5f7a02bd..24c2b053 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -186,6 +186,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -302,13 +309,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons @@ -353,7 +353,7 @@ spec: for OpenShift the default value is 
image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index a5b6cd65..a9f4d685 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -123,6 +123,13 @@ spec: path: devicePlugin x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin + - description: 'device plugin arguments is used to pass supported flags and + their values while starting device plugin daemonset supported flag values: + {"resource_naming_strategy": {"single", "mixed"}}' + displayName: DevicePluginArguments + path: devicePlugin.devicePluginArguments + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments - description: device plugin image displayName: DevicePluginImage path: devicePlugin.devicePluginImage @@ -163,11 +170,6 @@ spec: path: devicePlugin.nodeLabellerTolerations x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations - - description: resource naming strategy for device plugin - displayName: ResourceNamingStrategy - path: devicePlugin.resourceNamingStrategy - x-descriptors: - - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy - description: upgrade policy for device plugin and node labeller daemons displayName: UpgradePolicy path: devicePlugin.upgradePolicy diff --git a/docs/.markdownlint-cli2.yaml b/docs/.markdownlint-cli2.yaml deleted file mode 100644 index 74870cb6..00000000 --- a/docs/.markdownlint-cli2.yaml +++ /dev/null @@ -1,3 +0,0 @@ -ignores: - - CHANGELOG.md - - "vendor/**" \ No newline at end of file diff --git a/docs/installation/openshift-olm.md b/docs/installation/openshift-olm.md index 89625fc1..889ada17 100644 --- a/docs/installation/openshift-olm.md +++ b/docs/installation/openshift-olm.md @@ -204,6 +204,14 @@ spec: "feature.node.kubernetes.io/amd-gpu": "true" ``` +Things to note: + +1. By default, there is no need to specify the image field in the CR on OpenShift; the default image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod will be used. + +2. If users do specify an image, $MOD_NAMESPACE can be used as a placeholder; the KMM Operator will automatically translate it to the namespace. + +3. The OpenShift internal registry restricts image URLs: an image of the form registry/image is not accepted, and the URL must also include the namespace, i.e. registry/namespace/image. If any other registry is used, the image URL can take either form. + +The operator will: 1.
Collect worker node system specifications diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 6ad80130..54b4cb8c 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 -generated: "2025-03-20T06:06:33.9562362Z" +generated: "2025-03-25T06:19:17.248998622Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 6058c151..502f4b89 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -194,6 +194,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -310,13 +317,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons @@ -361,7 +361,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index 6e3c4ccc..6e9b718d 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-20T06:06:55.80187139Z" +generated: "2025-03-25T06:19:26.060856628Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 6058c151..502f4b89 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -194,6 +194,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -310,13 +317,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - 
description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons @@ -361,7 +361,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry diff --git a/internal/controllers/upgrademgr.go b/internal/controllers/upgrademgr.go index c5f3fd6e..a5e519b2 100644 --- a/internal/controllers/upgrademgr.go +++ b/internal/controllers/upgrademgr.go @@ -151,11 +151,6 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha // 1. Set init status for unprocessed nodes n.helper.handleInitStatus(ctx, &nodeList.Items[i]) - if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) { - res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20} - continue - } - // 2. Handle failed nodes if n.helper.isNodeStateUpgradeFailed(ctx, &nodeList.Items[i], deviceConfig) { n.helper.clearUpgradeStartTime(nodeList.Items[i].Name) @@ -193,6 +188,11 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha continue } + if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) { + res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20} + continue + } + //This node is a candidate for selection candidateNodes = append(candidateNodes, nodeList.Items[i]) } diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go index 9aa6632b..9ca34383 100644 --- a/internal/kmmmodule/kmmmodule.go +++ b/internal/kmmmodule/kmmmodule.go @@ -55,10 +55,8 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/discovery" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" @@ -107,27 +105,14 @@ type kmmModule struct { isOpenShift bool } -func NewKMMModule(client client.Client, scheme *runtime.Scheme) KMMModuleAPI { +func NewKMMModule(client client.Client, scheme *runtime.Scheme, isOpenShift bool) KMMModuleAPI { return &kmmModule{ client: client, scheme: scheme, - isOpenShift: isOpenshift(), + isOpenShift: isOpenShift, } } -func isOpenshift() bool { - if dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil { - if gplist, err := dc.ServerGroups(); err == nil { - for _, gp := range gplist.Groups { - if gp.Name == "route.openshift.io" { - return true - } - } - } - } - return false -} - func (km *kmmModule) SetNodeVersionLabelAsDesired(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { // for each selected node // put the KMM version label given by CR's driver version @@ -272,8 +257,14 @@ func (km *kmmModule) SetDevicePluginAsDesired(ds *appsv1.DaemonSet, devConfig *a return fmt.Errorf("daemon set is not initialized, zero pointer") } - 
resourceNamingStrategy := devConfig.Spec.DevicePlugin.ResourceNamingStrategy - command := []string{"sh", "-c", fmt.Sprintf("./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30 -resource_naming_strategy=%s", resourceNamingStrategy)} + commandArgs := "./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30" + + devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments + for key, val := range devicePluginArguments { + commandArgs += " -" + key + "=" + val + } + + command := []string{"sh", "-c", commandArgs} nodeSelector := map[string]string{} for key, val := range devConfig.Spec.Selector { nodeSelector[key] = val diff --git a/internal/nodelabeller/nodelabeller.go b/internal/nodelabeller/nodelabeller.go index 8f60805b..959bf39f 100644 --- a/internal/nodelabeller/nodelabeller.go +++ b/internal/nodelabeller/nodelabeller.go @@ -42,9 +42,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/discovery" "k8s.io/utils/ptr" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) @@ -66,26 +64,13 @@ type nodeLabeller struct { isOpenShift bool } -func NewNodeLabeller(scheme *runtime.Scheme) NodeLabeller { +func NewNodeLabeller(scheme *runtime.Scheme, isOpenshift bool) NodeLabeller { return &nodeLabeller{ scheme: scheme, - isOpenShift: isOpenshift(), + isOpenShift: isOpenshift, } } -func isOpenshift() bool { - if dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil { - if gplist, err := dc.ServerGroups(); err == nil { - for _, gp := range gplist.Groups { - if gp.Name == "route.openshift.io" { - return true - } - } - } - } - return false -} - func (nl *nodeLabeller) SetNodeLabellerAsDesired(ds *appsv1.DaemonSet, devConfig *amdv1alpha1.DeviceConfig) error { if ds == nil { return fmt.Errorf("daemon set is not initialized, zero pointer") diff --git a/internal/utils.go b/internal/utils.go index 9c67f1d3..bc642343 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -17,17 +17,28 @@ limitations under the License. 
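For context on the kmmmodule change earlier in this diff: SetDevicePluginAsDesired no longer hard-codes -resource_naming_strategy and instead appends every devicePluginArguments entry to the device-plugin command line. A minimal sketch of that assembly, with a hypothetical helper name (the real logic is inline in SetDevicePluginAsDesired):

```go
package main

import "fmt"

// buildDevicePluginCommand is a hypothetical helper condensed from the
// SetDevicePluginAsDesired change above: the fixed flags stay as before and
// every devicePluginArguments entry is appended as an extra -key=value flag.
func buildDevicePluginCommand(args map[string]string) []string {
	commandArgs := "./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30"
	for key, val := range args {
		commandArgs += " -" + key + "=" + val
	}
	return []string{"sh", "-c", commandArgs}
}

func main() {
	// No arguments: same command line as before this change.
	fmt.Println(buildDevicePluginCommand(nil))
	// With the mixed naming strategy requested through the CR.
	fmt.Println(buildDevicePluginCommand(map[string]string{"resource_naming_strategy": "mixed"}))
}
```

Because Go map iteration order is randomized, the extra flags can appear in any order, which is harmless for command-line flags.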
package utils import ( + "context" "fmt" "strings" - amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + "github.com/go-logr/logr" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + + amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + "github.com/ROCm/gpu-operator/internal/cmd" ) const ( - defaultOcDriversVersion = "6.2.2" - NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" - NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" + defaultOcDriversVersion = "6.2.2" + openShiftNodeLabel = "node.openshift.io/os_id" + NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" + NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" + ResourceNamingStrategyFlag = "resource_naming_strategy" + SingleStrategy = "single" + MixedStrategy = "mixed" ) func GetDriverVersion(node v1.Node, deviceConfig amdv1alpha1.DeviceConfig) (string, error) { @@ -88,3 +99,30 @@ func HasNodeLabelKey(node v1.Node, labelKey string) bool { } return false } + +func IsOpenShift(logger logr.Logger) bool { + config, err := rest.InClusterConfig() + if err != nil { + cmd.FatalError(logger, err, "unable to get cluster config") + } + // creates the clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + cmd.FatalError(logger, err, "unable to create cluster clientset") + } + // Check for OpenShift-specific labels on nodes + nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + cmd.FatalError(logger, err, "unable to list nodes") + } + + isOpenShift := false + for _, node := range nodes.Items { + if _, exists := node.Labels[openShiftNodeLabel]; exists { + isOpenShift = true + break + } + } + logger.Info(fmt.Sprintf("IsOpenShift: %+v", isOpenShift)) + return isOpenShift +} diff --git a/internal/validator/specValidators.go b/internal/validator/specValidators.go index f6d87ca7..b804c488 100644 --- a/internal/validator/specValidators.go +++ b/internal/validator/specValidators.go @@ -21,6 +21,7 @@ import ( "fmt" amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + utils "github.com/ROCm/gpu-operator/internal" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -86,5 +87,29 @@ func ValidateDevicePluginSpec(ctx context.Context, client client.Client, devConf } } + supportedFlagValues := map[string][]string{ + utils.ResourceNamingStrategyFlag: {utils.SingleStrategy, utils.MixedStrategy}, + } + + devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments + for key, val := range devicePluginArguments { + validValues, validKey := supportedFlagValues[key] + if !validKey { + return fmt.Errorf("Invalid flag: %s", key) + } + validKeyValue := false + + for _, validVal := range validValues { + if val == validVal { + validKeyValue = true + break + } + } + + if !validKeyValue { + return fmt.Errorf("Invalid flag value: %s=%s. 
Supported values: %v", key, val, supportedFlagValues[key]) + } + } + return nil } diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 90b1e791..e00185c7 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -7,8 +7,11 @@ E2E_KUBE_RBAC_PROXY_CURL_IMAGE ?= curlimages/curl:7.78.0 E2E_UBUNTU_BASE_IMAGE ?= ubuntu:22.04 E2E_MINIO_IMAGE ?= minio/minio:latest E2E_EXPORTER_IMAGE ?= rocm/device-metrics-exporter:v1.2.0 +E2E_EXPORTER_IMAGE_2 ?= rocm/device-metrics-exporter:v1.1.1-beta.0 E2E_DEVICE_PLUGIN_IMAGE ?= rocm/k8s-device-plugin:latest E2E_NODE_LABELLER_IMAGE ?= rocm/k8s-device-plugin:labeller-latest +E2E_DEVICE_PLUGIN_IMAGE_2 ?= rocm/k8s-device-plugin:1.31.0.6 +E2E_NODE_LABELLER_IMAGE_2 ?= rocm/k8s-device-plugin:labeller-1.31.0.6 E2E_TEST_RUNNER_IMAGE ?= rocm/test-runner:v1.2.0-beta.0 export E2E_INIT_CONTAINER_IMAGE @@ -16,8 +19,11 @@ export E2E_KUBE_RBAC_PROXY_CURL_IMAGE export E2E_UBUNTU_BASE_IMAGE export E2E_MINIO_IMAGE export E2E_EXPORTER_IMAGE +export E2E_EXPORTER_IMAGE_2 export E2E_DEVICE_PLUGIN_IMAGE export E2E_NODE_LABELLER_IMAGE +export E2E_DEVICE_PLUGIN_IMAGE_2 +export E2E_NODE_LABELLER_IMAGE_2 export E2E_TEST_RUNNER_IMAGE export E2E_DCM_IMAGE diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go index ed22eb99..40fd52e5 100644 --- a/tests/e2e/cluster_test.go +++ b/tests/e2e/cluster_test.go @@ -1051,7 +1051,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { s.verifyDeviceConfigStatus(devCfg, c) s.verifyNodeGPULabel(devCfg, c) - ret, err := utils.GetAMDGPUCount(ctx, s.clientSet) + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu") if err != nil { logger.Errorf("error: %v", err) } @@ -1078,7 +1078,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) assert.NoError(c, err, "failed to deploy pods") s.verifyROCMPOD(true, c) - err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount) + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu") assert.NoError(c, err, fmt.Sprintf("%v", err)) // delete @@ -1092,6 +1092,244 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { assert.NoError(c, err, "failed to reboot nodes") } +func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousSingle(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile2") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogenous default partitioning") + } else { + logger.Errorf("Failure test homogenous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + + time.Sleep(60 * time.Second) + + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + 
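The new e2e tests (this one and the two mixed-strategy tests that follow) differ mainly in how the DeviceConfig is set up and which resource name the workload requests. Condensed into one illustrative snippet, using only the API types these tests already import (a sketch, not a complete test):

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1"
)

func main() {
	// Mixed strategy: set through devicePluginArguments on the CR (the default
	// "single" strategy simply omits the field and keeps requesting amd.com/gpu).
	devCfg := &amdv1alpha1.DeviceConfig{}
	devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{
		"resource_naming_strategy": "mixed",
	}

	// Under the mixed strategy a CPX/NPS4-partitioned node exposes partition-specific
	// resources, so workloads request amd.com/cpx_nps4 instead of amd.com/gpu.
	res := v1.ResourceRequirements{
		Limits:   v1.ResourceList{"amd.com/cpx_nps4": resource.MustParse("1")},
		Requests: v1.ResourceList{"amd.com/cpx_nps4": resource.MustParse("1")},
	}
	fmt.Println(devCfg.Spec.DevicePlugin.DevicePluginArguments, res.Requests)
}
```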
s.verifyDeviceConfigStatus(devCfg, c) + s.verifyNodeGPULabel(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") +} + +func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousMixed(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile2") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogeneous partitioning") + } else { + logger.Errorf("Failure test homogeneous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + time.Sleep(60 * time.Second) + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"} + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps4") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps4") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") + +} + +func (s *E2ESuite) 
TestWorkloadRequestedGPUsHeterogeneousMixed(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile1") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogeneous partitioning") + } else { + logger.Errorf("Failure test heterogenous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + time.Sleep(60 * time.Second) + + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"} + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps1") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps1") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") +} + func (s *E2ESuite) TestKubeRbacProxyClusterIP(c *C) { _, err := s.dClient.DeviceConfigs(s.ns).Get("deviceconfig-kuberbac-clusterip", metav1.GetOptions{}) assert.Errorf(c, err, "config deviceconfig-kuberbac-clusterip exists") @@ -1877,8 +2115,8 @@ func (s *E2ESuite) TestDevicePluginNodeLabellerDaemonSetUpgrade(c *C) { // upgrade // update the CR's device plugin with image - devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage - devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage + devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage2 + devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage2 s.patchDevicePluginImage(devCfg, c) s.patchNodeLabellerImage(devCfg, c) s.verifyDevicePluginStatus(s.ns, c, devCfg) @@ -1911,7 +2149,7 @@ func (s *E2ESuite) TestMetricsExporterDaemonSetUpgrade(c *C) { // upgrade // update the CR's device plugin with image - devCfg.Spec.MetricsExporter.Image = exporterImage + devCfg.Spec.MetricsExporter.Image = exporterImage2 s.patchMetricsExporterImage(devCfg, c) s.verifyDeviceConfigStatus(devCfg, c) 
s.checkMetricsExporterStatus(devCfg, s.ns, v1.ServiceTypeClusterIP, c) diff --git a/tests/e2e/dcm_e2e_test.go b/tests/e2e/dcm_e2e_test.go index f3f8b9df..cd11bd3c 100644 --- a/tests/e2e/dcm_e2e_test.go +++ b/tests/e2e/dcm_e2e_test.go @@ -72,7 +72,7 @@ func (s *E2ESuite) addRemoveNodeLabels(nodeName string, selectedProfile string) logger.Infof("Error adding node lbels: %s\n", err.Error()) return } - time.Sleep(15 * time.Second) + time.Sleep(45 * time.Second) // Allow partition to happen err = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/gpu-config-profile") _ = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile") @@ -269,6 +269,7 @@ func (s *E2ESuite) createConfigMap() GPUConfigProfiles { { ComputePartition: "CPX", MemoryPartition: "NPS4", + NumGPUsAssigned: 1, }, } diff --git a/tests/e2e/init.go b/tests/e2e/init.go index d7a863eb..973cedb6 100644 --- a/tests/e2e/init.go +++ b/tests/e2e/init.go @@ -25,8 +25,11 @@ var ( initContainerImage string kubeRbacProxyCurlImage string exporterImage string + exporterImage2 string devicePluginImage string nodeLabellerImage string + devicePluginImage2 string + nodeLabellerImage2 string testRunnerImage string driverImageRepo string ) @@ -46,6 +49,10 @@ func init() { if !ok { log.Fatalf("E2E_EXPORTER_IMAGE is not defined") } + exporterImage2, ok = os.LookupEnv("E2E_EXPORTER_IMAGE_2") + if !ok { + log.Fatalf("E2E_EXPORTER_IMAGE_2 is not defined") + } devicePluginImage, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE") if !ok { log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE is not defined") @@ -54,6 +61,14 @@ func init() { if !ok { log.Fatalf("E2E_NODE_LABELLER_IMAGE is not defined") } + devicePluginImage2, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE_2") + if !ok { + log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE_2 is not defined") + } + nodeLabellerImage2, ok = os.LookupEnv("E2E_NODE_LABELLER_IMAGE_2") + if !ok { + log.Fatalf("E2E_NODE_LABELLER_IMAGE_2 is not defined") + } testRunnerImage, ok = os.LookupEnv("E2E_TEST_RUNNER_IMAGE") if !ok { log.Fatalf("E2E_TEST_RUNNER_IMAGE is not defined") diff --git a/tests/e2e/testrunner_test.go b/tests/e2e/testrunner_test.go index 305b6f59..aa7b37a7 100644 --- a/tests/e2e/testrunner_test.go +++ b/tests/e2e/testrunner_test.go @@ -200,7 +200,7 @@ func (s *E2ESuite) createTestRunnerConfigmap(valid bool, devCfg *v1alpha1.Device } func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string { - ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet) + ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu") if err != nil { logger.Errorf("error: %v", err) } @@ -228,7 +228,7 @@ func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string { err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) assert.NoError(c, err, "failed to deploy pods") - err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount) + err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount, "gpu") assert.NoError(c, err, fmt.Sprintf("%v", err)) return nodeWithMaxGPU @@ -730,7 +730,7 @@ func (s *E2ESuite) TestTestRunnerLogsExport(c *C) { func (s *E2ESuite) getGPUNodeName() (nodeWithMaxGPU string) { var maxPerNodeGPU int = 0 - ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet) + ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu") if err != nil { logger.Printf("Unable to fetch gpu nodes. 
Error %v", err) return diff --git a/tests/e2e/utils/utils.go b/tests/e2e/utils/utils.go index 9c9dcf9f..5813ccf5 100644 --- a/tests/e2e/utils/utils.go +++ b/tests/e2e/utils/utils.go @@ -598,14 +598,6 @@ func GetWorkerNodes(cl *kubernetes.Clientset) []*v1.Node { func GetAMDGpuWorker(cl *kubernetes.Clientset, isOpenshift bool) []v1.Node { ret := make([]v1.Node, 0) labelSelector := labels.NewSelector() - if !isOpenshift { - r, _ := labels.NewRequirement( - "node-role.kubernetes.io/control-plane", - selection.DoesNotExist, - nil, - ) - labelSelector = labelSelector.Add(*r) - } r, _ := labels.NewRequirement( "feature.node.kubernetes.io/amd-gpu", selection.Equals, @@ -766,7 +758,7 @@ func DelRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset, } -func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]int, error) { +func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset, resourceType string) (map[string]int, error) { ret := make(map[string]int) // Get the list of nodes @@ -777,7 +769,8 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i // Iterate over the nodes and count AMD GPUs for _, node := range nodes.Items { - if val, ok := node.Status.Capacity["amd.com/gpu"]; ok { + resourceKey := v1.ResourceName("amd.com/" + resourceType) + if val, ok := node.Status.Capacity[resourceKey]; ok { num, err := strconv.ParseInt(val.String(), 10, 64) if err != nil { log.Infof("error: %v", err) @@ -790,7 +783,7 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i } func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset, - gpuReqCount int) error { + gpuReqCount int, resourceType string) error { its, err := cl.CoreV1().Pods("").List(ctx, metav1.ListOptions{ @@ -805,7 +798,8 @@ func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset, continue } - if gpu, ok := cntr.Resources.Requests["amd.com/gpu"]; ok { + resourceKey := v1.ResourceName("amd.com/" + resourceType) + if gpu, ok := cntr.Resources.Requests[resourceKey]; ok { gpuAssignedCount := int(gpu.Value()) if gpuReqCount < gpuAssignedCount { return fmt.Errorf("gpu requested %d got %d",