[Doc] Init release notes for v1.3.1 and amend tag changes (#859)

yansun1996 · sajmera-pensando · commit f392a5e74adf · 2025-08-10T17:15:15.000Z
diff --git a/Makefile b/Makefile
@@ -5,7 +5,7 @@ endif
 
 # PROJECT_VERSION defines the project version.
 # Update this value when you upgrade the version of your project.
-PROJECT_VERSION ?= v1.2.0
+PROJECT_VERSION ?= v1.3.1
 
 ####################################
 # GPU Operator Image Build variables
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -32,7 +32,7 @@ metadata:
     capabilities: Seamless Upgrades
     categories: AI/Machine Learning,Monitoring
     containerImage: docker.io/rocm/gpu-operator:v1.2.0
-    createdAt: "2025-08-06T05:54:02Z"
+    createdAt: "2025-08-09T01:44:36Z"
     description: |-
       Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
       For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -44,7 +44,7 @@ metadata:
     features.operators.openshift.io/token-auth-aws: "false"
     features.operators.openshift.io/token-auth-azure: "false"
     features.operators.openshift.io/token-auth-gcp: "false"
-    metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0
+    metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.3.1
     nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest
     operatorframework.io/cluster-monitoring: "true"
     operatorframework.io/suggested-namespace: openshift-amd-gpu
@@ -53,7 +53,7 @@ metadata:
     operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
     repository: https://github.com/ROCm/gpu-operator
     support: Advanced Micro Devices, Inc.
-  name: amd-gpu-operator.v1.2.0
+  name: amd-gpu-operator.v1.3.1
   namespace: placeholder
 spec:
   apiservicedefinitions: {}
@@ -1245,4 +1245,4 @@ spec:
   maturity: stable
   provider:
     name: Advanced Micro Devices, Inc.
-  version: 1.2.0
+  version: 1.3.1
diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
@@ -17,7 +17,7 @@ metadata:
     features.operators.openshift.io/token-auth-aws: "false"
     features.operators.openshift.io/token-auth-azure: "false"
     features.operators.openshift.io/token-auth-gcp: "false"
-    metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0
+    metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.3.1
     nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest
     operatorframework.io/cluster-monitoring: "true"
     operatorframework.io/suggested-namespace: openshift-amd-gpu
diff --git a/docs/conf.py b/docs/conf.py
@@ -11,7 +11,7 @@
 external_projects_current_project = "amd-gpu-operator"
 
 project = "AMD GPU Operator"
-version = "1.3.0"
+version = "1.3.1"
 release = version
 html_title = f"{project} {version}"
 author = "Advanced Micro Devices, Inc."
diff --git a/docs/drivers/installation.md b/docs/drivers/installation.md
@@ -80,7 +80,7 @@ spec:
      serviceType: "NodePort"
      # Node port for metrics exporter service, metrics endpoint $node-ip:$nodePort
      nodePort: 32500
-     image: docker.io/rocm/device-metrics-exporter:v1.2.0
+     image: docker.io/rocm/device-metrics-exporter:v1.3.1
 
   # Specifythe node to be managed by this DeviceConfig Custom Resource
   selector:
@@ -134,7 +134,7 @@ spec:
      serviceType: "NodePort"
      # Node port for metrics exporter service, metrics endpoint $node-ip:$nodePort
      nodePort: 32500
-     image: docker.io/rocm/device-metrics-exporter:v1.2.0
+     image: docker.io/rocm/device-metrics-exporter:v1.3.1
 
   # Specifythe node to be managed by this DeviceConfig Custom Resource
   selector:
diff --git a/docs/fulldeviceconfig.rst b/docs/fulldeviceconfig.rst
@@ -30,7 +30,7 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
     apiVersion: amd.com/v1alpha1 
     kind: DeviceConfig #New Custom Resource Definition used by the GPU Operator
     metadata:
-      # Name of the DeviceConfig CR. Note that the name of device plugin, node-labeller and metric-explorter pods will be prefixed with 
+      # Name of the DeviceConfig CR. Note that the name of device plugin, node-labeller and metric-exporter pods will be prefixed with 
       name: gpu-operator 
       namespace: kube-amd-gpu # Namespace for the GPU Operator and it's components
     spec: 
@@ -147,7 +147,7 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
         serviceType: ClusterIP # ServiceType used to expose the Metrics Exporter endpoint. Can be either `ClusterIp` or `NodePort`.
         port: 5000 # Note if specifying NodePort as the serviceType use `32500` as the port number must be between 30000-32767
         # (Optional) Specifying metrics exporter image is optional. Default imagename shown here if not specified.
-        image: rocm/device-metrics-exporter:v1.2.0 # Change this to trigger metrics exporter upgrade on CR update
+        image: rocm/device-metrics-exporter:v1.3.1 # Change this to trigger metrics exporter upgrade on CR update
         imagePullPolicy: "IfNotPresent" # image pull policy for the metrics exporter container. Either `Always`, `IfNotPresent` or `Never`
         # imagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
         config:
@@ -187,7 +187,7 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
         serviceType: ClusterIP # ServiceType used to expose the Metrics Exporter endpoint. Can be either `ClusterIp` or `NodePort`.
         port: 5000 # Note if specifying NodePort as the serviceType use `32500` as the port number must be between 30000-32767
         # (Optional) Specifying metrics exporter image is optional. Default imagename shown here if not specified.
-        image: docker.io/rocm/test-runner:v1.2.0-beta.0 # Change this to trigger metrics exporter upgrade on CR update
+        image: docker.io/rocm/test-runner:v1.3.1 # Change this to trigger test runner upgrade on CR update
         imagePullPolicy: "IfNotPresent" # image pull policy for the test runner container. Either `Always`, `IfNotPresent` or `Never`
         # imagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
         config:
diff --git a/docs/installation/kubernetes-helm.md b/docs/installation/kubernetes-helm.md
@@ -155,7 +155,7 @@ The following parameters are able to be configued when using the Helm Chart. In
 |-----|------|---------|-------------|
 | controllerManager.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | Deployment affinity configs for controller manager |
 | controllerManager.manager.image.repository | string | `"docker.io/rocm/gpu-operator"` | AMD GPU operator controller manager image repository |
-| controllerManager.manager.image.tag | string | `"v1.2.0"` | AMD GPU operator controller manager image tag |
+| controllerManager.manager.image.tag | string | `"v1.3.1"` | AMD GPU operator controller manager image tag |
 | controllerManager.manager.imagePullPolicy | string | `"Always"` | Image pull policy for AMD GPU operator controller manager pod |
 | controllerManager.manager.imagePullSecrets | string | `""` | Image pull secret name for pulling AMD GPU operator controller manager image if registry needs credential to pull image |
 | controllerManager.manager.resources.limits.cpu | string | `"1000m"` | CPU limits for the controller manager. Consider increasing for large clusters |
@@ -173,12 +173,12 @@ The following parameters are able to be configued when using the Helm Chart. In
 | kmm.controller.manager.containerSecurityContext.allowPrivilegeEscalation | bool | `false` |  |
 | kmm.controller.manager.env.relatedImageBuild | string | `"gcr.io/kaniko-project/executor:v1.23.2"` | KMM kaniko builder image for building driver image within cluster |
 | kmm.controller.manager.env.relatedImageBuildPullSecret | string | `""` | Image pull secret name for pulling KMM kaniko builder image if registry needs credential to pull image |
-| kmm.controller.manager.env.relatedImageSign | string | `"docker.io/rocm/kernel-module-management-signimage:v1.2.0"` | KMM signer image for signing driver image's kernel module with given key pairs within cluster |
+| kmm.controller.manager.env.relatedImageSign | string | `"docker.io/rocm/kernel-module-management-signimage:v1.3.1"` | KMM signer image for signing driver image's kernel module with given key pairs within cluster |
 | kmm.controller.manager.env.relatedImageSignPullSecret | string | `""` | Image pull secret name for pulling KMM signer image if registry needs credential to pull image |
-| kmm.controller.manager.env.relatedImageWorker | string | `"docker.io/rocm/kernel-module-management-worker:v1.2.0"` | KMM worker image for loading / unloading driver kernel module on worker nodes |
+| kmm.controller.manager.env.relatedImageWorker | string | `"docker.io/rocm/kernel-module-management-worker:v1.3.1"` | KMM worker image for loading / unloading driver kernel module on worker nodes |
 | kmm.controller.manager.env.relatedImageWorkerPullSecret | string | `""` | Image pull secret name for pulling KMM worker image if registry needs credential to pull image |
 | kmm.controller.manager.image.repository | string | `"docker.io/rocm/kernel-module-management-operator"` | KMM controller manager image repository |
-| kmm.controller.manager.image.tag | string | `"v1.2.0"` | KMM controller manager image tag |
+| kmm.controller.manager.image.tag | string | `"v1.3.1"` | KMM controller manager image tag |
 | kmm.controller.manager.imagePullPolicy | string | `"Always"` | Image pull policy for KMM controller manager pod |
 | kmm.controller.manager.imagePullSecrets | string | `""` | Image pull secret name for pulling KMM controller manager image if registry needs credential to pull image |
 | kmm.controller.manager.resources.limits.cpu | string | `"500m"` |  |
@@ -332,7 +332,7 @@ spec:
      serviceType: "NodePort"
      # Node port for metrics exporter service, metrics endpoint $node-ip:$nodePort
      nodePort: 32500
-     image: docker.io/rocm/device-metrics-exporter:v1.2.0
+     image: docker.io/rocm/device-metrics-exporter:v1.3.1
 
   # Specifythe node to be managed by this DeviceConfig Custom Resource
   selector:
@@ -382,7 +382,7 @@ spec:
      serviceType: "NodePort"
      # Node port for metrics exporter service, metrics endpoint $node-ip:$nodePort
      nodePort: 32500
-     image: docker.io/rocm/device-metrics-exporter:v1.2.0
+     image: docker.io/rocm/device-metrics-exporter:v1.3.1
 
   # Specifythe node to be managed by this DeviceConfig Custom Resource
   selector:
diff --git a/docs/metrics/exporter.md b/docs/metrics/exporter.md
@@ -45,7 +45,7 @@ metricsExporter:
     nodePort: 32500
 
     # image for the metrics-exporter container
-    image: "rocm/device-metrics-exporter:v1.2.0"
+    image: "rocm/device-metrics-exporter:v1.3.1"
  
 ```
 
diff --git a/docs/releasenotes.md b/docs/releasenotes.md
@@ -36,11 +36,6 @@ The AMD GPU Operator v1.3.1 release extends platform support to OpenShift v4.19
 
 ### Documentation Updates
 
-- Updated [Release notes](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/releasenotes.html) detailing new features in v1.3.1.
-- Updated GPU Operator install instructions to include the default DeviceConfig custom resource that gets created and how to skip installing it if desired.
-
-### Known Limitations
-
 > **Note:** All current and historical limitations for the GPU Operator, including their latest statuses and any associated workarounds or fixes, are tracked in the following documentation page: [Known Issues and Limitations](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html).  
    Please refer to this page regularly for the most up-to-date information.
 
diff --git a/docs/test/auto-unhealthy-device-test.md b/docs/test/auto-unhealthy-device-test.md
@@ -25,15 +25,15 @@ metricsExporter:
     nodePort: 32500
 
     # image for the metrics-exporter container
-    image: "rocm/device-metrics-exporter:v1.2.0"
+    image: "rocm/device-metrics-exporter:v1.3.1"
 
 # Specify the test runner config
 testRunner:
     # To enable/disable the test runner, disabled by default
     enable: true
 
     # image for the test runner container
-    image: docker.io/rocm/test-runner:v1.2.0-beta.0
+    image: docker.io/rocm/test-runner:v1.3.1
 
     # specify the mount for test logs
     logsLocation:
diff --git a/docs/test/logs-export.md b/docs/test/logs-export.md
@@ -165,7 +165,7 @@ Example:
     enable: True
 
     # testrunner image
-    image: docker.io/rocm/test-runner:v1.2.0-beta.0
+    image: docker.io/rocm/test-runner:v1.3.1
 
     # image pull policy for the testrunner
     # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag
diff --git a/docs/test/manual-test.md b/docs/test/manual-test.md
@@ -76,7 +76,7 @@ spec:
           type: Directory
       containers:
       - name: amd-test-runner
-        image: docker.io/rocm/test-runner:v1.2.0-beta.0
+        image: docker.io/rocm/test-runner:v1.3.1
         imagePullPolicy: IfNotPresent
         securityContext: # setup security context for container to get access to device related interfaces
           privileged: true
@@ -167,7 +167,7 @@ spec:
           limits:
             amd.com/gpu: 8 # requesting all GPUs on the node
         name: amd-test-runner
-        image: docker.io/rocm/test-runner:v1.2.0-beta.0
+        image: docker.io/rocm/test-runner:v1.3.1
         imagePullPolicy: IfNotPresent
         env:
         - name: TEST_TRIGGER
@@ -284,7 +284,7 @@ spec:
               type: Directory
           containers:
           - name: init-test-runner
-            image: docker.io/rocm/test-runner:v1.2.0-beta.0
+            image: docker.io/rocm/test-runner:v1.3.1
             imagePullPolicy: IfNotPresent
             securityContext: # setup security context for container to get access to device related interfaces
               privileged: true
@@ -467,7 +467,7 @@ spec:
           name: manual-config-map
       containers:
       - name: amd-test-runner
-        image: docker.io/rocm/test-runner:v1.2.0-beta.0
+        image: docker.io/rocm/test-runner:v1.3.1
         imagePullPolicy: IfNotPresent
         securityContext: # setup security context for container to get access to device related interfaces
           privileged: true
@@ -623,7 +623,7 @@ spec:
         name: test-runner-volume
       containers:
       - name: amd-test-runner
-        image: docker.io/rocm/test-runner:v1.2.0-beta.0
+        image: docker.io/rocm/test-runner:v1.3.1
         imagePullPolicy: IfNotPresent
         securityContext: # setup security context for container to get access to device related interfaces
           privileged: true
diff --git a/docs/test/pre-start-job-test.md b/docs/test/pre-start-job-test.md
@@ -82,7 +82,7 @@ spec:
       serviceAccountName: test-run
       initContainers:
       - name: init-test-runner
-        image: docker.io/rocm/test-runner:v1.2.0-beta.0
+        image: docker.io/rocm/test-runner:v1.3.1
         imagePullPolicy: IfNotPresent
         resources:
           requests:
diff --git a/docs/test/test-runner-overview.md b/docs/test/test-runner-overview.md
@@ -18,7 +18,7 @@ Under the hood the Device Test runner leverages the ROCm Validation Suite (RVS)
     enable: True
 
     # testrunner image
-    image: docker.io/rocm/test-runner:v1.3.0
+    image: docker.io/rocm/test-runner:v1.3.1
 
     # image pull policy for the testrunner
     # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag
diff --git a/docs/upgrades/componentupgrades.md b/docs/upgrades/componentupgrades.md
@@ -121,7 +121,7 @@ Updated CR:
     enable: True
     serviceType: "ClusterIP"
     port: 5000
-    image: rocm/device-metrics-exporter:v1.2.0
+    image: rocm/device-metrics-exporter:v1.3.1
     upgradePolicy:
       upgradeStrategy: OnDelete
 ```
diff --git a/example/testrunner/manual_test_job.yaml b/example/testrunner/manual_test_job.yaml
@@ -151,7 +151,7 @@ spec:
           secretName: aws-secret
       containers:
       - name: amd-test-runner
-        image: docker.io/rocm/test-runner:v1.2.0-beta.0
+        image: docker.io/rocm/test-runner:v1.3.1
         imagePullPolicy: IfNotPresent
         securityContext: # setup security context for container to get access to device related interfaces
           privileged: true
diff --git a/example/testrunner/pre_start_job_check.yaml b/example/testrunner/pre_start_job_check.yaml
@@ -146,7 +146,7 @@ spec:
           secretName: aws-secret
       initContainers:
       - name: init-test-runner
-        image: docker.io/rocm/test-runner:v1.2.0-beta.0
+        image: docker.io/rocm/test-runner:v1.3.1
         imagePullPolicy: IfNotPresent
         resources:
           limits:
diff --git a/example/testrunner/schedule_test_cronjob.yaml b/example/testrunner/schedule_test_cronjob.yaml
@@ -150,7 +150,7 @@ spec:
               secretName: aws-secret
           containers:
           - name: init-test-runner
-            image: docker.io/rocm/test-runner:v1.2.0-beta.0
+            image: docker.io/rocm/test-runner:v1.3.1
             imagePullPolicy: IfNotPresent
             securityContext: # setup security context for container to get access to device related interfaces
               privileged: true
diff --git a/hack/k8s-patch/metadata-patch/Chart.yaml b/hack/k8s-patch/metadata-patch/Chart.yaml
@@ -19,8 +19,8 @@ keywords:
   - monitoring
 
 kubeVersion: ">= 1.29.0-0"
-version: v1.2.0
-appVersion: "v1.2.0"
+version: v1.3.1
+appVersion: "v1.3.1"
 
 dependencies:
 - name: node-feature-discovery
diff --git a/hack/openshift-patch/metadata-patch/Chart.yaml b/hack/openshift-patch/metadata-patch/Chart.yaml
@@ -19,8 +19,8 @@ keywords:
   - monitoring
 
 kubeVersion: ">= 1.29.0-0"
-version: v1.2.0
-appVersion: "v1.2.0"
+version: v1.3.1
+appVersion: "v1.3.1"
 
 dependencies:
 - name: nfd
diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597
-generated: "2025-08-06T05:53:52.129487053Z"
+generated: "2025-08-09T01:44:10.510383817Z"
diff --git a/helm-charts-k8s/Chart.yaml b/helm-charts-k8s/Chart.yaml
@@ -19,8 +19,8 @@ keywords:
   - monitoring
 
 kubeVersion: ">= 1.29.0-0"
-version: v1.2.0
-appVersion: "v1.2.0"
+version: v1.3.1
+appVersion: "v1.3.1"
 
 dependencies:
 - name: node-feature-discovery
diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md
@@ -117,7 +117,7 @@ The AMD GPU Operator is licensed under the [Apache License 2.0](LICENSE).
 
 ## gpu-operator-charts
 
-![Version: v1.2.0](https://img.shields.io/badge/Version-v1.2.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1.2.0](https://img.shields.io/badge/AppVersion-v1.2.0-informational?style=flat-square)
+![Version: v1.3.1](https://img.shields.io/badge/Version-v1.3.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1.3.1](https://img.shields.io/badge/AppVersion-v1.3.1-informational?style=flat-square)
 
 AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU accelerators within Kubernetes clusters.
 
diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml
@@ -9,10 +9,10 @@ metadata:
   labels:
     app.kubernetes.io/component: amd-gpu
     app.kubernetes.io/part-of: amd-gpu
-    helm.sh/chart: gpu-operator-charts-v1.2.0
+    helm.sh/chart: gpu-operator-charts-v1.3.1
     app.kubernetes.io/name: gpu-operator-charts
     app.kubernetes.io/instance: amd-gpu
-    app.kubernetes.io/version: "v1.2.0"
+    app.kubernetes.io/version: "v1.3.1"
     app.kubernetes.io/managed-by: Helm
 spec:
   group: amd.com
diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-08-06T05:54:01.327906767Z"
+generated: "2025-08-09T01:44:30.971839872Z"
diff --git a/helm-charts-openshift/Chart.yaml b/helm-charts-openshift/Chart.yaml
@@ -19,8 +19,8 @@ keywords:
   - monitoring
 
 kubeVersion: ">= 1.29.0-0"
-version: v1.2.0
-appVersion: "v1.2.0"
+version: v1.3.1
+appVersion: "v1.3.1"
 
 dependencies:
 - name: nfd
diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml
@@ -9,10 +9,10 @@ metadata:
   labels:
     app.kubernetes.io/component: amd-gpu
     app.kubernetes.io/part-of: amd-gpu
-    helm.sh/chart: gpu-operator-charts-v1.2.0
+    helm.sh/chart: gpu-operator-charts-v1.3.1
     app.kubernetes.io/name: gpu-operator-charts
     app.kubernetes.io/instance: amd-gpu
-    app.kubernetes.io/version: "v1.2.0"
+    app.kubernetes.io/version: "v1.3.1"
     app.kubernetes.io/managed-by: Helm
 spec:
   group: amd.com
diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile