Commit 2f7f378

fix(gke-hyperdisk): Update gke-managed-hyperdisk test playbook and example blueprint
Replaces the TensorFlow jobs with FIO jobs in the `gke-managed-hyperdisk` example and accordingly updates its corresponding test playbook.
1 parent 2d5b02d commit 2f7f378
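
For context, the updated example is deployed like the other Cluster Toolkit example blueprints. The commands below are only a minimal sketch, assuming the repository's gcluster CLI (formerly ghpc) and placeholder variable values; the deployment folder name and required variables depend on the blueprint's vars block.

# Sketch: render and deploy the example blueprint (CLI name and flags assumed)
./gcluster create examples/gke-managed-hyperdisk.yaml \
  --vars project_id=<your-project>,region=<your-region>,zone=<your-zone>
./gcluster deploy <deployment-folder>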

File tree

2 files changed: +89 -123 lines changed


examples/gke-managed-hyperdisk.yaml

Lines changed: 52 additions & 108 deletions
@@ -122,106 +122,24 @@ deployment_groups:
     settings:
       name: sample-pool
       zones: [$(vars.zone)]
-      machine_type: c3-standard-88 # Hyperdisk-extreme required C3 machine with 88 or more vCPUs
+      machine_type: c3-standard-88 # Hyperdisk-extreme requires C3 machine with 88 or more vCPUs
       auto_upgrade: true
 
-  # Train a TensorFlow model with Keras and Hyperdisk Balanced on GKE
-  # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
-  - id: hyperdisk-balanced-job
+  # This is an example job that will install and run an `fio` benchmark against the hyperdisk volumes.
+  # For more FIO tests, see https://cloud.google.com/compute/docs/disks/benchmark-hyperdisk-performance
+  - id: fio-bench-job-template
     source: modules/compute/gke-job-template
     use:
     - gke_cluster
     - hyperdisk-balanced-setup
-    settings:
-      name: tensorflow
-      image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
-      security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
-      - key: runAsUser
-        value: 1000
-      - key: runAsGroup
-        value: 100
-      - key: fsGroup
-        value: 100
-      command:
-      - bash
-      - -c
-      - |
-        pip install transformers datasets
-        python - <<EOF
-        from datasets import load_dataset
-        dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-balanced-pvc-0')
-        dataset = dataset["train"]
-        from transformers import AutoTokenizer
-        import numpy as np
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        sentences = [str(s) for s in dataset["sentence"]]
-        tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
-        tokenized_data = dict(tokenized_data)
-        labels = np.array(dataset["label"])
-        from transformers import TFAutoModelForSequenceClassification
-        from tensorflow.keras.optimizers import Adam
-        model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
-        model.compile(optimizer=Adam(3e-5))
-        model.fit(tokenized_data, labels)
-        EOF
-      node_count: 1
-    outputs: [instructions]
-
-  # Train a TensorFlow model with Keras and Hyperdisk Extreme on GKE
-  # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
-  - id: hyperdisk-extreme-job
-    source: modules/compute/gke-job-template
-    use:
-    - gke_cluster
     - hyperdisk-extreme-setup
-    settings:
-      name: tensorflow
-      image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
-      security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
-      - key: runAsUser
-        value: 1000
-      - key: runAsGroup
-        value: 100
-      - key: fsGroup
-        value: 100
-      command:
-      - bash
-      - -c
-      - |
-        pip install transformers datasets
-        python - <<EOF
-        from datasets import load_dataset
-        dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-extreme-pvc-0')
-        dataset = dataset["train"]
-        from transformers import AutoTokenizer
-        import numpy as np
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        sentences = [str(s) for s in dataset["sentence"]]
-        tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
-        tokenized_data = dict(tokenized_data)
-        labels = np.array(dataset["label"])
-        from transformers import TFAutoModelForSequenceClassification
-        from tensorflow.keras.optimizers import Adam
-        model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
-        model.compile(optimizer=Adam(3e-5))
-        model.fit(tokenized_data, labels)
-        EOF
-      node_count: 1
-    outputs: [instructions]
-
-  # Train a TensorFlow model with Keras and Hyperdisk Throughput on GKE
-  # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
-  - id: hyperdisk-throughput-job
-    source: modules/compute/gke-job-template
-    use:
-    - gke_cluster
     - hyperdisk-throughput-setup
     settings:
-      name: tensorflow
-      image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
-      security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
+      name: fio-benchmark
+      image: ubuntu:latest
+      security_context: # to make sure the job has enough access to install the fio packages
       - key: runAsUser
-        value: 1000
+        value: 0
       - key: runAsGroup
         value: 100
       - key: fsGroup
@@ -230,23 +148,49 @@ deployment_groups:
       - bash
      - -c
       - |
-        pip install transformers datasets
-        python - <<EOF
-        from datasets import load_dataset
-        dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-throughput-pvc-0')
-        dataset = dataset["train"]
-        from transformers import AutoTokenizer
-        import numpy as np
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        sentences = [str(s) for s in dataset["sentence"]]
-        tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
-        tokenized_data = dict(tokenized_data)
-        labels = np.array(dataset["label"])
-        from transformers import TFAutoModelForSequenceClassification
-        from tensorflow.keras.optimizers import Adam
-        model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
-        model.compile(optimizer=Adam(3e-5))
-        model.fit(tokenized_data, labels)
-        EOF
+
+        set -eux
+        export DEBIAN_FRONTEND=noninteractive
+
+        # Install fio
+        apt update -y && apt install -y fio
+
+        # Use a tag to create a unique path for tests
+        TAG=`date +%s`
+
+        # Verify mountpoints
+        df -h
+        mountpoint /data/hyperdisk-balanced-pvc-0
+        mountpoint /data/hyperdisk-extreme-pvc-0
+        mountpoint /data/hyperdisk-throughput-pvc-0
+
+        # Create temporary directories for fio benchmarks
+        mkdir -p /data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}
+        mkdir -p /data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}
+        mkdir -p /data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}
+
+        # Perform hyperdisk balanced performance (Mixed IOPS) test
+        fio --name=hyperdisk-balanced-iops --ioengine=libaio --iodepth=256 --rw=randrw \
+          --bs=4k --direct=1 --size=10G --numjobs=16 --group_reporting --time_based --runtime=300s \
+          --ramp_time=10s --iodepth_batch_submit=256 --iodepth_batch_complete_max=256 \
+          --directory=/data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG} --filename_format=fiotest-balanced-iops
+
+        # Perform hyperdisk extreme performance test (Max IOPS)
+        fio --name=hyperdisk-extreme-iops --ioengine=libaio --iodepth=256 --rw=randwrite \
+          --bs=4k --direct=1 --size=10G --numjobs=32 --group_reporting --time_based --runtime=300s --ramp_time=10s \
+          --iodepth_batch_submit=256 --iodepth_batch_complete_max=256 \
+          --directory=/data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG} --filename_format=fiotest-extreme-iops
+
+        # Perform hyperdisk throughput performance test
+        fio --name=hyperdisk-throughput-bw --ioengine=libaio --iodepth=64 --rw=write --bs=1M \
+          --direct=1 --size=10G --numjobs=32 --group_reporting --time_based --runtime=300s --ramp_time=10s \
+          --iodepth_batch_submit=64 --iodepth_batch_complete_max=64 \
+          --directory=/data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG} --filename_format=fiotest-throughput-bw
+
+        # Clean up temporary directories for fio benchmarks
+        rm -rf /data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}
+        rm -rf /data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}
+        rm -rf /data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}
       node_count: 1
+
     outputs: [instructions]
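
Once the blueprint is deployed, the gke-job-template module renders a Kubernetes Job manifest for the fio benchmark (the test playbook below globs for it as primary/fio-benchmark*). The snippet that follows is only a rough sketch of submitting that manifest by hand and reading the results; the exact file and generated job names come from the module's instructions output and are assumptions here.

# Sketch: submit the rendered fio Job and read its results (paths and names assumed)
kubectl create -f <deployment-folder>/primary/fio-benchmark*.yaml
kubectl get jobs                      # note the generated job name
kubectl logs -f job/<fio-job-name>    # fio prints per-test IOPS and bandwidth summaries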

tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-hyperdisk.yml

Lines changed: 37 additions & 15 deletions
@@ -12,30 +12,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+- name: Assert variables are defined
+  ansible.builtin.assert:
+    that:
+    - region is defined
+    - custom_vars.project is defined
+
 - name: Get cluster credentials for kubectl
   delegate_to: localhost
   ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} --verbosity=debug
 
-- name: Execute the job
+- name: Run the FIO benchmark job and get its name
   delegate_to: localhost
   ansible.builtin.shell: |
-    jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*)
-    for job in "${jobs[@]}"; do
-      kubectl create -f "$job" -v=9
-    done
+    job_file=({{ workspace }}/{{ deployment_name }}/primary/fio-benchmark*)
+    # Assuming only one benchmark file matches
+    kubectl create -f "${job_file[0]}" -o=jsonpath='{.metadata.name}'
   args:
     executable: /bin/bash
-  changed_when: False
+  register: fio_job_create_output
+
+- name: Set FIO job name
+  ansible.builtin.set_fact:
+    fio_job_name: "{{ fio_job_create_output.stdout }}"
 
-- name: Wait for job to complete
+- name: Wait for FIO Job to complete
+  # The FIO job should take approximately 20 minutes; the task times out after a max wait of 40 minutes
   delegate_to: localhost
-  ansible.builtin.command: |
-    kubectl get job --field-selector status.successful=1 -v=9
-  register: job_completion
-  until: job_completion.stdout_lines | length > 3 # 3 jobs total
+  ansible.builtin.shell: "kubectl get job {{ fio_job_name }} -o jsonpath='{.status.succeeded}'"
+  register: fio_job_status
+  until: fio_job_status.stdout == '1'
   retries: 80
-  delay: 15
+  delay: 30
+
+- name: Fetch logs from the FIO job pod and save to fio_pod_logs.txt
+  delegate_to: localhost
+  ansible.builtin.shell: |
+    pod_name="$(kubectl get pods -l job-name={{ fio_job_name }} -o jsonpath='{.items[0].metadata.name}')"
+    kubectl logs "$pod_name" > fio_pod_logs.txt
+    cat fio_pod_logs.txt
+  register: fio_test_logs
+
+- name: Print the FIO test logs
+  debug:
+    msg: "{{fio_test_logs.stdout}}"
 
-- name: Print job_completion debug output
-  ansible.builtin.debug:
-    var: job_completion.stdout_lines
+- name: Clean up FIO job
+  delegate_to: localhost
+  ansible.builtin.shell: |
+    kubectl delete job {{ fio_job_name }} -v=9
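
The polling task above re-runs kubectl get job with a jsonpath on .status.succeeded up to 80 times with a 30-second delay, which matches the roughly 40-minute ceiling noted in the comment. As a sketch only, the same wait-fetch-clean-up flow can be reproduced interactively, where FIO_JOB_NAME stands in for the name captured by the create task:

# Sketch: wait for the benchmark Job, pull its logs, then remove it (job name assumed)
kubectl wait --for=condition=complete "job/${FIO_JOB_NAME}" --timeout=40m
kubectl logs "job/${FIO_JOB_NAME}" > fio_pod_logs.txt
kubectl delete job "${FIO_JOB_NAME}"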
