Commit 2f7f378

fix(gke-hyperdisk): Update gke-managed-hyperdisk test playbook and example blueprint
Replaces the TensorFlow jobs with FIO jobs in the `gke-managed-hyperdisk` example and accordingly updates its corresponding test playbook.
1 parent 2d5b02d commit 2f7f378
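
For context, the updated example is deployed like the other Cluster Toolkit example blueprints. The commands below are only a minimal sketch, assuming the repository's gcluster CLI (formerly ghpc) and placeholder variable values; the deployment folder name and required variables depend on the blueprint's vars block.

# Sketch: render and deploy the example blueprint (CLI name and flags assumed)
./gcluster create examples/gke-managed-hyperdisk.yaml \
  --vars project_id=<your-project>,region=<your-region>,zone=<your-zone>
./gcluster deploy <deployment-folder>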

File tree

2 files changed: +89 -123 lines changed


examples/gke-managed-hyperdisk.yaml

Lines changed: 52 additions & 108 deletions
@@ -122,106 +122,24 @@ deployment_groups:
     settings:
       name: sample-pool
       zones: [$(vars.zone)]
-      machine_type: c3-standard-88 # Hyperdisk-extreme required C3 machine with 88 or more vCPUs
+      machine_type: c3-standard-88 # Hyperdisk-extreme requires C3 machine with 88 or more vCPUs
       auto_upgrade: true
 
-  # Train a TensorFlow model with Keras and Hyperdisk Balanced on GKE
-  # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
-  - id: hyperdisk-balanced-job
+  # This is an example job that will install and run an `fio` benchmark against the hyperdisk volumes.
+  # For more FIO tests, see https://cloud.google.com/compute/docs/disks/benchmark-hyperdisk-performance
+  - id: fio-bench-job-template
     source: modules/compute/gke-job-template
     use:
     - gke_cluster
     - hyperdisk-balanced-setup
-    settings:
-      name: tensorflow
-      image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
-      security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
-      - key: runAsUser
-        value: 1000
-      - key: runAsGroup
-        value: 100
-      - key: fsGroup
-        value: 100
-      command:
-      - bash
-      - -c
-      - |
-        pip install transformers datasets
-        python - <<EOF
-        from datasets import load_dataset
-        dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-balanced-pvc-0')
-        dataset = dataset["train"]
-        from transformers import AutoTokenizer
-        import numpy as np
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        sentences = [str(s) for s in dataset["sentence"]]
-        tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
-        tokenized_data = dict(tokenized_data)
-        labels = np.array(dataset["label"])
-        from transformers import TFAutoModelForSequenceClassification
-        from tensorflow.keras.optimizers import Adam
-        model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
-        model.compile(optimizer=Adam(3e-5))
-        model.fit(tokenized_data, labels)
-        EOF
-      node_count: 1
-    outputs: [instructions]
-
-  # Train a TensorFlow model with Keras and Hyperdisk Extreme on GKE
-  # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
-  - id: hyperdisk-extreme-job
-    source: modules/compute/gke-job-template
-    use:
-    - gke_cluster
     - hyperdisk-extreme-setup
-    settings:
-      name: tensorflow
-      image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
-      security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
-      - key: runAsUser
-        value: 1000
-      - key: runAsGroup
-        value: 100
-      - key: fsGroup
-        value: 100
-      command:
-      - bash
-      - -c
-      - |
-        pip install transformers datasets
-        python - <<EOF
-        from datasets import load_dataset
-        dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-extreme-pvc-0')
-        dataset = dataset["train"]
-        from transformers import AutoTokenizer
-        import numpy as np
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        sentences = [str(s) for s in dataset["sentence"]]
-        tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
-        tokenized_data = dict(tokenized_data)
-        labels = np.array(dataset["label"])
-        from transformers import TFAutoModelForSequenceClassification
-        from tensorflow.keras.optimizers import Adam
-        model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
-        model.compile(optimizer=Adam(3e-5))
-        model.fit(tokenized_data, labels)
-        EOF
-      node_count: 1
-    outputs: [instructions]
-
-  # Train a TensorFlow model with Keras and Hyperdisk Throughput on GKE
-  # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
-  - id: hyperdisk-throughput-job
-    source: modules/compute/gke-job-template
-    use:
-    - gke_cluster
     - hyperdisk-throughput-setup
     settings:
-      name: tensorflow
-      image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
-      security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
+      name: fio-benchmark
+      image: ubuntu:latest
+      security_context: # to make sure the job has enough access to install the fio packages
       - key: runAsUser
-        value: 1000
+        value: 0
       - key: runAsGroup
         value: 100
       - key: fsGroup
@@ -230,23 +148,49 @@ deployment_groups:
       - bash
      - -c
       - |
-        pip install transformers datasets
-        python - <<EOF
-        from datasets import load_dataset
-        dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-throughput-pvc-0')
-        dataset = dataset["train"]
-        from transformers import AutoTokenizer
-        import numpy as np
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        sentences = [str(s) for s in dataset["sentence"]]
-        tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
-        tokenized_data = dict(tokenized_data)
-        labels = np.array(dataset["label"])
-        from transformers import TFAutoModelForSequenceClassification
-        from tensorflow.keras.optimizers import Adam
-        model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
-        model.compile(optimizer=Adam(3e-5))
-        model.fit(tokenized_data, labels)
-        EOF
+
+        set -eux
+        export DEBIAN_FRONTEND=noninteractive
+
+        # Install fio
+        apt update -y && apt install -y fio
+
+        # Use a tag to create a unique path for tests
+        TAG=`date +%s`
+
+        # Verify mountpoints
+        df -h
+        mountpoint /data/hyperdisk-balanced-pvc-0
+        mountpoint /data/hyperdisk-extreme-pvc-0
+        mountpoint /data/hyperdisk-throughput-pvc-0
+
+        # Create temporary directories for fio benchmarks
+        mkdir -p /data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}
+        mkdir -p /data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}
+        mkdir -p /data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}
+
+        # Perform hyperdisk balanced performance (Mixed IOPS) test
+        fio --name=hyperdisk-balanced-iops --ioengine=libaio --iodepth=256 --rw=randrw \
+          --bs=4k --direct=1 --size=10G --numjobs=16 --group_reporting --time_based --runtime=300s \
+          --ramp_time=10s --iodepth_batch_submit=256 --iodepth_batch_complete_max=256 \
+          --directory=/data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG} --filename_format=fiotest-balanced-iops
+
+        # Perform hyperdisk extreme performance test (Max IOPS)
+        fio --name=hyperdisk-extreme-iops --ioengine=libaio --iodepth=256 --rw=randwrite \
+          --bs=4k --direct=1 --size=10G --numjobs=32 --group_reporting --time_based --runtime=300s --ramp_time=10s \
+          --iodepth_batch_submit=256 --iodepth_batch_complete_max=256 \
+          --directory=/data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG} --filename_format=fiotest-extreme-iops
+
+        # Perform hyperdisk throughput performance test
+        fio --name=hyperdisk-throughput-bw --ioengine=libaio --iodepth=64 --rw=write --bs=1M \
+          --direct=1 --size=10G --numjobs=32 --group_reporting --time_based --runtime=300s --ramp_time=10s \
+          --iodepth_batch_submit=64 --iodepth_batch_complete_max=64 \
+          --directory=/data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG} --filename_format=fiotest-throughput-bw
+
+        # Clean up temporary directories for fio benchmarks
+        rm -rf /data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}
+        rm -rf /data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}
+        rm -rf /data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}
       node_count: 1
+
     outputs: [instructions]
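
Once the blueprint is deployed, the gke-job-template module renders a Kubernetes Job manifest for the fio benchmark (the test playbook below globs for it as primary/fio-benchmark*). The snippet that follows is only a rough sketch of submitting that manifest by hand and reading the results; the exact file and generated job names come from the module's instructions output and are assumptions here.

# Sketch: submit the rendered fio Job and read its results (paths and names assumed)
kubectl create -f <deployment-folder>/primary/fio-benchmark*.yaml
kubectl get jobs                      # note the generated job name
kubectl logs -f job/<fio-job-name>    # fio prints per-test IOPS and bandwidth summaries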

tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-hyperdisk.yml

Lines changed: 37 additions & 15 deletions
@@ -12,30 +12,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+- name: Assert variables are defined
+  ansible.builtin.assert:
+    that:
+    - region is defined
+    - custom_vars.project is defined
+
 - name: Get cluster credentials for kubectl
   delegate_to: localhost
   ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} --verbosity=debug
 
-- name: Execute the job
+- name: Run the FIO benchmark job and get its name
   delegate_to: localhost
   ansible.builtin.shell: |
-    jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*)
-    for job in "${jobs[@]}"; do
-      kubectl create -f "$job" -v=9
-    done
+    job_file=({{ workspace }}/{{ deployment_name }}/primary/fio-benchmark*)
+    # Assuming only one benchmark file matches
+    kubectl create -f "${job_file[0]}" -o=jsonpath='{.metadata.name}'
   args:
     executable: /bin/bash
-  changed_when: False
+  register: fio_job_create_output
+
+- name: Set FIO job name
+  ansible.builtin.set_fact:
+    fio_job_name: "{{ fio_job_create_output.stdout }}"
 
-- name: Wait for job to complete
+- name: Wait for FIO Job to complete
+  # The FIO job should take approximately 20 minutes; the task times out after a max wait of 40 minutes
   delegate_to: localhost
-  ansible.builtin.command: |
-    kubectl get job --field-selector status.successful=1 -v=9
-  register: job_completion
-  until: job_completion.stdout_lines | length > 3 # 3 jobs total
+  ansible.builtin.shell: "kubectl get job {{ fio_job_name }} -o jsonpath='{.status.succeeded}'"
+  register: fio_job_status
+  until: fio_job_status.stdout == '1'
   retries: 80
-  delay: 15
+  delay: 30
+
+- name: Fetch logs from the FIO job pod and save to fio_pod_logs.txt
+  delegate_to: localhost
+  ansible.builtin.shell: |
+    pod_name="$(kubectl get pods -l job-name={{ fio_job_name }} -o jsonpath='{.items[0].metadata.name}')"
+    kubectl logs "$pod_name" > fio_pod_logs.txt
+    cat fio_pod_logs.txt
+  register: fio_test_logs
+
+- name: Print the FIO test logs
+  debug:
+    msg: "{{fio_test_logs.stdout}}"
 
-- name: Print job_completion debug output
-  ansible.builtin.debug:
-    var: job_completion.stdout_lines
+- name: Clean up FIO job
+  delegate_to: localhost
+  ansible.builtin.shell: |
+    kubectl delete job {{ fio_job_name }} -v=9
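
The polling task above re-runs kubectl get job with a jsonpath on .status.succeeded up to 80 times with a 30-second delay, which matches the roughly 40-minute ceiling noted in the comment. As a sketch only, the same wait-fetch-clean-up flow can be reproduced interactively, where FIO_JOB_NAME stands in for the name captured by the create task:

# Sketch: wait for the benchmark Job, pull its logs, then remove it (job name assumed)
kubectl wait --for=condition=complete "job/${FIO_JOB_NAME}" --timeout=40m
kubectl logs "job/${FIO_JOB_NAME}" > fio_pod_logs.txt
kubectl delete job "${FIO_JOB_NAME}"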
