161 changes: 53 additions & 108 deletions examples/gke-managed-hyperdisk.yaml
@@ -122,106 +122,24 @@ deployment_groups:
settings:
name: sample-pool
zones: [$(vars.zone)]
machine_type: c3-standard-88 # Hyperdisk-extreme required C3 machine with 88 or more vCPUs
machine_type: c3-standard-88 # Hyperdisk-extreme requires a C3 machine with 88 or more vCPUs
auto_upgrade: true

# Train a TensorFlow model with Keras and Hyperdisk Balanced on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
- id: hyperdisk-balanced-job
# This is an example job that installs and runs an `fio` benchmark against the hyperdisk volumes.
# For more FIO tests, see https://cloud.google.com/compute/docs/disks/benchmark-hyperdisk-performance
- id: fio-bench-job-template
source: modules/compute/gke-job-template
use:
- gke_cluster
- hyperdisk-balanced-setup
settings:
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
- key: runAsUser
value: 1000
- key: runAsGroup
value: 100
- key: fsGroup
value: 100
command:
- bash
- -c
- |
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-balanced-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentences = [str(s) for s in dataset["sentence"]]
tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF
node_count: 1
outputs: [instructions]

# Train a TensorFlow model with Keras and Hyperdisk Extreme on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
- id: hyperdisk-extreme-job
source: modules/compute/gke-job-template
use:
- gke_cluster
- hyperdisk-extreme-setup
settings:
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
- key: runAsUser
value: 1000
- key: runAsGroup
value: 100
- key: fsGroup
value: 100
command:
- bash
- -c
- |
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-extreme-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentences = [str(s) for s in dataset["sentence"]]
tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF
node_count: 1
outputs: [instructions]

# Train a TensorFlow model with Keras and Hyperdisk Throughput on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
- id: hyperdisk-throughput-job
source: modules/compute/gke-job-template
use:
- gke_cluster
- hyperdisk-throughput-setup
settings:
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
name: fio-benchmark
image: ubuntu:latest
security_context: # to make sure the job has enough access to install the fio packages
- key: runAsUser
value: 1000
value: 0
- key: runAsGroup
value: 100
- key: fsGroup
@@ -230,23 +148,50 @@ deployment_groups:
- bash
- -c
- |
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-throughput-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentences = [str(s) for s in dataset["sentence"]]
tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF

set -eux
export DEBIAN_FRONTEND=noninteractive

# Install fio
apt update -y && apt install -y fio

# Use a tag to create a unique path for tests
TAG=`date +%s`

# Verify mountpoints
df -h
mountpoint /data/hyperdisk-balanced-pvc-0
mountpoint /data/hyperdisk-extreme-pvc-0
mountpoint /data/hyperdisk-throughput-pvc-0

# Create temporary directories for fio benchmarks
mkdir -p /data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}
mkdir -p /data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}
mkdir -p /data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}

# Perform hyperdisk balanced performance (Mixed IOPS) test
fio --name=hyperdisk-balanced-iops --ioengine=libaio --iodepth=256 --rw=randrw \
--bs=4k --direct=1 --size=10G --numjobs=16 --group_reporting --time_based --runtime=120s \
--ramp_time=10s --iodepth_batch_submit=256 --iodepth_batch_complete_max=256 \
--directory=/data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG} --filename_format=fiotest-balanced-iops

# Perform hyperdisk extreme performance test (write throughput)
fio --name=global --group_reporting=1 --filesize=10G --numjobs=4 --size=5G --offset_increment=5G \
--time_based --runtime=120s --ramp_time=10s --ioengine=libaio --direct=1 --verify=0 --bs=1M --iodepth=8 \
--rw=write --directory=/data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG} --filename_format=fiotest-extreme-write \
--name=write_throughput --numa_cpu_nodes=0 --name=write_throughput_1 --numa_cpu_nodes=1 \
--name=write_throughput_2 --numa_cpu_nodes=2 --name=write_throughput_3 --numa_cpu_nodes=3

# Perform hyperdisk throughput performance test
fio --name=hyperdisk-throughput-bw --ioengine=libaio --iodepth=64 --rw=write --bs=1M \
--direct=1 --size=10G --numjobs=32 --group_reporting --time_based --runtime=120s --ramp_time=10s \
--iodepth_batch_submit=64 --iodepth_batch_complete_max=64 \
--directory=/data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG} --filename_format=fiotest-throughput-bw

# Clean up temporary directories for fio benchmarks
rm -rf /data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}
rm -rf /data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}
rm -rf /data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}
node_count: 1

outputs: [instructions]
@@ -12,30 +12,52 @@
# See the License for the specific language governing permissions and
# limitations under the License.

- name: Assert variables are defined
ansible.builtin.assert:
that:
- region is defined
- custom_vars.project is defined

- name: Get cluster credentials for kubectl
delegate_to: localhost
ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} --verbosity=debug

- name: Execute the job
- name: Run the FIO benchmark job and get its name
delegate_to: localhost
ansible.builtin.shell: |
jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*)
for job in "${jobs[@]}"; do
kubectl create -f "$job" -v=9
done
job_file=({{ workspace }}/{{ deployment_name }}/primary/fio-benchmark*)
# Assuming only one benchmark file matches
kubectl create -f "${job_file[0]}" -o=jsonpath='{.metadata.name}'
args:
executable: /bin/bash
changed_when: False
register: fio_job_create_output

- name: Set FIO job name
ansible.builtin.set_fact:
fio_job_name: "{{ fio_job_create_output.stdout }}"

- name: Wait for job to complete
- name: Wait for FIO Job to complete
# The FIO job should take approximately 20 minutes; the task times out after a max wait of 45 minutes (90 retries x 30s delay)
delegate_to: localhost
ansible.builtin.command: |
kubectl get job --field-selector status.successful=1 -v=9
register: job_completion
until: job_completion.stdout_lines | length > 3 # 3 jobs total
retries: 80
delay: 15

- name: Print job_completion debug output
ansible.builtin.debug:
var: job_completion.stdout_lines
ansible.builtin.shell: "kubectl get job {{ fio_job_name }} -o jsonpath='{.status.succeeded}'"
register: fio_job_status
until: fio_job_status.stdout == '1'
retries: 90
delay: 30

- name: Fetch logs from the FIO job pod and save to fio_pod_logs.txt
delegate_to: localhost
ansible.builtin.shell: |
pod_name="$(kubectl get pods -l job-name={{ fio_job_name }} -o jsonpath='{.items[0].metadata.name}')"
kubectl logs "$pod_name" > fio_pod_logs.txt
cat fio_pod_logs.txt
register: fio_test_logs

- name: Print the FIO test logs
ansible.builtin.debug:
msg: "{{ fio_test_logs.stdout }}"

- name: Clean up FIO job
delegate_to: localhost
ansible.builtin.shell: |
kubectl delete job {{ fio_job_name }} -v=9