diff --git a/.github/workflows/ci-full.yaml b/.github/workflows/ci-full.yaml
index 90042f65..929d9534 100644
--- a/.github/workflows/ci-full.yaml
+++ b/.github/workflows/ci-full.yaml
@@ -46,8 +46,8 @@ jobs:
- name: Mount CVMFS
run: |
kubectl create namespace cvmfs-csi
- helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi --values ci/values-cvmfs-csi.yaml
- kubectl apply -f ci/cvmfs-storageclass.yaml -n cvmfs-csi
+ helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi --values cvmfs/values-cvmfs-csi.yaml
+ kubectl apply -f cvmfs/cvmfs-storageclass.yaml -n cvmfs-csi
- name: Deploy Helm chart
run: |
@@ -98,7 +98,7 @@ jobs:
- name: Run Perf Analyzer Job
run: |
- kubectl apply -f ci/perf-analyzer-job.yaml
+ kubectl apply -f tests/perf-analyzer-job-ci.yaml
kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=300s || \
(echo "Perf-analyzer job did not complete in time or failed." && exit 1)
diff --git a/.github/workflows/ci-local.sh b/.github/workflows/ci-local.sh
index a7c49461..94db1769 100644
--- a/.github/workflows/ci-local.sh
+++ b/.github/workflows/ci-local.sh
@@ -36,8 +36,8 @@ helm install keda kedacore/keda --namespace keda
echo "Mounting CVMFS..."
kubectl create namespace cvmfs-csi
helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi \
- --values ci/values-cvmfs-csi.yaml
-kubectl apply -f ci/cvmfs-storageclass.yaml -n cvmfs-csi
+ --values cvmfs/values-cvmfs-csi.yaml
+kubectl apply -f cvmfs/cvmfs-storageclass.yaml -n cvmfs-csi
# 7. Deploy the Helm chart for supersonic
echo "Deploying Helm chart for supersonic..."
@@ -82,7 +82,7 @@ kubectl get all -n cms
# 10. Run Perf Analyzer Job
echo "Running Perf Analyzer Job..."
-kubectl apply -f ci/perf-analyzer-job.yaml
+kubectl apply -f tests/perf-analyzer-job-ci.yaml
kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=180s || {
echo "Perf-analyzer job did not complete in time or failed."
exit 1
diff --git a/.github/workflows/helm-lint.yaml b/.github/workflows/helm-lint.yaml
index ddf09b4b..0af048b2 100644
--- a/.github/workflows/helm-lint.yaml
+++ b/.github/workflows/helm-lint.yaml
@@ -35,7 +35,7 @@ jobs:
- name: Generate JSON schema
run: |
- python ci/yaml-to-schema.py helm/supersonic/values.yaml helm/supersonic/values.schema.json
+ python .github/workflows/yaml-to-schema.py helm/supersonic/values.yaml helm/supersonic/values.schema.json
- name: Commit and push changes
env:
diff --git a/ci/yaml-to-schema.py b/.github/workflows/yaml-to-schema.py
similarity index 100%
rename from ci/yaml-to-schema.py
rename to .github/workflows/yaml-to-schema.py
diff --git a/README.md b/README.md
index 2539efa1..8a6dbda6 100644
--- a/README.md
+++ b/README.md
@@ -27,24 +27,145 @@ Currently, SuperSONIC supports the following functionality:
## Installation
-**Pre-requisites:**
-- a Kubernetes cluster with access to GPUs
-- a Prometheus instance installed on the cluster, or Prometheus CRDs to deploy your own instance
-- KEDA CRDs installed on the cluster (only if using autoscaling)
+### Pre-requisites
+
+
+ Kubernetes cluster
+
+ ideally with access to GPUs, but CPUs are enough for a minimal deployment.
+
+
+
+ Helm
+
+ Helm is a package manager for Kubernetes.
+ To install Helm on your machine, follow the official instructions at [https://helm.sh/docs/intro/install/](https://helm.sh/docs/intro/install/).
+
+
+
+ Custom Resource Definitions (CRDs) – not needed for minimal deployment
+
+ - [Prometheus](https://prometheus.io) CRDs
+
+ If you are using an established Kubernetes cluster (e.g. at an HPC center), there is a high chance that these CRDs are already installed. Otherwise, a cluster admin can install them with the following commands:
+
+ How to install Prometheus CRDs
+
+ ```
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+ helm repo update
+ kubectl create namespace monitoring
+ helm install prometheus-operator prometheus-community/kube-prometheus-stack --namespace monitoring --set prometheusOperator.createCustomResource=false --set defaultRules.create=false --set alertmanager.enabled=false --set prometheus.enabled=false --set grafana.enabled=false
+ ```
+
+ - [KEDA](https://keda.sh) CRDs (only if using autoscaling)
+
+
+ How to install KEDA CRDs
+
+ ```
+ helm repo add kedacore https://kedacore.github.io/charts
+ helm repo update
+ kubectl create namespace keda
+ helm install keda kedacore/keda --namespace keda
+ ```
+
+
+
+---
+
+### Standard deployment
+
+If you are installing SuperSONIC for the first time, proceed to the [Minimal deployment](#minimal-deployment) section below.
+
+If you already have a functional `values.yaml` and/or installed SuperSONIC previously, use the following installation commands:
+
+```
+helm repo add fastml https://fastmachinelearning.org/SuperSONIC
+helm repo update
+helm install <release-name> fastml/supersonic -n <namespace> -f <your-values.yaml>
+```
+
+To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide").
+
+The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference").
+
+---
+
+### Minimal deployment
+
+
+1. Install cvmfs-csi plugin to load models from CVMFS
+
+For an example installation, we will use CMS models loaded from [CVMFS](https://cvmfs.readthedocs.io/en/stable/). SuperSONIC allows other types of model repository, including
+an arbitrary Persistent Volume, an NFS volume, or S3 storage.
+
+The [cvmfs-csi](https://github.com/cvmfs-contrib/cvmfs-csi) plugin makes it easy to mount CVMFS
+into a Kubernetes cluster by creating a new storage class. A Persistent Volume created with this
+storage class will have the CVMFS contents visible inside.
+
+A cluster admin can use the following commands to install `cvmfs-csi`:
+```
+kubectl create namespace cvmfs-csi
+helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi --values cvmfs/values-cvmfs-csi.yaml
+kubectl apply -f cvmfs/cvmfs-storageclass.yaml -n cvmfs-csi
+```
+
-Install the latest released version from the Helm repository
+2. Install SuperSONIC with minimal configuration
+
+The minimal deployment will install only a single CPU-based Triton server and an Envoy Proxy.
+We will use [`values/values-minimal.yaml`](values/values-minimal.yaml) as our minimal
+configuration file.
```
helm repo add fastml https://fastmachinelearning.org/SuperSONIC
helm repo update
-helm install fastml/supersonic -n -f
+helm install <release-name> fastml/supersonic -n <namespace> -f values/values-minimal.yaml
```
+
+
+
+3. Deploy a test job to run inferences
+
+To test your SuperSONIC installation, we will create a small [Nvidia Performance Analyzer](https://docs.nvidia.com/deeplearning/triton-inference-server/archives/triton-inference-server-2280/user-guide/docs/user_guide/perf_analyzer.html) job,
+which will send a single inference request with random input data to the Envoy Proxy endpoint.
+
+1. In `tests/perf-analyzer-job.yaml`, edit the following parameters to match your deployment:
+
+ ```
+ metadata:
+ namespace: <namespace>
+ ```
+
+ In `perf_analyzer` command:
+
+ ```
+ -u <release-name>.<namespace>.svc.cluster.local:8001
+ ```
+
+2. Submit the job to your Kubernetes cluster:
+
+ ```
+ kubectl apply -n <namespace> -f tests/perf-analyzer-job.yaml
+ ```
+
+3. Track job performance and inspect logs:
+
+ ```
+ kubectl get pods -l job-name=perf-analyzer-job -n <namespace>
+ kubectl logs <pod-name> -n <namespace>
+ ```
+---
+
+### Installing from a GitHub branch/tag/commit
+
-Install directly from a GitHub branch/tag/commit
+This option may be useful for testing unreleased features.
```
git clone https://github.com/fastmachinelearning/SuperSONIC.git
@@ -56,9 +177,6 @@ helm install helm/supersonic -n -f
-To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide").
-
-The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference").
## Server diagram
@@ -76,6 +194,7 @@ The full list of configuration parameters is available in the [Configuration ref
| **[Purdue Anvil](https://www.rcac.purdue.edu/compute/anvil)** | ✅ | - | - |
| **[NRP Nautilus](https://docs.nationalresearchplatform.org)** | ✅ | ✅ | ✅ |
| **[UChicago](https://af.uchicago.edu/)** | - | ✅ | - |
+| **[UW–Madison](https://www.hep.wisc.edu/cms/comp/)** | ⏳ | - | - |
## Publications
diff --git a/ci/cvmfs-storageclass.yaml b/cvmfs/cvmfs-storageclass.yaml
similarity index 100%
rename from ci/cvmfs-storageclass.yaml
rename to cvmfs/cvmfs-storageclass.yaml
diff --git a/ci/values-cvmfs-csi.yaml b/cvmfs/values-cvmfs-csi.yaml
similarity index 100%
rename from ci/values-cvmfs-csi.yaml
rename to cvmfs/values-cvmfs-csi.yaml
diff --git a/helm/supersonic/README.md b/helm/supersonic/README.md
index 2539efa1..8a6dbda6 100644
--- a/helm/supersonic/README.md
+++ b/helm/supersonic/README.md
@@ -27,24 +27,145 @@ Currently, SuperSONIC supports the following functionality:
## Installation
-**Pre-requisites:**
-- a Kubernetes cluster with access to GPUs
-- a Prometheus instance installed on the cluster, or Prometheus CRDs to deploy your own instance
-- KEDA CRDs installed on the cluster (only if using autoscaling)
+### Pre-requisites
+
+
+ Kubernetes cluster
+
+ ideally with access to GPUs, but CPUs are enough for a minimal deployment.
+
+
+
+ Helm
+
+ Helm is a package manager for Kubernetes.
+ To install Helm on your machine, follow the official instructions at [https://helm.sh/docs/intro/install/](https://helm.sh/docs/intro/install/).
+
+
+
+ Custom Resource Definitions (CRDs) – not needed for minimal deployment
+
+ - [Prometheus](https://prometheus.io) CRDs
+
+ If you are using an established Kubernetes cluster (e.g. at an HPC center), there is a high chance that these CRDs are already installed. Otherwise, a cluster admin can install them with the following commands:
+
+ How to install Prometheus CRDs
+
+ ```
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+ helm repo update
+ kubectl create namespace monitoring
+ helm install prometheus-operator prometheus-community/kube-prometheus-stack --namespace monitoring --set prometheusOperator.createCustomResource=false --set defaultRules.create=false --set alertmanager.enabled=false --set prometheus.enabled=false --set grafana.enabled=false
+ ```
+
+ - [KEDA](https://keda.sh) CRDs (only if using autoscaling)
+
+
+ How to install KEDA CRDs
+
+ ```
+ helm repo add kedacore https://kedacore.github.io/charts
+ helm repo update
+ kubectl create namespace keda
+ helm install keda kedacore/keda --namespace keda
+ ```
+
+
+
+---
+
+### Standard deployment
+
+If you are installing SuperSONIC for the first time, proceed to the [Minimal deployment](#minimal-deployment) section below.
+
+If you already have a functional `values.yaml` and/or installed SuperSONIC previously, use the following installation commands:
+
+```
+helm repo add fastml https://fastmachinelearning.org/SuperSONIC
+helm repo update
+helm install <release-name> fastml/supersonic -n <namespace> -f <your-values.yaml>
+```
+
+To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide").
+
+The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference").
+
+---
+
+### Minimal deployment
+
+
+1. Install cvmfs-csi plugin to load models from CVMFS
+
+For an example installation, we will use CMS models loaded from [CVMFS](https://cvmfs.readthedocs.io/en/stable/). SuperSONIC allows other types of model repository, including
+an arbitrary Persistent Volume, an NFS volume, or S3 storage.
+
+The [cvmfs-csi](https://github.com/cvmfs-contrib/cvmfs-csi) plugin makes it easy to mount CVMFS
+into a Kubernetes cluster by creating a new storage class. A Persistent Volume created with this
+storage class will have the CVMFS contents visible inside.
+
+A cluster admin can use the following commands to install `cvmfs-csi`:
+```
+kubectl create namespace cvmfs-csi
+helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi --values cvmfs/values-cvmfs-csi.yaml
+kubectl apply -f cvmfs/cvmfs-storageclass.yaml -n cvmfs-csi
+```
+
-Install the latest released version from the Helm repository
+2. Install SuperSONIC with minimal configuration
+
+The minimal deployment will install only a single CPU-based Triton server and an Envoy Proxy.
+We will use [`values/values-minimal.yaml`](values/values-minimal.yaml) as our minimal
+configuration file.
```
helm repo add fastml https://fastmachinelearning.org/SuperSONIC
helm repo update
-helm install fastml/supersonic -n -f
+helm install <release-name> fastml/supersonic -n <namespace> -f values/values-minimal.yaml
```
+
+
+
+3. Deploy a test job to run inferences
+
+To test your SuperSONIC installation, we will create a small [Nvidia Performance Analyzer](https://docs.nvidia.com/deeplearning/triton-inference-server/archives/triton-inference-server-2280/user-guide/docs/user_guide/perf_analyzer.html) job,
+which will send a single inference request with random input data to the Envoy Proxy endpoint.
+
+1. In `tests/perf-analyzer-job.yaml`, edit the following parameters to match your deployment:
+
+ ```
+ metadata:
+ namespace: <namespace>
+ ```
+
+ In `perf_analyzer` command:
+
+ ```
+ -u <release-name>.<namespace>.svc.cluster.local:8001
+ ```
+
+2. Submit the job to your Kubernetes cluster:
+
+ ```
+ kubectl apply -n <namespace> -f tests/perf-analyzer-job.yaml
+ ```
+
+3. Track job performance and inspect logs:
+
+ ```
+ kubectl get pods -l job-name=perf-analyzer-job -n <namespace>
+ kubectl logs <pod-name> -n <namespace>
+ ```
+---
+
+### Installing from a GitHub branch/tag/commit
+
-Install directly from a GitHub branch/tag/commit
+This option may be useful for testing unreleased features.
```
git clone https://github.com/fastmachinelearning/SuperSONIC.git
@@ -56,9 +177,6 @@ helm install helm/supersonic -n -f
-To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide").
-
-The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference").
## Server diagram
@@ -76,6 +194,7 @@ The full list of configuration parameters is available in the [Configuration ref
| **[Purdue Anvil](https://www.rcac.purdue.edu/compute/anvil)** | ✅ | - | - |
| **[NRP Nautilus](https://docs.nationalresearchplatform.org)** | ✅ | ✅ | ✅ |
| **[UChicago](https://af.uchicago.edu/)** | - | ✅ | - |
+| **[UW–Madison](https://www.hep.wisc.edu/cms/comp/)** | ⏳ | - | - |
## Publications
diff --git a/ci/perf-analyzer-job.yaml b/tests/perf-analyzer-job-ci.yaml
similarity index 100%
rename from ci/perf-analyzer-job.yaml
rename to tests/perf-analyzer-job-ci.yaml
diff --git a/ci/perf-analyzer-job-local.yaml b/tests/perf-analyzer-job.yaml
similarity index 61%
rename from ci/perf-analyzer-job-local.yaml
rename to tests/perf-analyzer-job.yaml
index 94860643..ee36e54f 100644
--- a/ci/perf-analyzer-job-local.yaml
+++ b/tests/perf-analyzer-job.yaml
@@ -2,11 +2,11 @@ apiVersion: batch/v1
kind: Job
metadata:
name: perf-analyzer-job
- namespace: sonic-server
+ namespace: default
spec:
parallelism: 1
completions: 1
- backoffLimit: 1000
+ backoffLimit: 0
template:
spec:
restartPolicy: OnFailure
@@ -19,17 +19,15 @@ spec:
- |
echo "Running perf_analyzer..."
perf_analyzer -i grpc \
- -m deeptau_2018v2p5 \
- -u supersonic.sonic-server.svc.cluster.local:8001 \
+ -m higgsInteractionNet \
+ -u supersonic.default.svc.cluster.local:8001 \
--async -p 1 -b 100 \
- --request-count=10 \
- --concurrency-range=8 --input-data "random"
+ --request-count=1 \
+ --concurrency-range=1 --input-data "random"
resources:
requests:
cpu: 1
memory: "2G"
limits:
cpu: 1
- memory: "2G"
- # nodeSelector:
- # topology.kubernetes.io/zone: ucsd
\ No newline at end of file
+ memory: "2G"
\ No newline at end of file