diff --git a/.github/workflows/ci-full.yaml b/.github/workflows/ci-full.yaml index 90042f65..929d9534 100644 --- a/.github/workflows/ci-full.yaml +++ b/.github/workflows/ci-full.yaml @@ -46,8 +46,8 @@ jobs: - name: Mount CVMFS run: | kubectl create namespace cvmfs-csi - helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi --values ci/values-cvmfs-csi.yaml - kubectl apply -f ci/cvmfs-storageclass.yaml -n cvmfs-csi + helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi --values cvmfs/values-cvmfs-csi.yaml + kubectl apply -f cvmfs/cvmfs-storageclass.yaml -n cvmfs-csi - name: Deploy Helm chart run: | @@ -98,7 +98,7 @@ jobs: - name: Run Perf Analyzer Job run: | - kubectl apply -f ci/perf-analyzer-job.yaml + kubectl apply -f tests/perf-analyzer-job-ci.yaml kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=300s || \ (echo "Perf-analyzer job did not complete in time or failed." && exit 1) diff --git a/.github/workflows/ci-local.sh b/.github/workflows/ci-local.sh index a7c49461..94db1769 100644 --- a/.github/workflows/ci-local.sh +++ b/.github/workflows/ci-local.sh @@ -36,8 +36,8 @@ helm install keda kedacore/keda --namespace keda echo "Mounting CVMFS..." kubectl create namespace cvmfs-csi helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi \ - --values ci/values-cvmfs-csi.yaml -kubectl apply -f ci/cvmfs-storageclass.yaml -n cvmfs-csi + --values cvmfs/values-cvmfs-csi.yaml +kubectl apply -f cvmfs/cvmfs-storageclass.yaml -n cvmfs-csi # 7. Deploy the Helm chart for supersonic echo "Deploying Helm chart for supersonic..." @@ -82,7 +82,7 @@ kubectl get all -n cms # 10. Run Perf Analyzer Job echo "Running Perf Analyzer Job..." 
-kubectl apply -f ci/perf-analyzer-job.yaml +kubectl apply -f tests/perf-analyzer-job-ci.yaml kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=180s || { echo "Perf-analyzer job did not complete in time or failed." exit 1 diff --git a/.github/workflows/helm-lint.yaml b/.github/workflows/helm-lint.yaml index ddf09b4b..0af048b2 100644 --- a/.github/workflows/helm-lint.yaml +++ b/.github/workflows/helm-lint.yaml @@ -35,7 +35,7 @@ jobs: - name: Generate JSON schema run: | - python ci/yaml-to-schema.py helm/supersonic/values.yaml helm/supersonic/values.schema.json + python .github/workflows/yaml-to-schema.py helm/supersonic/values.yaml helm/supersonic/values.schema.json - name: Commit and push changes env: diff --git a/ci/yaml-to-schema.py b/.github/workflows/yaml-to-schema.py similarity index 100% rename from ci/yaml-to-schema.py rename to .github/workflows/yaml-to-schema.py diff --git a/README.md b/README.md index 2539efa1..8a6dbda6 100644 --- a/README.md +++ b/README.md @@ -27,24 +27,145 @@ Currently, SuperSONIC supports the following functionality: ## Installation -**Pre-requisites:** -- a Kubernetes cluster with access to GPUs -- a Prometheus instance installed on the cluster, or Prometheus CRDs to deploy your own instance -- KEDA CRDs installed on the cluster (only if using autoscaling) +### Pre-requisites + +
+ Kubernetes cluster + + ideally with access to GPUs, but CPUs are enough for a minimal deployment. +
+ +
+ Helm + + Helm is a package manager for Kubernetes. + To install Helm on your machine, follow the official instructions at [https://helm.sh/docs/intro/install/](https://helm.sh/docs/intro/install/). +
+ +
+ Custom Resource Definitions (CRDs) – not needed for minimal deployment + + - [Prometheus](https://prometheus.io) CRDs + + If you are using an established Kubernetes cluster (e.g. at an HPC center), there is a high chance that these CRDs are already installed. Otherwise, a cluster admin can install them with the following commands: +
+ How to install Prometheus CRDs + + ``` + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo update + kubectl create namespace monitoring + helm install prometheus-operator prometheus-community/kube-prometheus-stack --namespace monitoring --set prometheusOperator.createCustomResource=false --set defaultRules.create=false --set alertmanager.enabled=false --set prometheus.enabled=false --set grafana.enabled=false + ``` +
+ - [KEDA](https://keda.sh) CRDs (only if using autoscaling) + +
+ How to install KEDA CRDs + + ``` + helm repo add kedacore https://kedacore.github.io/charts + helm repo update + kubectl create namespace keda + helm install keda kedacore/keda --namespace keda + ``` +
+
+ +--- + +### Standard deployment + +If you are installing SuperSONIC for the first time, proceed to the [Minimal deployment](#minimal-deployment) section below. + +If you already have a functional `values.yaml` and/or have installed SuperSONIC previously, use the following installation commands: + +``` +helm repo add fastml https://fastmachinelearning.org/SuperSONIC +helm repo update +helm install <release-name> fastml/supersonic -n <namespace> -f <your-values.yaml> +``` + +To construct the `values.yaml` file for your application, follow the [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide"). + +The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference"). + +--- + +### Minimal deployment + +
+1. Install the cvmfs-csi plugin to load models from CVMFS + +For an example installation, we will use CMS models loaded from [CVMFS](https://cvmfs.readthedocs.io/en/stable/). SuperSONIC also supports other types of model repositories, including +an arbitrary Persistent Volume, an NFS volume, or S3 storage. + +The [cvmfs-csi](https://github.com/cvmfs-contrib/cvmfs-csi) plugin makes it easy to mount CVMFS +into a Kubernetes cluster by creating a new storage class. A Persistent Volume created with this +storage class will have CVMFS contents visible inside. + +A cluster admin can use the following commands to install `cvmfs-csi`: +``` +kubectl create namespace cvmfs-csi +helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi --values cvmfs/values-cvmfs-csi.yaml +kubectl apply -f cvmfs/cvmfs-storageclass.yaml -n cvmfs-csi +``` +
-Install the latest released version from the Helm repository +2. Install SuperSONIC with a minimal configuration + +The minimal deployment will install only a single CPU-based Triton server and an Envoy Proxy. +We will use [`values/values-minimal.yaml`](values/values-minimal.yaml) as our minimal +configuration file. ``` helm repo add fastml https://fastmachinelearning.org/SuperSONIC helm repo update -helm install fastml/supersonic -n -f +helm install <release-name> fastml/supersonic -n <namespace> -f values/values-minimal.yaml ``` +
+ +
+3. Deploy a test job to run inferences + +To test your SuperSONIC installation, we will create a small [NVIDIA Performance Analyzer](https://docs.nvidia.com/deeplearning/triton-inference-server/archives/triton-inference-server-2280/user-guide/docs/user_guide/perf_analyzer.html) job, +which will send a single inference request with random input data to the Envoy Proxy endpoint. + +1. In `tests/perf-analyzer-job.yaml`, edit the following parameters to match your deployment: + + ``` + metadata: + namespace: <namespace> + ``` + + In the `perf_analyzer` command: + + ``` + -u <release-name>.<namespace>.svc.cluster.local:8001 + ``` + +2. Submit the job to your Kubernetes cluster: + + ``` + kubectl apply -n <namespace> -f tests/perf-analyzer-job.yaml + ``` + +3. Track the job's progress and inspect its logs: + + ``` + kubectl get pods -l job-name=perf-analyzer-job -n <namespace> + kubectl logs <pod-name> -n <namespace> + ```
+--- + +### Installing from a GitHub branch/tag/commit +
-Install directly from a GitHub branch/tag/commit +This option may be useful for testing unreleased features. ``` git clone https://github.com/fastmachinelearning/SuperSONIC.git @@ -56,9 +177,6 @@ helm install helm/supersonic -n -f
-To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide"). - -The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference"). ## Server diagram @@ -76,6 +194,7 @@ The full list of configuration parameters is available in the [Configuration ref | **[Purdue Anvil](https://www.rcac.purdue.edu/compute/anvil)** | ✅ | - | - | | **[NRP Nautilus](https://docs.nationalresearchplatform.org)** | ✅ | ✅ | ✅ | | **[UChicago](https://af.uchicago.edu/)** | - | ✅ | - | +| **[UW–Madison](https://www.hep.wisc.edu/cms/comp/)** | ⏳ | - | - | ## Publications diff --git a/ci/cvmfs-storageclass.yaml b/cvmfs/cvmfs-storageclass.yaml similarity index 100% rename from ci/cvmfs-storageclass.yaml rename to cvmfs/cvmfs-storageclass.yaml diff --git a/ci/values-cvmfs-csi.yaml b/cvmfs/values-cvmfs-csi.yaml similarity index 100% rename from ci/values-cvmfs-csi.yaml rename to cvmfs/values-cvmfs-csi.yaml diff --git a/helm/supersonic/README.md b/helm/supersonic/README.md index 2539efa1..8a6dbda6 100644 --- a/helm/supersonic/README.md +++ b/helm/supersonic/README.md @@ -27,24 +27,145 @@ Currently, SuperSONIC supports the following functionality: ## Installation -**Pre-requisites:** -- a Kubernetes cluster with access to GPUs -- a Prometheus instance installed on the cluster, or Prometheus CRDs to deploy your own instance -- KEDA CRDs installed on the cluster (only if using autoscaling) +### Pre-requisites + +
+ Kubernetes cluster + + ideally with access to GPUs, but CPUs are enough for a minimal deployment. +
+ +
+ Helm + + Helm is a package manager for Kubernetes. + To install Helm on your machine, follow the official instructions at [https://helm.sh/docs/intro/install/](https://helm.sh/docs/intro/install/). +
+ +
+ Custom Resource Definitions (CRDs) – not needed for minimal deployment + + - [Prometheus](https://prometheus.io) CRDs + + If you are using an established Kubernetes cluster (e.g. at an HPC center), there is a high chance that these CRDs are already installed. Otherwise, a cluster admin can install them with the following commands: +
+ How to install Prometheus CRDs + + ``` + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo update + kubectl create namespace monitoring + helm install prometheus-operator prometheus-community/kube-prometheus-stack --namespace monitoring --set prometheusOperator.createCustomResource=false --set defaultRules.create=false --set alertmanager.enabled=false --set prometheus.enabled=false --set grafana.enabled=false + ``` +
+ - [KEDA](https://keda.sh) CRDs (only if using autoscaling) + +
+ How to install KEDA CRDs + + ``` + helm repo add kedacore https://kedacore.github.io/charts + helm repo update + kubectl create namespace keda + helm install keda kedacore/keda --namespace keda + ``` +
+
+ +--- + +### Standard deployment + +If you are installing SuperSONIC for the first time, proceed to the [Minimal deployment](#minimal-deployment) section below. + +If you already have a functional `values.yaml` and/or have installed SuperSONIC previously, use the following installation commands: + +``` +helm repo add fastml https://fastmachinelearning.org/SuperSONIC +helm repo update +helm install <release-name> fastml/supersonic -n <namespace> -f <your-values.yaml> +``` + +To construct the `values.yaml` file for your application, follow the [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide"). + +The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference"). + +--- + +### Minimal deployment + +
+1. Install the cvmfs-csi plugin to load models from CVMFS + +For an example installation, we will use CMS models loaded from [CVMFS](https://cvmfs.readthedocs.io/en/stable/). SuperSONIC also supports other types of model repositories, including +an arbitrary Persistent Volume, an NFS volume, or S3 storage. + +The [cvmfs-csi](https://github.com/cvmfs-contrib/cvmfs-csi) plugin makes it easy to mount CVMFS +into a Kubernetes cluster by creating a new storage class. A Persistent Volume created with this +storage class will have CVMFS contents visible inside. + +A cluster admin can use the following commands to install `cvmfs-csi`: +``` +kubectl create namespace cvmfs-csi +helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi --values cvmfs/values-cvmfs-csi.yaml +kubectl apply -f cvmfs/cvmfs-storageclass.yaml -n cvmfs-csi +``` +
-Install the latest released version from the Helm repository +2. Install SuperSONIC with a minimal configuration + +The minimal deployment will install only a single CPU-based Triton server and an Envoy Proxy. +We will use [`values/values-minimal.yaml`](values/values-minimal.yaml) as our minimal +configuration file. ``` helm repo add fastml https://fastmachinelearning.org/SuperSONIC helm repo update -helm install fastml/supersonic -n -f +helm install <release-name> fastml/supersonic -n <namespace> -f values/values-minimal.yaml ``` +
+ +
+3. Deploy a test job to run inferences + +To test your SuperSONIC installation, we will create a small [NVIDIA Performance Analyzer](https://docs.nvidia.com/deeplearning/triton-inference-server/archives/triton-inference-server-2280/user-guide/docs/user_guide/perf_analyzer.html) job, +which will send a single inference request with random input data to the Envoy Proxy endpoint. + +1. In `tests/perf-analyzer-job.yaml`, edit the following parameters to match your deployment: + + ``` + metadata: + namespace: <namespace> + ``` + + In the `perf_analyzer` command: + + ``` + -u <release-name>.<namespace>.svc.cluster.local:8001 + ``` + +2. Submit the job to your Kubernetes cluster: + + ``` + kubectl apply -n <namespace> -f tests/perf-analyzer-job.yaml + ``` + +3. Track the job's progress and inspect its logs: + + ``` + kubectl get pods -l job-name=perf-analyzer-job -n <namespace> + kubectl logs <pod-name> -n <namespace> + ```
+--- + +### Installing from a GitHub branch/tag/commit +
-Install directly from a GitHub branch/tag/commit +This option may be useful for testing unreleased features. ``` git clone https://github.com/fastmachinelearning/SuperSONIC.git @@ -56,9 +177,6 @@ helm install helm/supersonic -n -f
-To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide"). - -The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference"). ## Server diagram @@ -76,6 +194,7 @@ The full list of configuration parameters is available in the [Configuration ref | **[Purdue Anvil](https://www.rcac.purdue.edu/compute/anvil)** | ✅ | - | - | | **[NRP Nautilus](https://docs.nationalresearchplatform.org)** | ✅ | ✅ | ✅ | | **[UChicago](https://af.uchicago.edu/)** | - | ✅ | - | +| **[UW–Madison](https://www.hep.wisc.edu/cms/comp/)** | ⏳ | - | - | ## Publications diff --git a/ci/perf-analyzer-job.yaml b/tests/perf-analyzer-job-ci.yaml similarity index 100% rename from ci/perf-analyzer-job.yaml rename to tests/perf-analyzer-job-ci.yaml diff --git a/ci/perf-analyzer-job-local.yaml b/tests/perf-analyzer-job.yaml similarity index 61% rename from ci/perf-analyzer-job-local.yaml rename to tests/perf-analyzer-job.yaml index 94860643..ee36e54f 100644 --- a/ci/perf-analyzer-job-local.yaml +++ b/tests/perf-analyzer-job.yaml @@ -2,11 +2,11 @@ apiVersion: batch/v1 kind: Job metadata: name: perf-analyzer-job - namespace: sonic-server + namespace: default spec: parallelism: 1 completions: 1 - backoffLimit: 1000 + backoffLimit: 0 template: spec: restartPolicy: OnFailure @@ -19,17 +19,15 @@ spec: - | echo "Running perf_analyzer..." 
perf_analyzer -i grpc \ - -m deeptau_2018v2p5 \ - -u supersonic.sonic-server.svc.cluster.local:8001 \ + -m higgsInteractionNet \ + -u supersonic.default.svc.cluster.local:8001 \ --async -p 1 -b 100 \ - --request-count=10 \ - --concurrency-range=8 --input-data "random" + --request-count=1 \ + --concurrency-range=1 --input-data "random" resources: requests: cpu: 1 memory: "2G" limits: cpu: 1 - memory: "2G" - # nodeSelector: - # topology.kubernetes.io/zone: ucsd \ No newline at end of file + memory: "2G" \ No newline at end of file