diff --git a/deploy/charts/ray/templates/operator_cluster_scoped.yaml b/deploy/charts/ray/templates/operator_cluster_scoped.yaml
index ff93c524be59..658ad0bdad42 100644
--- a/deploy/charts/ray/templates/operator_cluster_scoped.yaml
+++ b/deploy/charts/ray/templates/operator_cluster_scoped.yaml
@@ -83,6 +83,59 @@ spec:
     spec:
       serviceAccountName: ray-operator-serviceaccount
       containers:
+      {{- if (.Values.tensorboard.enable) }}
+      # Optional TensorBoard container; rendered only when tensorboard.enable is true.
+      - name: tensorboard
+        # `sh -c` executes only its first argument as the command string, so the
+        # log directory has to be part of that same string.
+        command: ["/bin/sh", "-c"]
+        args: ["tensorboard --logdir {{ .Values.tensorboard.path }}"]
+        imagePullPolicy: IfNotPresent
+        image: guidebooks/tensorboard
+        env:
+        - name: TBOARD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        # S3 settings are read from the Secret named by tensorboard.secret.
+        - name: AWS_ACCESS_KEY_ID
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.tensorboard.secret | quote }}
+              key: S3_ACCESS_KEY_ID
+        - name: AWS_SECRET_ACCESS_KEY
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.tensorboard.secret | quote }}
+              key: S3_SECRET_ACCESS_KEY
+        - name: AWS_REGION
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.tensorboard.secret | quote }}
+              key: S3_REGION
+        resources:
+          requests:
+            cpu: 1
+            memory: 2Gi
+            #ephemeral-storage: 1Gi
+          limits:
+            memory: 4Gi
+            cpu: 2
+        ports:
+        - name: tboard
+          containerPort: 6006
+          protocol: TCP
+        # NOTE(review): the probes target the pod IP; confirm the image's TensorBoard
+        # listens on a non-loopback address (e.g. via --bind_all) - TODO verify.
+        livenessProbe:
+          httpGet:
+            path: /
+            port: tboard
+        readinessProbe:
+          httpGet:
+            path: /
+            port: tboard
+      {{- end }}
       - name: mlflow-server
         imagePullPolicy: IfNotPresent
         image: guidebooks/mlflow
@@ -132,4 +185,12 @@ spec:
     protocol: TCP
     port: 9080
     targetPort: http
+  {{- if (.Values.tensorboard.enable) }}
+  # NOTE(review): port 6066 differs from the container's 6006; traffic still
+  # reaches the container through the named targetPort - confirm 6066 is intended.
+  - name: tboard
+    protocol: TCP
+    port: 6066
+    targetPort: tboard
+  {{- end }}
 {{- end }}
diff --git a/deploy/charts/ray/templates/operator_namespaced.yaml b/deploy/charts/ray/templates/operator_namespaced.yaml
index 41786ff47cbd..e109df9787f1 100644
--- a/deploy/charts/ray/templates/operator_namespaced.yaml
+++ b/deploy/charts/ray/templates/operator_namespaced.yaml
@@ -69,6 +69,7 @@ apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: monitoring
+  namespace: {{ .Values.operatorNamespace }}
 spec:
   replicas: 1
   selector:
@@ -83,6 +84,59 @@ spec:
     spec:
       serviceAccountName: ray-operator-serviceaccount
       containers:
+      {{- if (.Values.tensorboard.enable) }}
+      # Optional TensorBoard container; rendered only when tensorboard.enable is true.
+      - name: tensorboard
+        # `sh -c` executes only its first argument as the command string, so the
+        # log directory has to be part of that same string.
+        command: ["/bin/sh", "-c"]
+        args: ["tensorboard --logdir {{ .Values.tensorboard.path }}"]
+        imagePullPolicy: IfNotPresent
+        image: guidebooks/tensorboard
+        env:
+        - name: TBOARD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        # S3 settings are read from the Secret named by tensorboard.secret.
+        - name: AWS_ACCESS_KEY_ID
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.tensorboard.secret | quote }}
+              key: S3_ACCESS_KEY_ID
+        - name: AWS_SECRET_ACCESS_KEY
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.tensorboard.secret | quote }}
+              key: S3_SECRET_ACCESS_KEY
+        - name: AWS_REGION
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.tensorboard.secret | quote }}
+              key: S3_REGION
+        resources:
+          requests:
+            cpu: 1
+            memory: 2Gi
+            #ephemeral-storage: 1Gi
+          limits:
+            memory: 4Gi
+            cpu: 2
+        ports:
+        - name: tboard
+          containerPort: 6006
+          protocol: TCP
+        # NOTE(review): the probes target the pod IP; confirm the image's TensorBoard
+        # listens on a non-loopback address (e.g. via --bind_all) - TODO verify.
+        livenessProbe:
+          httpGet:
+            path: /
+            port: tboard
+        readinessProbe:
+          httpGet:
+            path: /
+            port: tboard
+      {{- end }}
       - name: mlflow-server
         imagePullPolicy: IfNotPresent
         image: guidebooks/mlflow
@@ -122,6 +176,7 @@ apiVersion: v1
 kind: Service
 metadata:
   name: monitoring-service
+  namespace: {{ .Values.operatorNamespace }}
 spec:
   selector:
     component: mlflow
@@ -131,4 +186,12 @@ spec:
     protocol: TCP
     port: 9080
     targetPort: http
+  {{- if (.Values.tensorboard.enable) }}
+  # NOTE(review): port 6066 differs from the container's 6006; traffic still
+  # reaches the container through the named targetPort - confirm 6066 is intended.
+  - name: tboard
+    protocol: TCP
+    port: 6066
+    targetPort: tboard
+  {{- end }}
 {{- end }}
diff --git a/deploy/charts/ray/values.yaml b/deploy/charts/ray/values.yaml
index 388dadb9e178..56d0d60cd63d 100644
--- a/deploy/charts/ray/values.yaml
+++ b/deploy/charts/ray/values.yaml
@@ -109,3 +109,9 @@ operatorNamespace: default
 # by the operator. In other words, the images specified under the fields `operatorImage` and `image`
 # should carry matching Ray versions.
 operatorImage: rayproject/ray:latest
+
+# Non-Ray values, specific to the monitoring deployment.
+# The templates read tensorboard.enable, tensorboard.path (the --logdir
+# argument) and tensorboard.secret (the Secret providing the S3_* keys).
+tensorboard:
+  enable: false