From 0d9d0b111b6e809d7933e5caaccefe604dff790c Mon Sep 17 00:00:00 2001 From: Atin Sood Date: Thu, 21 Jul 2022 14:19:53 -0400 Subject: [PATCH 1/2] feat: added mlflow server --- .../templates/operator_cluster_scoped.yaml | 69 +++++++++++++++++++ .../ray/templates/operator_namespaced.yaml | 67 ++++++++++++++++++ 2 files changed, 136 insertions(+) diff --git a/deploy/charts/ray/templates/operator_cluster_scoped.yaml b/deploy/charts/ray/templates/operator_cluster_scoped.yaml index 5e0221a444f2..ff93c524be59 100644 --- a/deploy/charts/ray/templates/operator_cluster_scoped.yaml +++ b/deploy/charts/ray/templates/operator_cluster_scoped.yaml @@ -63,4 +63,73 @@ spec: limits: memory: 2Gi cpu: 1 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: monitoring + namespace: {{ .Values.operatorNamespace }} +spec: + replicas: 1 + selector: + matchLabels: + component: mlflow + operation: monitoring + template: + metadata: + labels: + component: mlflow + operation: monitoring + spec: + serviceAccountName: ray-operator-serviceaccount + containers: + - name: mlflow-server + imagePullPolicy: IfNotPresent + image: guidebooks/mlflow + env: + - name: MLFLOW_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + resources: + requests: + cpu: 1 + memory: 2Gi + #ephemeral-storage: 1Gi + limits: + memory: 4Gi + cpu: 2 + ports: + - name: http + containerPort: 9080 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: / + port: http + volumeMounts: + - mountPath: /mnt/mlflow-disk + name: mlflow-disk + volumes: + - name: mlflow-disk + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: monitoring-service + namespace: {{ .Values.operatorNamespace }} +spec: + selector: + component: mlflow + operation: monitoring + ports: + - name: mlflow + protocol: TCP + port: 9080 + targetPort: http {{- end }} diff --git a/deploy/charts/ray/templates/operator_namespaced.yaml b/deploy/charts/ray/templates/operator_namespaced.yaml index 4458b630590a..41786ff47cbd 100644 --- a/deploy/charts/ray/templates/operator_namespaced.yaml +++ b/deploy/charts/ray/templates/operator_namespaced.yaml @@ -64,4 +64,71 @@ spec: limits: memory: 2Gi cpu: 1 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: monitoring +spec: + replicas: 1 + selector: + matchLabels: + component: mlflow + operation: monitoring + template: + metadata: + labels: + component: mlflow + operation: monitoring + spec: + serviceAccountName: ray-operator-serviceaccount + containers: + - name: mlflow-server + imagePullPolicy: IfNotPresent + image: guidebooks/mlflow + env: + - name: MLFLOW_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + resources: + requests: + cpu: 1 + memory: 2Gi + #ephemeral-storage: 1Gi + limits: + memory: 4Gi + cpu: 2 + ports: + - name: http + containerPort: 9080 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: / + port: http + volumeMounts: + - mountPath: /mnt/mlflow-disk + name: mlflow-disk + volumes: + - name: mlflow-disk + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: monitoring-service +spec: + selector: + component: mlflow + operation: monitoring + ports: + - name: mlflow + protocol: TCP + port: 9080 + targetPort: http {{- end }} From 2f0f68352698b5081f22bf26491d96d4844a850d Mon Sep 17 00:00:00 2001 From: Atin Sood Date: Fri, 22 Jul 2022 13:38:36 -0400 Subject: [PATCH 2/2] initial tboard support --- .../templates/operator_cluster_scoped.yaml | 53 +++++++++++++++++ .../ray/templates/operator_namespaced.yaml | 57 ++++++++++++++++++- deploy/charts/ray/values.yaml | 3 + 3 files changed, 112 insertions(+), 1 deletion(-) diff --git a/deploy/charts/ray/templates/operator_cluster_scoped.yaml b/deploy/charts/ray/templates/operator_cluster_scoped.yaml index ff93c524be59..658ad0bdad42 100644 --- a/deploy/charts/ray/templates/operator_cluster_scoped.yaml +++ b/deploy/charts/ray/templates/operator_cluster_scoped.yaml @@ -83,6 +83,53 @@ spec: spec: serviceAccountName: ray-operator-serviceaccount containers: + {{- if (.Values.tensorboard.enable) }} + - name: tensorboard + command: ["/bin/sh", "-c"] + args: ["tensorboard --logdir", {{ .Values.tensorboard.path }}] + imagePullPolicy: IfNotPresent + image: guidebooks/tensorboard + env: + - name: TBOARD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{ .Values.tensorboard.secret }} + key: S3_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ .Values.tensorboard.secret }} + key: S3_SECRET_ACCESS_KEY + - name: AWS_REGION + valueFrom: + secretKeyRef: + name: {{ .Values.tensorboard.secret }} + key: S3_REGION + resources: + requests: + cpu: 1 + memory: 2Gi + #ephemeral-storage: 1Gi + limits: + memory: 4Gi + cpu: 2 + ports: + - name: tboard + containerPort: 6006 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: tboard + readinessProbe: + httpGet: + path: / + port: tboard + {{- end }} - name: mlflow-server imagePullPolicy: IfNotPresent image: guidebooks/mlflow @@ -132,4 +179,10 @@ spec: protocol: TCP port: 9080 targetPort: http + {{- if (.Values.tensorboard.enable) }} + - name: tboard + protocol: TCP + port: 6066 + targetPort: tboard + {{- end }} {{- end }} diff --git a/deploy/charts/ray/templates/operator_namespaced.yaml b/deploy/charts/ray/templates/operator_namespaced.yaml index 41786ff47cbd..c29294f6a639 100644 --- a/deploy/charts/ray/templates/operator_namespaced.yaml +++ b/deploy/charts/ray/templates/operator_namespaced.yaml @@ -69,6 +69,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: monitoring + namespace: {{ .Values.operatorNamespace }} spec: replicas: 1 selector: @@ -83,6 +84,53 @@ spec: spec: serviceAccountName: ray-operator-serviceaccount containers: + {{- if (.Values.tensorboard.enable) }} + - name: tensorboard + command: ["/bin/sh", "-c"] + args: ["tensorboard --logdir", {{ .Values.tensorboard.path }}] + imagePullPolicy: IfNotPresent + image: guidebooks/tensorboard + env: + - name: TBOARD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{ .Values.tensorboard.secret }} + key: S3_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ .Values.tensorboard.secret }} + key: S3_SECRET_ACCESS_KEY + - name: AWS_REGION + valueFrom: + secretKeyRef: + name: {{ .Values.tensorboard.secret }} + key: S3_REGION + resources: + requests: + cpu: 1 + memory: 2Gi + #ephemeral-storage: 1Gi + limits: + memory: 4Gi + cpu: 2 + ports: + - name: tboard + containerPort: 6006 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: tboard + readinessProbe: + httpGet: + path: / + port: tboard + {{- end }} - name: mlflow-server imagePullPolicy: IfNotPresent image: guidebooks/mlflow @@ -122,6 +170,7 @@ apiVersion: v1 kind: Service metadata: name: monitoring-service + namespace: {{ .Values.operatorNamespace }} spec: selector: component: mlflow @@ -131,4 +180,10 @@ spec: protocol: TCP port: 9080 targetPort: http -{{- end }} + {{- if (.Values.tensorboard.enable) }} + - name: tboard + protocol: TCP + port: 6066 + targetPort: tboard + {{- end }} +{{- end }} \ No newline at end of file diff --git a/deploy/charts/ray/values.yaml b/deploy/charts/ray/values.yaml index 388dadb9e178..56d0d60cd63d 100644 --- a/deploy/charts/ray/values.yaml +++ b/deploy/charts/ray/values.yaml @@ -109,3 +109,6 @@ operatorNamespace: default # by the operator. In other words, the images specified under the fields `operatorImage` and `image` # should carry matching Ray versions. operatorImage: rayproject/ray:latest + +# non ray related values, specific to monitoring +tensorboard.enable: false \ No newline at end of file