NexusGPU
diff --git a/.gitignore
Lines changed: 2 additions & 0 deletions b/.gitignore
Lines changed: 2 additions & 0 deletions
diff --git a/.vscode/launch.json
Lines changed: 1 addition & 0 deletions b/.vscode/launch.json
Lines changed: 1 addition & 0 deletions
diff --git a/.vscode/settings.json
Lines changed: 1 addition & 0 deletions b/.vscode/settings.json
Lines changed: 1 addition & 0 deletions
diff --git a/api/v1/gpupool_types.go
Lines changed: 0 additions & 8 deletions b/api/v1/gpupool_types.go
Lines changed: 0 additions & 8 deletions
diff --git a/charts/tensor-fusion/Chart.yaml
Lines changed: 1 addition & 1 deletion b/charts/tensor-fusion/Chart.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml
Lines changed: 0 additions & 8 deletions b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml
Lines changed: 0 additions & 8 deletions
diff --git a/charts/tensor-fusion/templates/alert-manager.yaml
Lines changed: 61 additions & 3 deletions b/charts/tensor-fusion/templates/alert-manager.yaml
Lines changed: 61 additions & 3 deletions
diff --git a/charts/tensor-fusion/values-production.yaml
Lines changed: 4 additions & 1 deletion b/charts/tensor-fusion/values-production.yaml
Lines changed: 4 additions & 1 deletion
diff --git a/charts/tensor-fusion/values.schema.json
Lines changed: 21 additions & 0 deletions b/charts/tensor-fusion/values.schema.json
Lines changed: 21 additions & 0 deletions
diff --git a/charts/tensor-fusion/values.yaml
Lines changed: 12 additions & 8 deletions b/charts/tensor-fusion/values.yaml
Lines changed: 12 additions & 8 deletions
@@ -39,3 +39,5 @@ __debug*
 
 vendor
 logs
+
+*.prof
@@ -68,6 +68,7 @@
                 "--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
                 "--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
                 "--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
+                "--enable-alert",
                 "-v", "4"
             ],
             "program": "${workspaceFolder}/cmd/main.go",
 
@@ -22,6 +22,7 @@
         "certificaterequests",
         "certmanager",
         "clientcmd",
+        "clientcmdapi",
         "clientgoscheme",
         "clientset",
         "cloudnative",
 
@@ -389,14 +389,6 @@ type GPUPoolStatus struct {
 	// when the progress is 100, the component version or config is fully updated.
 	ComponentStatus PoolComponentStatus `json:"componentStatus"`
 
-	// TODO: calculated every 1h/1d/1w average
-	UtilizedTFlopsPercent string `json:"utilizedTFlopsPercent,omitempty"`
-	UtilizedVRAMPercent   string `json:"utilizedVRAMPercent,omitempty"`
-
-	// TODO: updated with interval
-	AllocatedTFlopsPercent string `json:"allocatedTFlopsPercent,omitempty"`
-	AllocatedVRAMPercent   string `json:"allocatedVRAMPercent,omitempty"`
-
 	// TODO: aggregated with interval
 	SavedCostsPerMonth       string `json:"savedCostsPerMonth,omitempty"`
 	PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`
 
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.5.4
+version: 1.5.5
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 
@@ -601,10 +601,6 @@ spec:
           status:
             description: GPUPoolStatus defines the observed state of GPUPool.
             properties:
-              allocatedTFlopsPercent:
-                type: string
-              allocatedVRAMPercent:
-                type: string
               availableTFlops:
                 anyOf:
                 - type: integer
@@ -760,10 +756,6 @@ spec:
                 - type: string
                 pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                 x-kubernetes-int-or-string: true
-              utilizedTFlopsPercent:
-                type: string
-              utilizedVRAMPercent:
-                type: string
               virtualAvailableTFlops:
                 anyOf:
                 - type: integer
 
@@ -32,25 +32,52 @@ spec:
         {{- include "tensor-fusion.labels" . | nindent 8 }}
     spec:
       enableServiceLinks: false
+      {{- if gt (.Values.alert.replicaCount | int) 1 }}
+      affinity:
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                - key: tensor-fusion.ai/component
+                  operator: In
+                  values:
+                  - alert-manager
+              topologyKey: kubernetes.io/hostname
+      {{- end }}
       volumes:
         - name: config
           configMap:
             name: {{ .Release.Name }}-alert-manager-config
             defaultMode: 420
+        {{- if not .Values.alert.persistence.enabled }}
         - name: storage
-          hostPath:
-            path: /data/alertmanager
-            type: DirectoryOrCreate
+          emptyDir: {}
+        {{- end }}
       containers:
         - name: alertmanager
           image: "{{ .Values.alert.image.repository }}:{{ .Values.alert.image.tag }}"
           args:
             - '--storage.path=/alertmanager'
             - '--config.file=/etc/alertmanager/alertmanager.yml'
+            - '--web.listen-address=0.0.0.0:9093'
+            {{- if gt (.Values.alert.replicaCount | int) 1 }}
+            - '--cluster.listen-address=0.0.0.0:9094'
+            - '--cluster.advertise-address=$(POD_IP):9094'
+            {{- range $i := until (.Values.alert.replicaCount | int) }}
+            - '--cluster.peer={{ $.Release.Name }}-alert-manager-{{ $i }}.alert-manager-headless.{{ include "tensor-fusion.namespace" $ }}.svc.cluster.local:9094'
+            {{- end }}
+            {{- end }}
           ports:
             - name: http
               containerPort: 9093
               protocol: TCP
+            {{- if gt (.Values.alert.replicaCount | int) 1 }}
+            - name: gossip
+              containerPort: 9094
+              protocol: TCP
+            {{- end }}
           env:
             - name: POD_IP
               valueFrom:
@@ -62,8 +89,13 @@ spec:
           volumeMounts:
             - name: config
               mountPath: /etc/alertmanager
+            {{- if .Values.alert.persistence.enabled }}
+            - name: alertmanager-storage
+              mountPath: /alertmanager
+            {{- else }}
             - name: storage
               mountPath: /alertmanager
+            {{- end }}
           livenessProbe:
             httpGet:
               path: /
@@ -85,6 +117,20 @@ spec:
       restartPolicy: Always
       serviceAccountName: alert-manager
   serviceName: alert-manager-headless
+  {{- if .Values.alert.persistence.enabled }}
+  volumeClaimTemplates:
+  - metadata:
+      name: alertmanager-storage
+    spec:
+      accessModes:
+        - ReadWriteOnce
+      {{- if .Values.alert.persistence.storageClass }}
+      storageClassName: {{ .Values.alert.persistence.storageClass }}
+      {{- end }}
+      resources:
+        requests:
+          storage: {{ .Values.alert.persistence.size }}
+  {{- end }}
   updateStrategy:
     type: RollingUpdate
     rollingUpdate:
@@ -105,6 +151,12 @@ spec:
       protocol: TCP
       port: 9093
       targetPort: http
+    {{- if gt (.Values.alert.replicaCount | int) 1 }}
+    - name: gossip
+      protocol: TCP
+      port: 9094
+      targetPort: gossip
+    {{- end }}
   selector:
     tensor-fusion.ai/component: alert-manager
   type: ClusterIP
@@ -125,6 +177,12 @@ spec:
       protocol: TCP
       port: 9093
       targetPort: http
+    {{- if gt (.Values.alert.replicaCount | int) 1 }}
+    - name: gossip
+      protocol: TCP
+      port: 9094
+      targetPort: gossip
+    {{- end }}
   selector:
     tensor-fusion.ai/component: alert-manager
   clusterIP: None
 
@@ -29,4 +29,7 @@ alert:
       cpu: 200m
     limits:
       memory: 1Gi
-      cpu: 2000m
+      cpu: 2000m
+  persistence:
+    enabled: true
+    size: 5Gi
@@ -430,6 +430,27 @@
             }
           }
         },
+        "persistence": {
+          "type": "object",
+          "description": "Persistent storage configuration for alerting",
+          "properties": {
+            "enabled": {
+              "type": "boolean",
+              "description": "Enable persistent storage for alerting",
+              "default": false
+            },
+            "storageClass": {
+              "type": "string",
+              "description": "Storage class for persistent storage",
+              "default": ""
+            },
+            "size": {
+              "type": "string",
+              "description": "Size of persistent storage",
+              "default": ""
+            }
+          }
+        },
         "alertManagerConfig": {
           "type": "object",
           "description": "Alertmanager configuration"
 
@@ -148,6 +148,10 @@ alert:
     limits:
       memory: 1Gi
       cpu: 1500m
+  persistence:
+    enabled: false
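+    # storageClass: "gp3"  # optional; storageClassName is omitted from the volume claim when unset
+    # size: 10Gi           # must be set when enabled is true; used as the volume claim's storage request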
   alertManagerConfig:
     global: {}
     receivers:
@@ -347,7 +351,7 @@ dynamicConfig:
     - name: NodeTFlopsAllocationCritical
       query: | 
         SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY node, pool
         HAVING tflops_available < {{ .Threshold }}
@@ -362,7 +366,7 @@ dynamicConfig:
     - name: NodeTFlopsAllocationWarning
       query: | 
         SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY node, pool
         HAVING tflops_available < {{ .Threshold }}
@@ -378,7 +382,7 @@ dynamicConfig:
     - name: PoolTotalTFlopsAllocationCritical
       query: |
         SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY pool
         HAVING tflops_available < {{ .Threshold }}
@@ -393,7 +397,7 @@ dynamicConfig:
     - name: PoolTotalTFlopsAllocationWarning
       query: |
         SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY pool
         HAVING tflops_available < {{ .Threshold }}
@@ -409,7 +413,7 @@ dynamicConfig:
     - name: NodeVRAMAllocationCritical
       query: |
         SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY node, pool
         HAVING vram_available < {{ .Threshold }}
@@ -424,7 +428,7 @@ dynamicConfig:
     - name: NodeVRAMAllocationWarning
       query: |
         SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY node, pool
         HAVING vram_available < {{ .Threshold }}
@@ -440,7 +444,7 @@ dynamicConfig:
     - name: PoolVRAMAllocationWarning
       query: |
         SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY pool
         HAVING vram_available < {{ .Threshold }}
@@ -456,7 +460,7 @@ dynamicConfig:
     - name: EmptyGPU
       query: |
         SELECT DISTINCT node 
-        FROM tf_node_resources 
+        FROM tf_node_metrics 
         WHERE {{ .Conditions }} AND node NOT IN (
             SELECT DISTINCT node 
             FROM tf_worker_usage
-Original file line number
+Diff line change
 vendor
 logs
++
 +*.prof