Skip to content

Commit 0d7d8e4

Browse files
authored
feat: monitor & alert issues, large scale benchmark and performance optimization (#323)
* fix: alert manager HA in helm chart
* fix: hypervisor vector cannot resolve greptime DNS
* fix: remove metrics table migration, use dynamic ingestion, fix metrics issues
* fix: potential nil pointer issue in alert
* fix: add pool metrics
* fix: optimize worker suffix
* fix: lint issue
* fix: add webhook and scheduler benchmark testing, support delayed initialization after device plugin
* fix: add large scale benchmark for pod webhook and scheduler, optimize performance
1 parent 9591b78 commit 0d7d8e4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+1665
-208
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,3 +39,5 @@ __debug*
3939

4040
vendor
4141
logs
42+
43+
*.prof

.vscode/launch.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@
6868
"--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
6969
"--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
7070
"--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
71+
"--enable-alert",
7172
"-v", "4"
7273
],
7374
"program": "${workspaceFolder}/cmd/main.go",

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
"certificaterequests",
2323
"certmanager",
2424
"clientcmd",
25+
"clientcmdapi",
2526
"clientgoscheme",
2627
"clientset",
2728
"cloudnative",

api/v1/gpupool_types.go

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -389,14 +389,6 @@ type GPUPoolStatus struct {
389389
// when the progress is 100, the component version or config is fully updated.
390390
ComponentStatus PoolComponentStatus `json:"componentStatus"`
391391

392-
// TODO: calculated every 1h/1d/1w average
393-
UtilizedTFlopsPercent string `json:"utilizedTFlopsPercent,omitempty"`
394-
UtilizedVRAMPercent string `json:"utilizedVRAMPercent,omitempty"`
395-
396-
// TODO: updated with interval
397-
AllocatedTFlopsPercent string `json:"allocatedTFlopsPercent,omitempty"`
398-
AllocatedVRAMPercent string `json:"allocatedVRAMPercent,omitempty"`
399-
400392
// TODO: aggregated with interval
401393
SavedCostsPerMonth string `json:"savedCostsPerMonth,omitempty"`
402394
PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.5.4
18+
version: 1.5.5
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -601,10 +601,6 @@ spec:
601601
status:
602602
description: GPUPoolStatus defines the observed state of GPUPool.
603603
properties:
604-
allocatedTFlopsPercent:
605-
type: string
606-
allocatedVRAMPercent:
607-
type: string
608604
availableTFlops:
609605
anyOf:
610606
- type: integer
@@ -760,10 +756,6 @@ spec:
760756
- type: string
761757
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
762758
x-kubernetes-int-or-string: true
763-
utilizedTFlopsPercent:
764-
type: string
765-
utilizedVRAMPercent:
766-
type: string
767759
virtualAvailableTFlops:
768760
anyOf:
769761
- type: integer

charts/tensor-fusion/templates/alert-manager.yaml

Lines changed: 61 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,25 +32,52 @@ spec:
3232
{{- include "tensor-fusion.labels" . | nindent 8 }}
3333
spec:
3434
enableServiceLinks: false
35+
{{- if gt (.Values.alert.replicaCount | int) 1 }}
36+
affinity:
37+
podAntiAffinity:
38+
preferredDuringSchedulingIgnoredDuringExecution:
39+
- weight: 100
40+
podAffinityTerm:
41+
labelSelector:
42+
matchExpressions:
43+
- key: tensor-fusion.ai/component
44+
operator: In
45+
values:
46+
- alert-manager
47+
topologyKey: kubernetes.io/hostname
48+
{{- end }}
3549
volumes:
3650
- name: config
3751
configMap:
3852
name: {{ .Release.Name }}-alert-manager-config
3953
defaultMode: 420
54+
{{- if not .Values.alert.persistence.enabled }}
4055
- name: storage
41-
hostPath:
42-
path: /data/alertmanager
43-
type: DirectoryOrCreate
56+
emptyDir: {}
57+
{{- end }}
4458
containers:
4559
- name: alertmanager
4660
image: "{{ .Values.alert.image.repository }}:{{ .Values.alert.image.tag }}"
4761
args:
4862
- '--storage.path=/alertmanager'
4963
- '--config.file=/etc/alertmanager/alertmanager.yml'
64+
- '--web.listen-address=0.0.0.0:9093'
65+
{{- if gt (.Values.alert.replicaCount | int) 1 }}
66+
- '--cluster.listen-address=0.0.0.0:9094'
67+
- '--cluster.advertise-address=$(POD_IP):9094'
68+
{{- range $i := until (.Values.alert.replicaCount | int) }}
69+
- '--cluster.peer={{ $.Release.Name }}-alert-manager-{{ $i }}.alert-manager-headless.{{ include "tensor-fusion.namespace" $ }}.svc.cluster.local:9094'
70+
{{- end }}
71+
{{- end }}
5072
ports:
5173
- name: http
5274
containerPort: 9093
5375
protocol: TCP
76+
{{- if gt (.Values.alert.replicaCount | int) 1 }}
77+
- name: gossip
78+
containerPort: 9094
79+
protocol: TCP
80+
{{- end }}
5481
env:
5582
- name: POD_IP
5683
valueFrom:
@@ -62,8 +89,13 @@ spec:
6289
volumeMounts:
6390
- name: config
6491
mountPath: /etc/alertmanager
92+
{{- if .Values.alert.persistence.enabled }}
93+
- name: alertmanager-storage
94+
mountPath: /alertmanager
95+
{{- else }}
6596
- name: storage
6697
mountPath: /alertmanager
98+
{{- end }}
6799
livenessProbe:
68100
httpGet:
69101
path: /
@@ -85,6 +117,20 @@ spec:
85117
restartPolicy: Always
86118
serviceAccountName: alert-manager
87119
serviceName: alert-manager-headless
120+
{{- if .Values.alert.persistence.enabled }}
121+
volumeClaimTemplates:
122+
- metadata:
123+
name: alertmanager-storage
124+
spec:
125+
accessModes:
126+
- ReadWriteOnce
127+
{{- if .Values.alert.persistence.storageClass }}
128+
storageClassName: {{ .Values.alert.persistence.storageClass }}
129+
{{- end }}
130+
resources:
131+
requests:
132+
storage: {{ .Values.alert.persistence.size }}
133+
{{- end }}
88134
updateStrategy:
89135
type: RollingUpdate
90136
rollingUpdate:
@@ -105,6 +151,12 @@ spec:
105151
protocol: TCP
106152
port: 9093
107153
targetPort: http
154+
{{- if gt (.Values.alert.replicaCount | int) 1 }}
155+
- name: gossip
156+
protocol: TCP
157+
port: 9094
158+
targetPort: gossip
159+
{{- end }}
108160
selector:
109161
tensor-fusion.ai/component: alert-manager
110162
type: ClusterIP
@@ -125,6 +177,12 @@ spec:
125177
protocol: TCP
126178
port: 9093
127179
targetPort: http
180+
{{- if gt (.Values.alert.replicaCount | int) 1 }}
181+
- name: gossip
182+
protocol: TCP
183+
port: 9094
184+
targetPort: gossip
185+
{{- end }}
128186
selector:
129187
tensor-fusion.ai/component: alert-manager
130188
clusterIP: None

charts/tensor-fusion/values-production.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,4 +29,7 @@ alert:
2929
cpu: 200m
3030
limits:
3131
memory: 1Gi
32-
cpu: 2000m
32+
cpu: 2000m
33+
persistence:
34+
enabled: true
35+
size: 5Gi

charts/tensor-fusion/values.schema.json

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -430,6 +430,27 @@
430430
}
431431
}
432432
},
433+
"persistence": {
434+
"type": "object",
435+
"description": "Persistent storage configuration for alerting",
436+
"properties": {
437+
"enabled": {
438+
"type": "boolean",
439+
"description": "Enable persistent storage for alerting",
440+
"default": false
441+
},
442+
"storageClass": {
443+
"type": "string",
444+
"description": "Storage class for persistent storage",
445+
"default": ""
446+
},
447+
"size": {
448+
"type": "string",
449+
"description": "Size of persistent storage",
450+
"default": ""
451+
}
452+
}
453+
},
433454
"alertManagerConfig": {
434455
"type": "object",
435456
"description": "Alertmanager configuration"

charts/tensor-fusion/values.yaml

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -148,6 +148,10 @@ alert:
148148
limits:
149149
memory: 1Gi
150150
cpu: 1500m
151+
persistence:
152+
enabled: false
153+
# storageClass: "gp3"
154+
# size: 10Gi
151155
alertManagerConfig:
152156
global: {}
153157
receivers:
@@ -347,7 +351,7 @@ dynamicConfig:
347351
- name: NodeTFlopsAllocationCritical
348352
query: |
349353
SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
350-
FROM tf_node_resources
354+
FROM tf_node_metrics
351355
WHERE {{ .Conditions }}
352356
GROUP BY node, pool
353357
HAVING tflops_available < {{ .Threshold }}
@@ -362,7 +366,7 @@ dynamicConfig:
362366
- name: NodeTFlopsAllocationWarning
363367
query: |
364368
SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
365-
FROM tf_node_resources
369+
FROM tf_node_metrics
366370
WHERE {{ .Conditions }}
367371
GROUP BY node, pool
368372
HAVING tflops_available < {{ .Threshold }}
@@ -378,7 +382,7 @@ dynamicConfig:
378382
- name: PoolTotalTFlopsAllocationCritical
379383
query: |
380384
SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
381-
FROM tf_node_resources
385+
FROM tf_node_metrics
382386
WHERE {{ .Conditions }}
383387
GROUP BY pool
384388
HAVING tflops_available < {{ .Threshold }}
@@ -393,7 +397,7 @@ dynamicConfig:
393397
- name: PoolTotalTFlopsAllocationWarning
394398
query: |
395399
SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
396-
FROM tf_node_resources
400+
FROM tf_node_metrics
397401
WHERE {{ .Conditions }}
398402
GROUP BY pool
399403
HAVING tflops_available < {{ .Threshold }}
@@ -409,7 +413,7 @@ dynamicConfig:
409413
- name: NodeVRAMAllocationCritical
410414
query: |
411415
SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
412-
FROM tf_node_resources
416+
FROM tf_node_metrics
413417
WHERE {{ .Conditions }}
414418
GROUP BY node, pool
415419
HAVING vram_available < {{ .Threshold }}
@@ -424,7 +428,7 @@ dynamicConfig:
424428
- name: NodeVRAMAllocationWarning
425429
query: |
426430
SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
427-
FROM tf_node_resources
431+
FROM tf_node_metrics
428432
WHERE {{ .Conditions }}
429433
GROUP BY node, pool
430434
HAVING vram_available < {{ .Threshold }}
@@ -440,7 +444,7 @@ dynamicConfig:
440444
- name: PoolVRAMAllocationWarning
441445
query: |
442446
SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
443-
FROM tf_node_resources
447+
FROM tf_node_metrics
444448
WHERE {{ .Conditions }}
445449
GROUP BY pool
446450
HAVING vram_available < {{ .Threshold }}
@@ -456,7 +460,7 @@ dynamicConfig:
456460
- name: EmptyGPU
457461
query: |
458462
SELECT DISTINCT node
459-
FROM tf_node_resources
463+
FROM tf_node_metrics
460464
WHERE {{ .Conditions }} AND node NOT IN (
461465
SELECT DISTINCT node
462466
FROM tf_worker_usage

0 commit comments

Comments
 (0)