@@ -148,6 +148,10 @@ alert:
148
148
limits :
149
149
memory : 1Gi
150
150
cpu : 1500m
151
+ persistence :
152
+ enabled : false
153
+ # storageClass: "gp3"
154
+ # size: 10Gi
151
155
alertManagerConfig :
152
156
global : {}
153
157
receivers :
@@ -347,7 +351,7 @@ dynamicConfig:
347
351
- name : NodeTFlopsAllocationCritical
348
352
query : |
349
353
SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
350
- FROM tf_node_resources
354
+ FROM tf_node_metrics
351
355
WHERE {{ .Conditions }}
352
356
GROUP BY node, pool
353
357
HAVING tflops_available < {{ .Threshold }}
@@ -362,7 +366,7 @@ dynamicConfig:
362
366
- name : NodeTFlopsAllocationWarning
363
367
query : |
364
368
SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
365
- FROM tf_node_resources
369
+ FROM tf_node_metrics
366
370
WHERE {{ .Conditions }}
367
371
GROUP BY node, pool
368
372
HAVING tflops_available < {{ .Threshold }}
@@ -378,7 +382,7 @@ dynamicConfig:
378
382
- name : PoolTotalTFlopsAllocationCritical
379
383
query : |
380
384
SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
381
- FROM tf_node_resources
385
+ FROM tf_node_metrics
382
386
WHERE {{ .Conditions }}
383
387
GROUP BY pool
384
388
HAVING tflops_available < {{ .Threshold }}
@@ -393,7 +397,7 @@ dynamicConfig:
393
397
- name : PoolTotalTFlopsAllocationWarning
394
398
query : |
395
399
SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
396
- FROM tf_node_resources
400
+ FROM tf_node_metrics
397
401
WHERE {{ .Conditions }}
398
402
GROUP BY pool
399
403
HAVING tflops_available < {{ .Threshold }}
@@ -409,7 +413,7 @@ dynamicConfig:
409
413
- name : NodeVRAMAllocationCritical
410
414
query : |
411
415
SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
412
- FROM tf_node_resources
416
+ FROM tf_node_metrics
413
417
WHERE {{ .Conditions }}
414
418
GROUP BY node, pool
415
419
HAVING vram_available < {{ .Threshold }}
@@ -424,7 +428,7 @@ dynamicConfig:
424
428
- name : NodeVRAMAllocationWarning
425
429
query : |
426
430
SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
427
- FROM tf_node_resources
431
+ FROM tf_node_metrics
428
432
WHERE {{ .Conditions }}
429
433
GROUP BY node, pool
430
434
HAVING vram_available < {{ .Threshold }}
@@ -440,7 +444,7 @@ dynamicConfig:
440
444
- name : PoolVRAMAllocationWarning
441
445
query : |
442
446
SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
443
- FROM tf_node_resources
447
+ FROM tf_node_metrics
444
448
WHERE {{ .Conditions }}
445
449
GROUP BY pool
446
450
HAVING vram_available < {{ .Threshold }}
@@ -456,7 +460,7 @@ dynamicConfig:
456
460
- name : EmptyGPU
457
461
query : |
458
462
SELECT DISTINCT node
459
- FROM tf_node_resources
463
+ FROM tf_node_metrics
460
464
WHERE {{ .Conditions }} AND node NOT IN (
461
465
SELECT DISTINCT node
462
466
FROM tf_worker_usage
0 commit comments