11local utils = import '../lib/utils.libsonnet' ;
22
33{
// kubeOvercommitExpression(resource) returns the PromQL expression string
// shared by the overcommit alerts; callers in this file pass 'cpu' and 'memory'.
//
// `resource` is substituted into both the recording-rule name
// (namespace_<resource>:kube_pod_container_resource_requests:sum) and the
// resource="<resource>" matcher on kube_node_status_allocatable.
//
// When $._config.showMultiCluster is set, every aggregation is grouped by
// %(clusterLabel)s so the result carries one series per cluster; otherwise
// the aggregations are global.
//
// The expression is the `or` of two clauses:
//   * "Non-HA clusters": summed requests minus summed allocatable is > 0, and
//     the number of nodes with role="control-plane" is < 3.
//   * "HA clusters": summed requests minus (summed allocatable minus the largest
//     single node's allocatable) is > 0. The inner `> 0` filter drops the case
//     where allocatable minus the largest node is not positive (a single
//     allocatable node).
//
// The template is formatted with $._config extended by a `resource` field, so
// the %(...)s placeholders resolve against config keys plus `resource`.
4+ local kubeOvercommitExpression(resource) = if $._config.showMultiCluster then
5+ |||
6+ # Non-HA clusters.
7+ (
8+ (
9+ sum by(%(clusterLabel)s) (namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
10+ -
11+ sum by(%(clusterLabel)s) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="%(resource)s"}) > 0
12+ )
13+ and
14+ count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3
15+ )
16+ or
17+ # HA clusters.
18+ (
19+ sum by(%(clusterLabel)s) (namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
20+ -
21+ (
22+ # Skip clusters with only one allocatable node.
23+ (
24+ sum by (%(clusterLabel)s) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="%(resource)s"})
25+ -
26+ max by (%(clusterLabel)s) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="%(resource)s"})
27+ ) > 0
28+ ) > 0
29+ )
30+ ||| % $._config { resource: resource }
// Single-cluster variant: same two clauses, aggregated without a cluster grouping.
31+ else
32+ |||
33+ # Non-HA clusters.
34+ (
35+ (
36+ sum(namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
37+ -
38+ sum(kube_node_status_allocatable{resource="%(resource)s", %(kubeStateMetricsSelector)s}) > 0
39+ )
40+ and
41+ count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3
42+ )
43+ or
44+ # HA clusters.
45+ (
46+ sum(namespace_%(resource)s:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
47+ -
48+ (
49+ # Skip clusters with only one allocatable node.
50+ (
51+ sum(kube_node_status_allocatable{resource="%(resource)s", %(kubeStateMetricsSelector)s})
52+ -
53+ max(kube_node_status_allocatable{resource="%(resource)s", %(kubeStateMetricsSelector)s})
54+ ) > 0
55+ ) > 0
56+ )
57+ ||| % $._config { resource: resource },
58+
459 _config+:: {
560 kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics' ,
661 nodeExporterSelector: error 'must provide selector for node-exporter' ,
@@ -31,43 +86,12 @@ local utils = import '../lib/utils.libsonnet';
3186 },
3287 annotations: {
3388 summary: 'Cluster has overcommitted CPU resource requests.' ,
89+ description: 'Cluster%s has overcommitted CPU resource requests for Pods by {{ printf "%%.2f" $value }} CPU shares and cannot tolerate node failure.' % [
90+ utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config),
91+ ],
3492 },
3593 'for' : '10m' ,
36- } +
37- if $._config.showMultiCluster then {
38- expr: |||
39- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
40- sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) > 0
41- and
42- count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
43- or
44- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
45- (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
46- max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
47- and
48- (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
49- max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0)
50- ||| % $._config,
51- annotations+: {
52- description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ printf "%%.2f" $value }} CPU shares and cannot tolerate node failure.' % $._config,
53- },
54- } else {
55- expr: |||
56- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
57- sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) > 0
58- and
59- count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
60- or
61- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
62- (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
63- max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
64- and
65- (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
66- max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0)
67- ||| % $._config,
68- annotations+: {
69- description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
70- },
94+ expr: kubeOvercommitExpression('cpu' ),
7195 },
7296 {
7397 alert: 'KubeMemoryOvercommit' ,
@@ -76,43 +100,12 @@ local utils = import '../lib/utils.libsonnet';
76100 },
77101 annotations: {
78102 summary: 'Cluster has overcommitted memory resource requests.' ,
103+ description: 'Cluster%s has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % [
104+ utils.ifShowMultiCluster($._config, ' {{ $labels.%(clusterLabel)s }}' % $._config),
105+ ],
79106 },
80107 'for' : '10m' ,
81- } +
82- if $._config.showMultiCluster then {
83- expr: |||
84- (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
85- sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) > 0
86- and
87- count by (%(clusterLabel)s) (max by (%(clusterLabel)s, node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
88- or
89- (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
90- (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
91- max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
92- and
93- (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
94- max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0)
95- ||| % $._config,
96- annotations+: {
97- description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
98- },
99- } else {
100- expr: |||
101- (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
102- sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) > 0
103- and
104- count(max by (node) (kube_node_role{%(kubeStateMetricsSelector)s, role="control-plane"})) < 3)
105- or
106- (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
107- (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
108- max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
109- and
110- (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
111- max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0)
112- ||| % $._config,
113- annotations+: {
114- description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' ,
115- },
108+ expr: kubeOvercommitExpression('memory' ),
116109 },
117110 {
118111 alert: 'KubeCPUQuotaOvercommit' ,
0 commit comments