From ac0748a092378c0c225eb2460bdbd96a50dda3f7 Mon Sep 17 00:00:00 2001
From: Pablo
Date: Wed, 25 Apr 2018 11:50:21 +0200
Subject: [PATCH 1/8] Add new alerting rules

---
Review notes (ignored by git am):
 - redis-status.rule: MemoryFragmentationLow now tests "< 0.9" so the
   expression agrees with its "too low / under 0.9" annotations.
 - http-responses-500.rule: a bare expression is not a valid rule
   statement; it is now a recording rule so the *.rule glob can load it.

 roles/prometheus/files/cpu-usage.rule         | 10 ++++
 roles/prometheus/files/es-status.rule         | 17 +++++++
 roles/prometheus/files/filesystem-usage.rule  |  8 ++++
 .../prometheus/files/http-responses-500.rule  |  1 +
 .../files/{rule1 => instance-down.rule}       | 12 +----
 roles/prometheus/files/load-average.rule      | 10 ++++
 roles/prometheus/files/memory-usage.rule      | 17 +++++++
 roles/prometheus/files/redis-status.rule      | 47 +++++++++++++++++++
 8 files changed, 111 insertions(+), 11 deletions(-)
 create mode 100644 roles/prometheus/files/cpu-usage.rule
 create mode 100644 roles/prometheus/files/es-status.rule
 create mode 100644 roles/prometheus/files/filesystem-usage.rule
 create mode 100644 roles/prometheus/files/http-responses-500.rule
 rename roles/prometheus/files/{rule1 => instance-down.rule} (51%)
 create mode 100644 roles/prometheus/files/load-average.rule
 create mode 100644 roles/prometheus/files/memory-usage.rule
 create mode 100644 roles/prometheus/files/redis-status.rule

diff --git a/roles/prometheus/files/cpu-usage.rule b/roles/prometheus/files/cpu-usage.rule
new file mode 100644
index 0000000..8a57d80
--- /dev/null
+++ b/roles/prometheus/files/cpu-usage.rule
@@ -0,0 +1,10 @@
+ALERT NodeCPUUsage
+  IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75
+  FOR 2m
+  LABELS {
+    severity="critical"
+  }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: High CPU usage detected",
+    DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
+}
\ No newline at end of file
diff --git a/roles/prometheus/files/es-status.rule b/roles/prometheus/files/es-status.rule
new file mode 100644
index 0000000..5a1b6d6
--- /dev/null
+++ b/roles/prometheus/files/es-status.rule
@@ -0,0 +1,17 @@
+ALERT EsStatusYellow
+  IF elasticsearch_cluster_health_status{color="yellow"}
+  FOR 5m
+  LABELS { severity ="warning" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: ES cluster status yellow",
+    DESCRIPTION = "{{$labels.instance}}: Elasticsearch cluster has been in yellow state for more than 5 minutes)"
+}
+
+ALERT EsStatusRed
+  IF elasticsearch_cluster_health_status{color="red"}
+  FOR 5m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: ES cluster status red",
+    DESCRIPTION = "{{$labels.instance}}: Elasticsearch cluster has been in red state for more than 5 minutes)"
+}
\ No newline at end of file
diff --git a/roles/prometheus/files/filesystem-usage.rule b/roles/prometheus/files/filesystem-usage.rule
new file mode 100644
index 0000000..b184eac
--- /dev/null
+++ b/roles/prometheus/files/filesystem-usage.rule
@@ -0,0 +1,8 @@
+ALERT FilesystemFull
+  IF node_filesystem_free / node_filesystem_size < 0.3
+  FOR 5m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "Filesystem {{ $labels.instance }} full",
+    description = "{{ $labels.instance }} of job {{ $labels.job }} free space less than 30%.",
+  }
\ No newline at end of file
diff --git a/roles/prometheus/files/http-responses-500.rule b/roles/prometheus/files/http-responses-500.rule
new file mode 100644
index 0000000..ed5538b
--- /dev/null
+++ b/roles/prometheus/files/http-responses-500.rule
@@ -0,0 +1 @@
+nginx:http_responses_5xx:rate5m = rate(nginx_http_requests_total{status=~"5[0-9][0-9]",host!="127.0.0.1"}[5m])
\ No newline at end of file
diff --git a/roles/prometheus/files/rule1 b/roles/prometheus/files/instance-down.rule
similarity index 51%
rename from roles/prometheus/files/rule1
rename to roles/prometheus/files/instance-down.rule
index c4ab792..99c30a9 100644
--- a/roles/prometheus/files/rule1
+++ b/roles/prometheus/files/instance-down.rule
@@ -6,14 +6,4 @@ ALERT InstanceDown
   ANNOTATIONS {
     summary = "Instance {{ $labels.instance }} down",
     description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
-  }
-
-
-ALERT FilesystemFull
-  IF node_filesystem_free / node_filesystem_size < 0.3
-  FOR 5m
-  LABELS { severity = "critical" }
-  ANNOTATIONS {
-    summary = "Filesystem {{ $labels.instance }} full",
-    description = "{{ $labels.instance }} of job {{ $labels.job }} free space less than 30%.",
-  }
+  }
\ No newline at end of file
diff --git a/roles/prometheus/files/load-average.rule b/roles/prometheus/files/load-average.rule
new file mode 100644
index 0000000..a710699
--- /dev/null
+++ b/roles/prometheus/files/load-average.rule
@@ -0,0 +1,10 @@
+ALERT NodeLoadAverage
+  IF ((node_load5 / count without (cpu, mode) (node_cpu{mode="system"})) > 1)
+  FOR 2m
+  LABELS {
+    severity="page"
+  }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: High LA detected",
+    DESCRIPTION = "{{$labels.instance}}: LA is high"
+}
\ No newline at end of file
diff --git a/roles/prometheus/files/memory-usage.rule b/roles/prometheus/files/memory-usage.rule
new file mode 100644
index 0000000..7cc88af
--- /dev/null
+++ b/roles/prometheus/files/memory-usage.rule
@@ -0,0 +1,17 @@
+ALERT NodeMemoryUsage
+  IF (((node_memory_MemTotal-node_memory_MemFree-node_memory_Cached)/(node_memory_MemTotal)*100)) > 75
+  FOR 5m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: High memory usage detected",
+    DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
+}
+
+ALERT NodeSwapUsage
+  IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
+  FOR 5m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: Swap usage detected",
+    DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})"
+}
\ No newline at end of file
diff --git a/roles/prometheus/files/redis-status.rule b/roles/prometheus/files/redis-status.rule
new file mode 100644
index 0000000..1102648
--- /dev/null
+++ b/roles/prometheus/files/redis-status.rule
@@ -0,0 +1,47 @@
+ALERT RedisHighMissRatio
+  IF (((rate(redis_keyspace_misses_total[5m])) / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))) > 0.5)
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis instance miss ratio high.",
+    description = "{{ $labels.instance }}: Redis instance miss ratio is over 50%.",
+  }
+
+ALERT MemoryFragmentationHigh
+  IF (redis_memory_fragmentation_ratio > 1.5)
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis memory fragmentation too high.",
+    description = "{{ $labels.instance }}: Redis instance memory fragmentation ratio over 1.5.",
+  }
+
+ALERT MemoryFragmentationLow
+  IF (redis_memory_fragmentation_ratio < 0.9)
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis memory fragmentation too low.",
+    description = "{{ $labels.instance }}: Redis instance memory fragmentation ratio under 0.9.",
+  }
+
+ALERT KeyEvictions
+  IF (rate(redis_evicted_keys_total[5m]) >= 1)
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis instance evicting keys.",
+    description = "{{ $labels.instance }}: Redis instance has been consistently evicting keys for 5 minutes.",
+  }
+
+ALERT TotalMemoryUsed
+  IF (redis_memory_used_bytes / (0.95 *node_memory_MemTotal{instance="cache1.local:9100"})) > 0.8
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis is using too much memory.",
+    description = "{{ $labels.instance }}: Redis instance is using more than 80% of its available memory.",
+  }
+
+ALERT MaxClients
+  IF redis_connected_clients > 80000
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    summary = "{{ $labels.instance }}: Redis is at 80% of client capacity.",
+    description = "{{ $labels.instance }}: Redis is currently handling more than 80000 clients",
+  }
\ No newline at end of file
From 8d79a7fedc36a35e55e4aeff72e9c955e17cde4c Mon Sep 17 00:00:00 2001
From: Pablo
Date: Wed, 25 Apr 2018 11:50:51 +0200
Subject: [PATCH 2/8] Fix mailing config

---
 roles/alertmanager/templates/alertmanager.yml.j2 | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/roles/alertmanager/templates/alertmanager.yml.j2 b/roles/alertmanager/templates/alertmanager.yml.j2
index cc88d3b..0576bca 100644
--- a/roles/alertmanager/templates/alertmanager.yml.j2
+++ b/roles/alertmanager/templates/alertmanager.yml.j2
@@ -1,6 +1,15 @@
 global:
-  smtp_from: 'alert@caliopen.org'
-  smtp_smarthost: 'localhost:25'
+  smtp_from: 'ops@caliopen.org'
+  smtp_smarthost: 'mail.gandi.net:587'
+  smtp_auth_username: 'ops@caliopen.org'
+  smtp_auth_password: !vault |
+          $ANSIBLE_VAULT;1.1;AES256
+          39323731323662373133663263653334643766373562663238653661333963323362336238383034
+          3032613435376263643130636438353339323465613763640a303561633162356361333136386664
+          61316631333162386430343935373132393437656234636331613230663362373932356465323865
+          3636373534363331660a666165656362316335376464376565323239653031353739623831306537
+          6637
+  smtp_require_tls: true
 
 route:
   group_by: ['alertname', 'service']
@@ -16,4 +25,4 @@ route:
 receivers:
 - name: 'team-ops'
   email_configs:
-  - to: 'ops@caliopen.org'
+  - to: 'alert@caliopen.org'
From 7cd7b462215ff60c3d705c5939e9448ba9b6be90 Mon Sep 17 00:00:00 2001
From: Pablo
Date: Wed, 25 Apr 2018 11:51:50 +0200
Subject: [PATCH 3/8] Fix alertmanager installation path

---
 roles/alertmanager/tasks/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/roles/alertmanager/tasks/main.yml b/roles/alertmanager/tasks/main.yml
index da09e2d..1e13679 100644
--- a/roles/alertmanager/tasks/main.yml
+++ b/roles/alertmanager/tasks/main.yml
@@ -1,7 +1,7 @@
 - name: install alertmanager
   copy:
     src: "{{ dist_directory }}/ext/alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager"
-    dest: /usr/local/sbin/alertmanager
+    dest: /usr/local/alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager
     mode: 0711
 
 - name: install service for alertmanager
From 5eff404a93bac996188eb6b4737670b6fb1f8972 Mon Sep 17 00:00:00 2001
From: Pablo
Date: Wed, 25 Apr 2018 12:06:47 +0200
Subject: [PATCH 4/8] Fix prometheus to use all .rule files

---
Review notes (ignored by git am):
 - tasks/main.yml: "upload rule files" is now a valid task. copy is a
   mapping with "key: value" arguments, and the glob is quoted because a
   bare *.rule is parsed by YAML as an alias reference.

 roles/prometheus/tasks/main.yml              | 11 ++++++++++-
 roles/prometheus/templates/prometheus.yml.j2 |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 25b4636..d6e9c4b 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -39,8 +39,17 @@
     path: /etc/prometheus
     state: directory
 
+- name: create alerting directory
+  file:
+    path: /etc/prometheus/alerting-rules
+    state: directory
+
 - name: upload rule files
-  copy: src=rule1 dest=/etc/prometheus/rule1
+  copy:
+    src: "{{ item }}"
+    dest: /etc/prometheus/alerting-rules/
+  with_fileglob:
+    - "*.rule"
 
 - name: configure prometheus
   template: src=prometheus.yml.j2 dest=/etc/prometheus/prometheus.yml
diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
index 031d581..186c2ee 100644
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -44,4 +44,4 @@ scrape_configs:
       - targets: [{% for host in groups['all'] %}'{{ host }}.local:9100',{% endfor %}]
 
 rule_files:
-  - /etc/prometheus/rule*
+  - /etc/prometheus/*.rule
From 893e853a051d886be392da537de28428fc79fd37 Mon Sep 17 00:00:00 2001
From: Pablo
Date: Wed, 25 Apr 2018 12:12:21 +0200
Subject: [PATCH 5/8] Fix prometheus rule files path

---
 roles/prometheus/templates/prometheus.yml.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
index 186c2ee..2384f2b 100644
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -44,4 +44,4 @@ scrape_configs:
       - targets: [{% for host in groups['all'] %}'{{ host }}.local:9100',{% endfor %}]
 
 rule_files:
-  - /etc/prometheus/*.rule
+  - /etc/prometheus/alerting-rules/*.rule
From 8b8b7afb48655fdf25acd7bfa9c46261f8437c17 Mon Sep 17 00:00:00 2001
From: Pablo
Date: Wed, 25 Apr 2018 14:56:05 +0200
Subject: [PATCH 6/8] Fix nats exporter

---
Review notes (ignored by git am):
 - prometheus-nats-exporter.service.j2: ExecStop now signals the unit's
   main process through an absolute-path command. The previous
   "pkill <path>" used a relative executable and, without -f, matched
   the pattern against process names, which never contain a path.

 roles/nats/tasks/main.yml                                | 6 ++++++
 roles/nats/templates/prometheus-nats-exporter.service.j2 | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/roles/nats/tasks/main.yml b/roles/nats/tasks/main.yml
index 1edcebe..da8892a 100644
--- a/roles/nats/tasks/main.yml
+++ b/roles/nats/tasks/main.yml
@@ -10,6 +10,12 @@
 - name: start nats
   service: name=gnatsd state=started enabled=yes
 
+- name: install prometheus-nats-exporter
+  copy:
+    src: "{{ dist_directory }}/ext/prometheus-nats-exporter"
+    dest: /usr/local/sbin/prometheus-nats-exporter
+    mode: 0711
+
 - name: install prometheus-nats-exporter service
   template: src=prometheus-nats-exporter.service.j2 dest=/etc/systemd/system/prometheus-nats-exporter.service
 
diff --git a/roles/nats/templates/prometheus-nats-exporter.service.j2 b/roles/nats/templates/prometheus-nats-exporter.service.j2
index d16c4ad..73ef6c6 100644
--- a/roles/nats/templates/prometheus-nats-exporter.service.j2
+++ b/roles/nats/templates/prometheus-nats-exporter.service.j2
@@ -3,8 +3,8 @@ Description=Nats prometheus exporter
 
 [Service]
 Restart=always
-ExecStart=/var/tmp/prometheus-nats-exporter -connz -routez -subz -a {{ facter_ipaddress_eth1 }} http://{{ facter_ipaddress_eth1}}:8222
-ExecStop=pkill prometheus-nats-exporter
+ExecStart=/usr/local/sbin/prometheus-nats-exporter -connz -routez -subz -varz -a {{ facter_ipaddress_eth1 }} http://{{ facter_ipaddress_eth1}}:8222
+ExecStop=/bin/kill -s TERM $MAINPID
 
 [Install]
 WantedBy=local.target
From fe37e5420849e34cf777a6592c7ada985a240415 Mon Sep 17 00:00:00 2001
From: Pablo
Date: Fri, 27 Apr 2018 16:02:59 +0200
Subject: [PATCH 7/8] Fix jmx exporter version, 0.3.0 works fine

---
 external_version.yaml                      | 2 +-
 hosts.template                             | 2 +-
 roles/cassandra/templates/cassandra-env.sh | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/external_version.yaml b/external_version.yaml
index 20f9baf..6d31d34 100644
--- a/external_version.yaml
+++ b/external_version.yaml
@@ -12,5 +12,5 @@ gnats_version: "1.0.2"
 
 # prometheus exporters
 redis_exporter_version: "0.12.2"
-jmx_prometheus_javaagent_version: "0.1.0"
+jmx_prometheus_javaagent_version: "0.3.0"
 node_exporter_version: "0.14.0"
diff --git a/hosts.template b/hosts.template
index e1fa806..d1788b5 100644
--- a/hosts.template
+++ b/hosts.template
@@ -23,7 +23,7 @@ gnats_version=1.0.2
 
 # prometheus exporters
 redis_exporter_version=0.12.2
-jmx_prometheus_javaagent_version=0.1.0
+jmx_prometheus_javaagent_version=0.3.0
 node_exporter_version=0.14.0
 
 [monitoring:children]
diff --git a/roles/cassandra/templates/cassandra-env.sh b/roles/cassandra/templates/cassandra-env.sh
index a9d18e2..06e66e5 100644
--- a/roles/cassandra/templates/cassandra-env.sh
+++ b/roles/cassandra/templates/cassandra-env.sh
@@ -312,4 +312,4 @@ JVM_OPTS="$JVM_OPTS $JVM_EXTRA_OPTS"
 
 # Add JMX prometheus exporter
 JMX_EXPORTER_DIR="/etc/cassandra"
-# JVM_OPTS="$JVM_OPTS -javaagent:$JMX_EXPORTER_DIR/jmx_prometheus_javaagent-{{ jmx_prometheus_javaagent_version }}.jar=7070:$JMX_EXPORTER_DIR/cassandra_exporter.yaml"
+JVM_OPTS="$JVM_OPTS -javaagent:$JMX_EXPORTER_DIR/jmx_prometheus_javaagent-{{ jmx_prometheus_javaagent_version }}.jar=7070:$JMX_EXPORTER_DIR/cassandra_exporter.yaml"
From 8033cd75d04518c17bfc0e3eac73b5e656c98cab Mon Sep 17 00:00:00 2001
From: Pablo
Date: Tue, 3 Jul 2018 12:51:04 +0200
Subject: [PATCH 8/8] Add more rules

---
Review notes (ignored by git am):
 - redis-status.rule: the two sides of the TotalMemoryUsed division
   carry different instance labels (:9121 vs :9100), so one-to-one
   vector matching produced no samples. The node total is now wrapped
   in scalar(), which keeps the Redis series labels on the result.
 - cassandra-status.rule is still a placeholder (name, expression and
   the Elasticsearch annotations); TODO replace with a real rule.

 roles/prometheus/files/cassandra-status.rule | 9 +++++++++
 roles/prometheus/files/cpu-usage.rule        | 2 +-
 roles/prometheus/files/redis-status.rule     | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)
 create mode 100644 roles/prometheus/files/cassandra-status.rule

diff --git a/roles/prometheus/files/cassandra-status.rule b/roles/prometheus/files/cassandra-status.rule
new file mode 100644
index 0000000..c1a9212
--- /dev/null
+++ b/roles/prometheus/files/cassandra-status.rule
@@ -0,0 +1,9 @@
+ALERT xxxxxxxxxxxxx
+  IF xxxxxxxxxxxxxxxxxx
+  FOR 5m
+  LABELS { severity ="critical" }
+  ANNOTATIONS {
+    SUMMARY = "{{$labels.instance}}: ES cluster status yellow",
+    DESCRIPTION = "{{$labels.instance}}: Elasticsearch cluster has been in yellow state for more than 5 minutes)"
+}
+
diff --git a/roles/prometheus/files/cpu-usage.rule b/roles/prometheus/files/cpu-usage.rule
index 8a57d80..b7ec936 100644
--- a/roles/prometheus/files/cpu-usage.rule
+++ b/roles/prometheus/files/cpu-usage.rule
@@ -1,5 +1,5 @@
 ALERT NodeCPUUsage
-  IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75
+  IF (100 - (avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100)) > 75
   FOR 2m
   LABELS {
     severity="critical"
diff --git a/roles/prometheus/files/redis-status.rule b/roles/prometheus/files/redis-status.rule
index 1102648..4066766 100644
--- a/roles/prometheus/files/redis-status.rule
+++ b/roles/prometheus/files/redis-status.rule
@@ -31,7 +31,7 @@ ALERT KeyEvictions
   }
 
 ALERT TotalMemoryUsed
-  IF (redis_memory_used_bytes / (0.95 *node_memory_MemTotal{instance="cache1.local:9100"})) > 0.8
+  IF (redis_memory_used_bytes{instance="cache1.local:9121"} / scalar(node_memory_MemTotal{instance="cache1.local:9100"})) > 0.8
   LABELS { severity = "critical" }
   ANNOTATIONS {
     summary = "{{ $labels.instance }}: Redis is using too much memory.",