From 6e6534f51ebb5b1ddb5264cbc9c0c0ba03f0a352 Mon Sep 17 00:00:00 2001
From: Amir Zarrinkafsh
Date: Sun, 12 Aug 2018 14:37:35 +1000
Subject: [PATCH] Update dashboards, alerts and README to reflect changes in
 node-exporter v0.16.0

---
 README.md                                 | 10 ++---
 grafana/dashboards/docker_containers.json | 14 +++----
 grafana/dashboards/docker_host.json       | 50 +++++++++++------------
 grafana/dashboards/monitor_services.json  |  2 +-
 grafana/dashboards/nginx_container.json   |  2 +-
 prometheus/alert.rules                    |  6 +--
 6 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index b5ad6d4..6b41de5 100644
--- a/README.md
+++ b/README.md
@@ -60,13 +60,13 @@ The Docker Host Dashboard shows key metrics for monitoring the resource usage of
 For storage and particularly Free Storage graph, you have to specify the fstype in grafana graph request.
 You can find it in `grafana/dashboards/docker_host.json`, at line 480 :
 
-    "expr": "sum(node_filesystem_free{fstype=\"btrfs\"})",
+    "expr": "sum(node_filesystem_free_bytes{fstype=\"btrfs\"})",
 
 I work on BTRFS, so i need to change `aufs` to `btrfs`.
 
 You can find right value for your system in Prometheus `http://:9090` launching this request :
 
-    node_filesystem_free
+    node_filesystem_free_bytes
 
 ***Docker Containers Dashboard***
 
@@ -161,7 +161,7 @@ Trigger an alert if the Docker host memory is almost full:
 
 ```yaml
 ALERT high_memory_load
-  IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85
+  IF (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
   FOR 30s
   LABELS { severity = "warning" }
   ANNOTATIONS {
@@ -174,7 +174,7 @@ Trigger an alert if the Docker host storage is almost full:
 
 ```yaml
 ALERT hight_storage_load
-  IF (node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} * 100 > 85
+  IF (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
   FOR 30s
   LABELS { severity = "warning" }
   ANNOTATIONS {
@@ -202,7 +202,7 @@ Trigger an alert if a container is using more than 10% of total CPU cores for mo
 
 ```yaml
 ALERT jenkins_high_cpu
-  IF sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu{mode="system"}) * 100 > 10
+  IF sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
   FOR 30s
   LABELS { severity = "warning" }
   ANNOTATIONS {
diff --git a/grafana/dashboards/docker_containers.json b/grafana/dashboards/docker_containers.json
index 29aef59..9bee5b5 100644
--- a/grafana/dashboards/docker_containers.json
+++ b/grafana/dashboards/docker_containers.json
@@ -75,7 +75,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_cpu_user_seconds_total{image!=\"\"}[1m])) / count(node_cpu{mode=\"user\"}) * 100",
+          "expr": "sum(rate(container_cpu_user_seconds_total{image!=\"\"}[1m])) / count(node_cpu_seconds_total{mode=\"user\"}) * 100",
           "interval": "10s",
           "intervalFactor": 1,
           "legendFormat": "",
@@ -237,7 +237,7 @@
       },
       "targets": [
         {
-          "expr": "(sum(node_memory_MemTotal) - sum(node_memory_MemFree+node_memory_Buffers+node_memory_Cached) ) / sum(node_memory_MemTotal) * 100",
+          "expr": "(sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100",
           "interval": "10s",
           "intervalFactor": 2,
           "legendFormat": "",
@@ -403,7 +403,7 @@
       },
       "targets": [
         {
-          "expr": "(node_filesystem_size{fstype=\"aufs\"} - node_filesystem_free{fstype=\"aufs\"}) / node_filesystem_size{fstype=\"aufs\"} * 100",
+          "expr": "(node_filesystem_size_bytes{fstype=\"aufs\"} - node_filesystem_free_bytes{fstype=\"aufs\"}) / node_filesystem_size_bytes{fstype=\"aufs\"} * 100",
           "interval": "30s",
           "intervalFactor": 1,
           "legendFormat": "",
@@ -735,7 +735,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "sum(irate(node_disk_bytes_read[5m]))",
+          "expr": "sum(irate(node_disk_read_bytes_total[5m]))",
           "interval": "2s",
           "intervalFactor": 4,
           "legendFormat": "read",
@@ -744,7 +744,7 @@
           "step": 8
         },
         {
-          "expr": "sum(irate(node_disk_bytes_written[5m]))",
+          "expr": "sum(irate(node_disk_written_bytes_total[5m]))",
           "interval": "2s",
           "intervalFactor": 4,
           "legendFormat": "written",
@@ -753,7 +753,7 @@
           "step": 8
         },
         {
-          "expr": "sum(irate(node_disk_io_time_ms[5m]))",
+          "expr": "sum(irate(node_disk_io_time_seconds_total[5m]))",
           "interval": "2s",
           "intervalFactor": 4,
           "legendFormat": "io time",
@@ -843,7 +843,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m])) / scalar(count(node_cpu{mode=\"user\"})) * 100",
+          "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m])) / scalar(count(node_cpu_seconds_total{mode=\"user\"})) * 100",
           "intervalFactor": 10,
           "legendFormat": "{{ name }}",
           "metric": "container_cpu_user_seconds_total",
diff --git a/grafana/dashboards/docker_host.json b/grafana/dashboards/docker_host.json
index 9ce726a..bce75dc 100644
--- a/grafana/dashboards/docker_host.json
+++ b/grafana/dashboards/docker_host.json
@@ -75,7 +75,7 @@
       },
       "targets": [
         {
-          "expr": "node_time - node_boot_time",
+          "expr": "node_time_seconds - node_boot_time_seconds",
           "interval": "30s",
           "intervalFactor": 1,
           "refId": "A",
@@ -155,7 +155,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(node_cpu{mode=\"idle\"}[1m])) * 100 / scalar(count(node_cpu{mode=\"user\"}))",
+          "expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))",
           "interval": "10s",
           "intervalFactor": 2,
           "legendFormat": "",
@@ -316,7 +316,7 @@
       },
       "targets": [
         {
-          "expr": "node_memory_MemAvailable",
+          "expr": "node_memory_MemAvailable_bytes",
           "interval": "30s",
           "intervalFactor": 2,
           "legendFormat": "",
@@ -397,7 +397,7 @@
       },
       "targets": [
         {
-          "expr": "node_memory_SwapFree",
+          "expr": "node_memory_SwapFree_bytes",
           "interval": "30s",
           "intervalFactor": 2,
           "refId": "A",
@@ -477,7 +477,7 @@
       },
       "targets": [
         {
-          "expr": "sum(node_filesystem_free{fstype=\"aufs\"})",
+          "expr": "sum(node_filesystem_free_bytes{fstype=\"aufs\"})",
           "interval": "30s",
           "intervalFactor": 1,
           "legendFormat": "",
@@ -728,11 +728,11 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": " irate(node_intr[5m])",
+          "expr": " irate(node_intr_total[5m])",
           "interval": "10s",
           "intervalFactor": 1,
           "legendFormat": "interrupts",
-          "metric": "node_intr",
+          "metric": "node_intr_total",
           "refId": "A",
           "step": 10
         }
@@ -818,10 +818,10 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "sum(rate(node_cpu[1m])) by (mode) * 100 / scalar(count(node_cpu{mode=\"user\"}))",
+          "expr": "sum(rate(node_cpu_seconds_total[1m])) by (mode) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))",
           "intervalFactor": 10,
           "legendFormat": "{{ mode }}",
-          "metric": "node_cpu",
+          "metric": "node_cpu_seconds_total",
           "refId": "A",
           "step": 10
         }
@@ -924,28 +924,28 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)",
+          "expr": "node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)",
           "intervalFactor": 1,
           "legendFormat": "Used",
           "refId": "A",
           "step": 1
         },
         {
-          "expr": "node_memory_MemFree",
+          "expr": "node_memory_MemFree_bytes",
           "intervalFactor": 1,
           "legendFormat": "Free",
           "refId": "B",
           "step": 1
         },
         {
-          "expr": "node_memory_Buffers",
+          "expr": "node_memory_Buffers_bytes",
           "intervalFactor": 1,
           "legendFormat": "Buffers",
           "refId": "C",
           "step": 1
         },
         {
-          "expr": "node_memory_Cached",
+          "expr": "node_memory_Cached_bytes",
           "intervalFactor": 1,
           "legendFormat": "Cached",
           "refId": "D",
@@ -1046,27 +1046,27 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "sum(irate(node_disk_bytes_read[1m]))",
+          "expr": "sum(irate(node_disk_read_bytes_total[1m]))",
           "interval": "",
           "intervalFactor": 1,
           "legendFormat": "read",
-          "metric": "node_disk_bytes_read",
+          "metric": "node_disk_read_bytes_total",
           "refId": "A",
           "step": 1
         },
         {
-          "expr": "sum(irate(node_disk_bytes_written[1m]))",
+          "expr": "sum(irate(node_disk_written_bytes_total[1m]))",
           "intervalFactor": 1,
           "legendFormat": "written",
-          "metric": "node_disk_bytes_written",
+          "metric": "node_disk_written_bytes_total",
           "refId": "B",
           "step": 1
         },
         {
-          "expr": "sum(irate(node_disk_io_time_ms[1m]))",
+          "expr": "sum(irate(node_disk_io_time_seconds_total[1m]))",
           "intervalFactor": 1,
           "legendFormat": "io time",
-          "metric": "node_disk_io_time_ms",
+          "metric": "node_disk_io_time_seconds_total",
           "refId": "C",
           "step": 1
         }
@@ -1152,18 +1152,18 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "irate(node_network_receive_bytes{device!=\"lo\"}[1m])",
+          "expr": "irate(node_network_receive_bytes_total{device!=\"lo\"}[1m])",
           "intervalFactor": 1,
           "legendFormat": "In: {{ device }}",
-          "metric": "node_network_receive_bytes",
+          "metric": "node_network_receive_bytes_total",
           "refId": "A",
           "step": 1
         },
         {
-          "expr": "irate(node_network_transmit_bytes{device!=\"lo\"}[1m])",
+          "expr": "irate(node_network_transmit_bytes_total{device!=\"lo\"}[1m])",
           "intervalFactor": 1,
           "legendFormat": "Out: {{ device }}",
-          "metric": "node_network_transmit_bytes",
+          "metric": "node_network_transmit_bytes_total",
           "refId": "B",
           "step": 1
         }
@@ -1258,7 +1258,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "node_memory_SwapTotal - node_memory_SwapFree",
+          "expr": "node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes",
           "interval": "10s",
           "intervalFactor": 1,
           "legendFormat": "Used",
@@ -1266,7 +1266,7 @@
           "step": 10
         },
         {
-          "expr": "node_memory_SwapFree",
+          "expr": "node_memory_SwapFree_bytes",
           "interval": "10s",
           "intervalFactor": 1,
           "legendFormat": "Free",
diff --git a/grafana/dashboards/monitor_services.json b/grafana/dashboards/monitor_services.json
index 1693d02..e8314f7 100644
--- a/grafana/dashboards/monitor_services.json
+++ b/grafana/dashboards/monitor_services.json
@@ -392,7 +392,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "sum(rate(container_cpu_user_seconds_total{container_label_org_label_schema_group=\"monitoring\"}[1m]) * 100 / scalar(count(node_cpu{mode=\"user\"}))) by (name)",
+          "expr": "sum(rate(container_cpu_user_seconds_total{container_label_org_label_schema_group=\"monitoring\"}[1m]) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))) by (name)",
           "intervalFactor": 10,
           "legendFormat": "{{ name }}",
           "refId": "A",
diff --git a/grafana/dashboards/nginx_container.json b/grafana/dashboards/nginx_container.json
index a80ace9..c3e7270 100644
--- a/grafana/dashboards/nginx_container.json
+++ b/grafana/dashboards/nginx_container.json
@@ -312,7 +312,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu{mode=\"system\"}) * 100",
+          "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
           "intervalFactor": 2,
           "legendFormat": "nginx",
           "refId": "A",
diff --git a/prometheus/alert.rules b/prometheus/alert.rules
index 402699b..7b6eb07 100644
--- a/prometheus/alert.rules
+++ b/prometheus/alert.rules
@@ -22,7 +22,7 @@ groups:
       description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
 
   - alert: high_memory_load
-    expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85
+    expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
     for: 30s
     labels:
       severity: warning
@@ -31,7 +31,7 @@
       description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
 
   - alert: high_storage_load
-    expr: (node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} * 100 > 85
+    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
     for: 30s
     labels:
      severity: warning
@@ -51,7 +51,7 @@
      description: "Jenkins container is down for more than 30 seconds."
 
   - alert: jenkins_high_cpu
-    expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu{mode="system"}) * 100 > 10
+    expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
     for: 30s
     labels:
      severity: warning
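
After applying this patch, it can be worth confirming that the running node-exporter actually exposes the renamed v0.16.0 series before importing the updated dashboards and reloading the alert rules. The sketch below is not part of the patch: it assumes Prometheus is reachable on localhost:9090 (the port referenced in the README), uses only the standard Prometheus HTTP API, and the metric list and helper names are purely illustrative.

```python
# Sketch only: check that the renamed node-exporter v0.16.0 series exist, and
# list the fstype values available for the Free Storage graph and storage alert.
# PROM_URL is an assumption (Prometheus on localhost:9090); adjust as needed.
import json
import urllib.parse
import urllib.request

PROM_URL = "http://localhost:9090"

def instant_query(expr):
    """Run an instant query against the Prometheus HTTP API and return the result list."""
    params = urllib.parse.urlencode({"query": expr})
    with urllib.request.urlopen(f"{PROM_URL}/api/v1/query?{params}") as resp:
        return json.load(resp)["data"]["result"]

# A few of the renamed series the updated dashboards and alerts rely on.
renamed = [
    "node_cpu_seconds_total",
    "node_memory_MemTotal_bytes",
    "node_filesystem_free_bytes",
    "node_disk_read_bytes_total",
    "node_network_receive_bytes_total",
]
for metric in renamed:
    found = instant_query(metric)
    print(f"{metric}: {'OK' if found else 'missing (node-exporter older than v0.16.0?)'}")

# The README's fstype hint: list the filesystem types node-exporter reports,
# so the right value can be substituted for aufs/btrfs in docker_host.json and alert.rules.
fstypes = {s["metric"].get("fstype") for s in instant_query("node_filesystem_free_bytes")}
print("available fstype values:", sorted(t for t in fstypes if t))
```

Running the same expressions in the Prometheus expression browser gives the same answer, as the README already suggests for node_filesystem_free_bytes; the fstype listing shows which value to substitute in the dashboard and alert expressions.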