diff --git a/README.md b/README.md index b5ad6d4..6b41de5 100644 --- a/README.md +++ b/README.md @@ -60,13 +60,13 @@ The Docker Host Dashboard shows key metrics for monitoring the resource usage of For storage and particularly Free Storage graph, you have to specify the fstype in grafana graph request. You can find it in `grafana/dashboards/docker_host.json`, at line 480 : - "expr": "sum(node_filesystem_free{fstype=\"btrfs\"})", + "expr": "sum(node_filesystem_free_bytes{fstype=\"btrfs\"})", I work on BTRFS, so i need to change `aufs` to `btrfs`. You can find right value for your system in Prometheus `http://:9090` launching this request : - node_filesystem_free + node_filesystem_free_bytes ***Docker Containers Dashboard*** @@ -161,7 +161,7 @@ Trigger an alert if the Docker host memory is almost full: ```yaml ALERT high_memory_load - IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85 + IF (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85 FOR 30s LABELS { severity = "warning" } ANNOTATIONS { @@ -174,7 +174,7 @@ Trigger an alert if the Docker host storage is almost full: ```yaml ALERT hight_storage_load - IF (node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} * 100 > 85 + IF (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85 FOR 30s LABELS { severity = "warning" } ANNOTATIONS { @@ -202,7 +202,7 @@ Trigger an alert if a container is using more than 10% of total CPU cores for mo ```yaml ALERT jenkins_high_cpu - IF sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu{mode="system"}) * 100 > 10 + IF sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10 FOR 30s LABELS { severity = "warning" } ANNOTATIONS { diff --git a/grafana/dashboards/docker_containers.json b/grafana/dashboards/docker_containers.json index 29aef59..9bee5b5 100644 --- a/grafana/dashboards/docker_containers.json +++ b/grafana/dashboards/docker_containers.json @@ -75,7 +75,7 @@ }, "targets": [ { - "expr": "sum(rate(container_cpu_user_seconds_total{image!=\"\"}[1m])) / count(node_cpu{mode=\"user\"}) * 100", + "expr": "sum(rate(container_cpu_user_seconds_total{image!=\"\"}[1m])) / count(node_cpu_seconds_total{mode=\"user\"}) * 100", "interval": "10s", "intervalFactor": 1, "legendFormat": "", @@ -237,7 +237,7 @@ }, "targets": [ { - "expr": "(sum(node_memory_MemTotal) - sum(node_memory_MemFree+node_memory_Buffers+node_memory_Cached) ) / sum(node_memory_MemTotal) * 100", + "expr": "(sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100", "interval": "10s", "intervalFactor": 2, "legendFormat": "", @@ -403,7 +403,7 @@ }, "targets": [ { - "expr": "(node_filesystem_size{fstype=\"aufs\"} - node_filesystem_free{fstype=\"aufs\"}) / node_filesystem_size{fstype=\"aufs\"} * 100", + "expr": "(node_filesystem_size_bytes{fstype=\"aufs\"} - node_filesystem_free_bytes{fstype=\"aufs\"}) / node_filesystem_size_bytes{fstype=\"aufs\"} * 100", "interval": "30s", "intervalFactor": 1, "legendFormat": "", @@ -735,7 +735,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(irate(node_disk_bytes_read[5m]))", + "expr": "sum(irate(node_disk_read_bytes_total[5m]))", "interval": "2s", "intervalFactor": 4, "legendFormat": "read", @@ -744,7 +744,7 @@ "step": 8 }, { - "expr": "sum(irate(node_disk_bytes_written[5m]))", + "expr": "sum(irate(node_disk_written_bytes_total[5m]))", "interval": "2s", "intervalFactor": 4, "legendFormat": "written", @@ -753,7 +753,7 @@ "step": 8 }, { - "expr": "sum(irate(node_disk_io_time_ms[5m]))", + "expr": "sum(irate(node_disk_io_time_seconds_total[5m]))", "interval": "2s", "intervalFactor": 4, "legendFormat": "io time", @@ -843,7 +843,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m])) / scalar(count(node_cpu{mode=\"user\"})) * 100", + "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m])) / scalar(count(node_cpu_seconds_total{mode=\"user\"})) * 100", "intervalFactor": 10, "legendFormat": "{{ name }}", "metric": "container_cpu_user_seconds_total", diff --git a/grafana/dashboards/docker_host.json b/grafana/dashboards/docker_host.json index 9ce726a..bce75dc 100644 --- a/grafana/dashboards/docker_host.json +++ b/grafana/dashboards/docker_host.json @@ -75,7 +75,7 @@ }, "targets": [ { - "expr": "node_time - node_boot_time", + "expr": "node_time_seconds - node_boot_time_seconds", "interval": "30s", "intervalFactor": 1, "refId": "A", @@ -155,7 +155,7 @@ }, "targets": [ { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[1m])) * 100 / scalar(count(node_cpu{mode=\"user\"}))", + "expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))", "interval": "10s", "intervalFactor": 2, "legendFormat": "", @@ -316,7 +316,7 @@ }, "targets": [ { - "expr": "node_memory_MemAvailable", + "expr": "node_memory_MemAvailable_bytes", "interval": "30s", "intervalFactor": 2, "legendFormat": "", @@ -397,7 +397,7 @@ }, "targets": [ { - "expr": "node_memory_SwapFree", + "expr": "node_memory_SwapFree_bytes", "interval": "30s", "intervalFactor": 2, "refId": "A", @@ -477,7 +477,7 @@ }, "targets": [ { - "expr": "sum(node_filesystem_free{fstype=\"aufs\"})", + "expr": "sum(node_filesystem_free_bytes{fstype=\"aufs\"})", "interval": "30s", "intervalFactor": 1, "legendFormat": "", @@ -728,11 +728,11 @@ "steppedLine": false, "targets": [ { - "expr": " irate(node_intr[5m])", + "expr": " irate(node_intr_total[5m])", "interval": "10s", "intervalFactor": 1, "legendFormat": "interrupts", - "metric": "node_intr", + "metric": "node_intr_total", "refId": "A", "step": 10 } @@ -818,10 +818,10 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(node_cpu[1m])) by (mode) * 100 / scalar(count(node_cpu{mode=\"user\"}))", + "expr": "sum(rate(node_cpu_seconds_total[1m])) by (mode) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))", "intervalFactor": 10, "legendFormat": "{{ mode }}", - "metric": "node_cpu", + "metric": "node_cpu_seconds_total", "refId": "A", "step": 10 } @@ -924,28 +924,28 @@ "steppedLine": false, "targets": [ { - "expr": "node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)", + "expr": "node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)", "intervalFactor": 1, "legendFormat": "Used", "refId": "A", "step": 1 }, { - "expr": "node_memory_MemFree", + "expr": "node_memory_MemFree_bytes", "intervalFactor": 1, "legendFormat": "Free", "refId": "B", "step": 1 }, { - "expr": "node_memory_Buffers", + "expr": "node_memory_Buffers_bytes", "intervalFactor": 1, "legendFormat": "Buffers", "refId": "C", "step": 1 }, { - "expr": "node_memory_Cached", + "expr": "node_memory_Cached_bytes", "intervalFactor": 1, "legendFormat": "Cached", "refId": "D", @@ -1046,27 +1046,27 @@ "steppedLine": false, "targets": [ { - "expr": "sum(irate(node_disk_bytes_read[1m]))", + "expr": "sum(irate(node_disk_read_bytes_total[1m]))", "interval": "", "intervalFactor": 1, "legendFormat": "read", - "metric": "node_disk_bytes_read", + "metric": "node_disk_read_bytes_total", "refId": "A", "step": 1 }, { - "expr": "sum(irate(node_disk_bytes_written[1m]))", + "expr": "sum(irate(node_disk_written_bytes_total[1m]))", "intervalFactor": 1, "legendFormat": "written", - "metric": "node_disk_bytes_written", + "metric": "node_disk_written_bytes_total", "refId": "B", "step": 1 }, { - "expr": "sum(irate(node_disk_io_time_ms[1m]))", + "expr": "sum(irate(node_disk_io_time_seconds_total[1m]))", "intervalFactor": 1, "legendFormat": "io time", - "metric": "node_disk_io_time_ms", + "metric": "node_disk_io_time_seconds_total", "refId": "C", "step": 1 } @@ -1152,18 +1152,18 @@ "steppedLine": false, "targets": [ { - "expr": "irate(node_network_receive_bytes{device!=\"lo\"}[1m])", + "expr": "irate(node_network_receive_bytes_total{device!=\"lo\"}[1m])", "intervalFactor": 1, "legendFormat": "In: {{ device }}", - "metric": "node_network_receive_bytes", + "metric": "node_network_receive_bytes_total", "refId": "A", "step": 1 }, { - "expr": "irate(node_network_transmit_bytes{device!=\"lo\"}[1m])", + "expr": "irate(node_network_transmit_bytes_total{device!=\"lo\"}[1m])", "intervalFactor": 1, "legendFormat": "Out: {{ device }}", - "metric": "node_network_transmit_bytes", + "metric": "node_network_transmit_bytes_total", "refId": "B", "step": 1 } @@ -1258,7 +1258,7 @@ "steppedLine": false, "targets": [ { - "expr": "node_memory_SwapTotal - node_memory_SwapFree", + "expr": "node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes", "interval": "10s", "intervalFactor": 1, "legendFormat": "Used", @@ -1266,7 +1266,7 @@ "step": 10 }, { - "expr": "node_memory_SwapFree", + "expr": "node_memory_SwapFree_bytes", "interval": "10s", "intervalFactor": 1, "legendFormat": "Free", diff --git a/grafana/dashboards/monitor_services.json b/grafana/dashboards/monitor_services.json index 1693d02..e8314f7 100644 --- a/grafana/dashboards/monitor_services.json +++ b/grafana/dashboards/monitor_services.json @@ -392,7 +392,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_user_seconds_total{container_label_org_label_schema_group=\"monitoring\"}[1m]) * 100 / scalar(count(node_cpu{mode=\"user\"}))) by (name)", + "expr": "sum(rate(container_cpu_user_seconds_total{container_label_org_label_schema_group=\"monitoring\"}[1m]) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))) by (name)", "intervalFactor": 10, "legendFormat": "{{ name }}", "refId": "A", diff --git a/grafana/dashboards/nginx_container.json b/grafana/dashboards/nginx_container.json index a80ace9..c3e7270 100644 --- a/grafana/dashboards/nginx_container.json +++ b/grafana/dashboards/nginx_container.json @@ -312,7 +312,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu{mode=\"system\"}) * 100", + "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100", "intervalFactor": 2, "legendFormat": "nginx", "refId": "A", diff --git a/prometheus/alert.rules b/prometheus/alert.rules index 402699b..7b6eb07 100644 --- a/prometheus/alert.rules +++ b/prometheus/alert.rules @@ -22,7 +22,7 @@ groups: description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." - alert: high_memory_load - expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85 + expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85 for: 30s labels: severity: warning @@ -31,7 +31,7 @@ groups: description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." - alert: high_storage_load - expr: (node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} * 100 > 85 + expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85 for: 30s labels: severity: warning @@ -51,7 +51,7 @@ groups: description: "Jenkins container is down for more than 30 seconds." - alert: jenkins_high_cpu - expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu{mode="system"}) * 100 > 10 + expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10 for: 30s labels: severity: warning