Update dashboards, alerts and README to reflect changes in node-exporter v0.16.0

Amir Zarrinkafsh 2018-08-12 14:37:35 +10:00
parent dcdd37ca1f
commit 6e6534f51e
6 changed files with 42 additions and 42 deletions
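For reference, the node-exporter v0.16.0 metric renames applied throughout the diffs below are (old name on the left, new name on the right; all of these appear in the changed files):

```
node_cpu                    -> node_cpu_seconds_total
node_time / node_boot_time  -> node_time_seconds / node_boot_time_seconds
node_memory_*               -> node_memory_*_bytes
node_filesystem_free        -> node_filesystem_free_bytes
node_filesystem_size        -> node_filesystem_size_bytes
node_disk_bytes_read        -> node_disk_read_bytes_total
node_disk_bytes_written     -> node_disk_written_bytes_total
node_disk_io_time_ms        -> node_disk_io_time_seconds_total (now reported in seconds)
node_network_receive_bytes  -> node_network_receive_bytes_total
node_network_transmit_bytes -> node_network_transmit_bytes_total
node_intr                   -> node_intr_total
```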

View File

@@ -60,13 +60,13 @@ The Docker Host Dashboard shows key metrics for monitoring the resource usage of
For storage, and particularly the Free Storage graph, you have to specify the fstype in the Grafana graph request.
You can find it in `grafana/dashboards/docker_host.json`, at line 480:
"expr": "sum(node_filesystem_free{fstype=\"btrfs\"})",
"expr": "sum(node_filesystem_free_bytes{fstype=\"btrfs\"})",
I work on BTRFS, so I need to change `aufs` to `btrfs`.
You can find the right value for your system in Prometheus (`http://<host-ip>:9090`) by running this query:
- node_filesystem_free
+ node_filesystem_free_bytes
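If you are not sure which `fstype` values your host actually exposes, one quick way to list them (a small sketch; the filesystem types returned depend on your system) is to group the renamed metric by the `fstype` label in the Prometheus expression browser:

```
count by (fstype) (node_filesystem_free_bytes)
```

Each row in the result is a filesystem type you can plug into the dashboard expression above.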
***Docker Containers Dashboard***
@@ -161,7 +161,7 @@ Trigger an alert if the Docker host memory is almost full:
```yaml
ALERT high_memory_load
- IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85
+ IF (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
FOR 30s
LABELS { severity = "warning" }
ANNOTATIONS {
@@ -174,7 +174,7 @@ Trigger an alert if the Docker host storage is almost full:
```yaml
ALERT high_storage_load
- IF (node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} * 100 > 85
+ IF (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
FOR 30s
LABELS { severity = "warning" }
ANNOTATIONS {
@@ -202,7 +202,7 @@ Trigger an alert if a container is using more than 10% of total CPU cores for mo
```yaml
ALERT jenkins_high_cpu
- IF sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu{mode="system"}) * 100 > 10
+ IF sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
FOR 30s
LABELS { severity = "warning" }
ANNOTATIONS {

View File

@@ -75,7 +75,7 @@
},
"targets": [
{
"expr": "sum(rate(container_cpu_user_seconds_total{image!=\"\"}[1m])) / count(node_cpu{mode=\"user\"}) * 100",
"expr": "sum(rate(container_cpu_user_seconds_total{image!=\"\"}[1m])) / count(node_cpu_seconds_total{mode=\"user\"}) * 100",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
@@ -237,7 +237,7 @@
},
"targets": [
{
"expr": "(sum(node_memory_MemTotal) - sum(node_memory_MemFree+node_memory_Buffers+node_memory_Cached) ) / sum(node_memory_MemTotal) * 100",
"expr": "(sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100",
"interval": "10s",
"intervalFactor": 2,
"legendFormat": "",
@@ -403,7 +403,7 @@
},
"targets": [
{
"expr": "(node_filesystem_size{fstype=\"aufs\"} - node_filesystem_free{fstype=\"aufs\"}) / node_filesystem_size{fstype=\"aufs\"} * 100",
"expr": "(node_filesystem_size_bytes{fstype=\"aufs\"} - node_filesystem_free_bytes{fstype=\"aufs\"}) / node_filesystem_size_bytes{fstype=\"aufs\"} * 100",
"interval": "30s",
"intervalFactor": 1,
"legendFormat": "",
@@ -735,7 +735,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(node_disk_bytes_read[5m]))",
"expr": "sum(irate(node_disk_read_bytes_total[5m]))",
"interval": "2s",
"intervalFactor": 4,
"legendFormat": "read",
@@ -744,7 +744,7 @@
"step": 8
},
{
"expr": "sum(irate(node_disk_bytes_written[5m]))",
"expr": "sum(irate(node_disk_written_bytes_total[5m]))",
"interval": "2s",
"intervalFactor": 4,
"legendFormat": "written",
@@ -753,7 +753,7 @@
"step": 8
},
{
"expr": "sum(irate(node_disk_io_time_ms[5m]))",
"expr": "sum(irate(node_disk_io_time_seconds_total[5m]))",
"interval": "2s",
"intervalFactor": 4,
"legendFormat": "io time",
@@ -843,7 +843,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m])) / scalar(count(node_cpu{mode=\"user\"})) * 100",
"expr": "sum by (name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m])) / scalar(count(node_cpu_seconds_total{mode=\"user\"})) * 100",
"intervalFactor": 10,
"legendFormat": "{{ name }}",
"metric": "container_cpu_user_seconds_total",

View File

@@ -75,7 +75,7 @@
},
"targets": [
{
"expr": "node_time - node_boot_time",
"expr": "node_time_seconds - node_boot_time_seconds",
"interval": "30s",
"intervalFactor": 1,
"refId": "A",
@@ -155,7 +155,7 @@
},
"targets": [
{
"expr": "sum(rate(node_cpu{mode=\"idle\"}[1m])) * 100 / scalar(count(node_cpu{mode=\"user\"}))",
"expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))",
"interval": "10s",
"intervalFactor": 2,
"legendFormat": "",
@@ -316,7 +316,7 @@
},
"targets": [
{
"expr": "node_memory_MemAvailable",
"expr": "node_memory_MemAvailable_bytes",
"interval": "30s",
"intervalFactor": 2,
"legendFormat": "",
@@ -397,7 +397,7 @@
},
"targets": [
{
"expr": "node_memory_SwapFree",
"expr": "node_memory_SwapFree_bytes",
"interval": "30s",
"intervalFactor": 2,
"refId": "A",
@@ -477,7 +477,7 @@
},
"targets": [
{
"expr": "sum(node_filesystem_free{fstype=\"aufs\"})",
"expr": "sum(node_filesystem_free_bytes{fstype=\"aufs\"})",
"interval": "30s",
"intervalFactor": 1,
"legendFormat": "",
@@ -728,11 +728,11 @@
"steppedLine": false,
"targets": [
{
"expr": " irate(node_intr[5m])",
"expr": " irate(node_intr_total[5m])",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "interrupts",
"metric": "node_intr",
"metric": "node_intr_total",
"refId": "A",
"step": 10
}
@@ -818,10 +818,10 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_cpu[1m])) by (mode) * 100 / scalar(count(node_cpu{mode=\"user\"}))",
"expr": "sum(rate(node_cpu_seconds_total[1m])) by (mode) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))",
"intervalFactor": 10,
"legendFormat": "{{ mode }}",
"metric": "node_cpu",
"metric": "node_cpu_seconds_total",
"refId": "A",
"step": 10
}
@@ -924,28 +924,28 @@
"steppedLine": false,
"targets": [
{
"expr": "node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)",
"expr": "node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)",
"intervalFactor": 1,
"legendFormat": "Used",
"refId": "A",
"step": 1
},
{
"expr": "node_memory_MemFree",
"expr": "node_memory_MemFree_bytes",
"intervalFactor": 1,
"legendFormat": "Free",
"refId": "B",
"step": 1
},
{
"expr": "node_memory_Buffers",
"expr": "node_memory_Buffers_bytes",
"intervalFactor": 1,
"legendFormat": "Buffers",
"refId": "C",
"step": 1
},
{
"expr": "node_memory_Cached",
"expr": "node_memory_Cached_bytes",
"intervalFactor": 1,
"legendFormat": "Cached",
"refId": "D",
@@ -1046,27 +1046,27 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(node_disk_bytes_read[1m]))",
"expr": "sum(irate(node_disk_read_bytes_total[1m]))",
"interval": "",
"intervalFactor": 1,
"legendFormat": "read",
"metric": "node_disk_bytes_read",
"metric": "node_disk_read_bytes_total",
"refId": "A",
"step": 1
},
{
"expr": "sum(irate(node_disk_bytes_written[1m]))",
"expr": "sum(irate(node_disk_written_bytes_total[1m]))",
"intervalFactor": 1,
"legendFormat": "written",
"metric": "node_disk_bytes_written",
"metric": "node_disk_written_bytes_total",
"refId": "B",
"step": 1
},
{
"expr": "sum(irate(node_disk_io_time_ms[1m]))",
"expr": "sum(irate(node_disk_io_time_seconds_total[1m]))",
"intervalFactor": 1,
"legendFormat": "io time",
"metric": "node_disk_io_time_ms",
"metric": "node_disk_io_time_seconds_total",
"refId": "C",
"step": 1
}
@@ -1152,18 +1152,18 @@
"steppedLine": false,
"targets": [
{
"expr": "irate(node_network_receive_bytes{device!=\"lo\"}[1m])",
"expr": "irate(node_network_receive_bytes_total{device!=\"lo\"}[1m])",
"intervalFactor": 1,
"legendFormat": "In: {{ device }}",
"metric": "node_network_receive_bytes",
"metric": "node_network_receive_bytes_total",
"refId": "A",
"step": 1
},
{
"expr": "irate(node_network_transmit_bytes{device!=\"lo\"}[1m])",
"expr": "irate(node_network_transmit_bytes_total{device!=\"lo\"}[1m])",
"intervalFactor": 1,
"legendFormat": "Out: {{ device }}",
"metric": "node_network_transmit_bytes",
"metric": "node_network_transmit_bytes_total",
"refId": "B",
"step": 1
}
@@ -1258,7 +1258,7 @@
"steppedLine": false,
"targets": [
{
"expr": "node_memory_SwapTotal - node_memory_SwapFree",
"expr": "node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "Used",
@@ -1266,7 +1266,7 @@
"step": 10
},
{
"expr": "node_memory_SwapFree",
"expr": "node_memory_SwapFree_bytes",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "Free",

View File

@@ -392,7 +392,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_user_seconds_total{container_label_org_label_schema_group=\"monitoring\"}[1m]) * 100 / scalar(count(node_cpu{mode=\"user\"}))) by (name)",
"expr": "sum(rate(container_cpu_user_seconds_total{container_label_org_label_schema_group=\"monitoring\"}[1m]) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))) by (name)",
"intervalFactor": 10,
"legendFormat": "{{ name }}",
"refId": "A",

View File

@@ -312,7 +312,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu{mode=\"system\"}) * 100",
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
"intervalFactor": 2,
"legendFormat": "nginx",
"refId": "A",

View File

@@ -22,7 +22,7 @@ groups:
description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: high_memory_load
- expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85
+ expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
for: 30s
labels:
severity: warning
@@ -31,7 +31,7 @@ groups:
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: high_storage_load
- expr: (node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} * 100 > 85
+ expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
for: 30s
labels:
severity: warning
@@ -51,7 +51,7 @@ groups:
description: "Jenkins container is down for more than 30 seconds."
- alert: jenkins_high_cpu
- expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu{mode="system"}) * 100 > 10
+ expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
for: 30s
labels:
severity: warning
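
As a final sanity check after deploying the updated exporter, rules and dashboards (a suggested query, not part of this commit), you can confirm that Prometheus no longer returns any series under the pre-v0.16.0 names. Prometheus regex matchers are fully anchored, so `node_cpu` below will not accidentally match `node_cpu_seconds_total`:

```
count({__name__=~"node_cpu|node_memory_MemTotal|node_filesystem_free|node_disk_bytes_read|node_intr"})
```

An empty result means every expression updated above now matches only what node-exporter v0.16.0 actually exposes.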