mirror of
https://github.com/stefanprodan/dockprom.git
synced 2025-05-05 15:32:50 +00:00
Update dashboards, alerts and README to reflect changes in node-exporter v0.16.0
This commit is contained in:
parent
dcdd37ca1f
commit
6e6534f51e
10
README.md
10
README.md
@ -60,13 +60,13 @@ The Docker Host Dashboard shows key metrics for monitoring the resource usage of
|
||||
For storage, and particularly the Free Storage graph, you have to specify the fstype in the Grafana graph request.
|
||||
You can find it in `grafana/dashboards/docker_host.json`, at line 480:
|
||||
|
||||
"expr": "sum(node_filesystem_free{fstype=\"btrfs\"})",
|
||||
"expr": "sum(node_filesystem_free_bytes{fstype=\"btrfs\"})",
|
||||
|
||||
I work on BTRFS, so I need to change `aufs` to `btrfs`.
|
||||
|
||||
You can find the right value for your system in Prometheus `http://<host-ip>:9090` by running this query:
|
||||
|
||||
node_filesystem_free
|
||||
node_filesystem_free_bytes
|
||||
|
||||
***Docker Containers Dashboard***
|
||||
|
||||
@ -161,7 +161,7 @@ Trigger an alert if the Docker host memory is almost full:
|
||||
|
||||
```yaml
|
||||
ALERT high_memory_load
|
||||
IF (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85
|
||||
IF (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
|
||||
FOR 30s
|
||||
LABELS { severity = "warning" }
|
||||
ANNOTATIONS {
|
||||
@ -174,7 +174,7 @@ Trigger an alert if the Docker host storage is almost full:
|
||||
|
||||
```yaml
|
||||
ALERT high_storage_load
|
||||
IF (node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} * 100 > 85
|
||||
IF (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
|
||||
FOR 30s
|
||||
LABELS { severity = "warning" }
|
||||
ANNOTATIONS {
|
||||
@ -202,7 +202,7 @@ Trigger an alert if a container is using more than 10% of total CPU cores for mo
|
||||
|
||||
```yaml
|
||||
ALERT jenkins_high_cpu
|
||||
IF sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu{mode="system"}) * 100 > 10
|
||||
IF sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
|
||||
FOR 30s
|
||||
LABELS { severity = "warning" }
|
||||
ANNOTATIONS {
|
||||
|
@ -75,7 +75,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_user_seconds_total{image!=\"\"}[1m])) / count(node_cpu{mode=\"user\"}) * 100",
|
||||
"expr": "sum(rate(container_cpu_user_seconds_total{image!=\"\"}[1m])) / count(node_cpu_seconds_total{mode=\"user\"}) * 100",
|
||||
"interval": "10s",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "",
|
||||
@ -237,7 +237,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(node_memory_MemTotal) - sum(node_memory_MemFree+node_memory_Buffers+node_memory_Cached) ) / sum(node_memory_MemTotal) * 100",
|
||||
"expr": "(sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100",
|
||||
"interval": "10s",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
@ -403,7 +403,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(node_filesystem_size{fstype=\"aufs\"} - node_filesystem_free{fstype=\"aufs\"}) / node_filesystem_size{fstype=\"aufs\"} * 100",
|
||||
"expr": "(node_filesystem_size_bytes{fstype=\"aufs\"} - node_filesystem_free_bytes{fstype=\"aufs\"}) / node_filesystem_size_bytes{fstype=\"aufs\"} * 100",
|
||||
"interval": "30s",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "",
|
||||
@ -735,7 +735,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(irate(node_disk_bytes_read[5m]))",
|
||||
"expr": "sum(irate(node_disk_read_bytes_total[5m]))",
|
||||
"interval": "2s",
|
||||
"intervalFactor": 4,
|
||||
"legendFormat": "read",
|
||||
@ -744,7 +744,7 @@
|
||||
"step": 8
|
||||
},
|
||||
{
|
||||
"expr": "sum(irate(node_disk_bytes_written[5m]))",
|
||||
"expr": "sum(irate(node_disk_written_bytes_total[5m]))",
|
||||
"interval": "2s",
|
||||
"intervalFactor": 4,
|
||||
"legendFormat": "written",
|
||||
@ -753,7 +753,7 @@
|
||||
"step": 8
|
||||
},
|
||||
{
|
||||
"expr": "sum(irate(node_disk_io_time_ms[5m]))",
|
||||
"expr": "sum(irate(node_disk_io_time_seconds_total[5m]))",
|
||||
"interval": "2s",
|
||||
"intervalFactor": 4,
|
||||
"legendFormat": "io time",
|
||||
@ -843,7 +843,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m])) / scalar(count(node_cpu{mode=\"user\"})) * 100",
|
||||
"expr": "sum by (name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_label_org_label_schema_group=\"\"}[1m])) / scalar(count(node_cpu_seconds_total{mode=\"user\"})) * 100",
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "{{ name }}",
|
||||
"metric": "container_cpu_user_seconds_total",
|
||||
|
@ -75,7 +75,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_time - node_boot_time",
|
||||
"expr": "node_time_seconds - node_boot_time_seconds",
|
||||
"interval": "30s",
|
||||
"intervalFactor": 1,
|
||||
"refId": "A",
|
||||
@ -155,7 +155,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_cpu{mode=\"idle\"}[1m])) * 100 / scalar(count(node_cpu{mode=\"user\"}))",
|
||||
"expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))",
|
||||
"interval": "10s",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
@ -316,7 +316,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemAvailable",
|
||||
"expr": "node_memory_MemAvailable_bytes",
|
||||
"interval": "30s",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
@ -397,7 +397,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_SwapFree",
|
||||
"expr": "node_memory_SwapFree_bytes",
|
||||
"interval": "30s",
|
||||
"intervalFactor": 2,
|
||||
"refId": "A",
|
||||
@ -477,7 +477,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_filesystem_free{fstype=\"aufs\"})",
|
||||
"expr": "sum(node_filesystem_free_bytes{fstype=\"aufs\"})",
|
||||
"interval": "30s",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "",
|
||||
@ -728,11 +728,11 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": " irate(node_intr[5m])",
|
||||
"expr": " irate(node_intr_total[5m])",
|
||||
"interval": "10s",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "interrupts",
|
||||
"metric": "node_intr",
|
||||
"metric": "node_intr_total",
|
||||
"refId": "A",
|
||||
"step": 10
|
||||
}
|
||||
@ -818,10 +818,10 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_cpu[1m])) by (mode) * 100 / scalar(count(node_cpu{mode=\"user\"}))",
|
||||
"expr": "sum(rate(node_cpu_seconds_total[1m])) by (mode) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))",
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "{{ mode }}",
|
||||
"metric": "node_cpu",
|
||||
"metric": "node_cpu_seconds_total",
|
||||
"refId": "A",
|
||||
"step": 10
|
||||
}
|
||||
@ -924,28 +924,28 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)",
|
||||
"expr": "node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Used",
|
||||
"refId": "A",
|
||||
"step": 1
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_MemFree",
|
||||
"expr": "node_memory_MemFree_bytes",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Free",
|
||||
"refId": "B",
|
||||
"step": 1
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Buffers",
|
||||
"expr": "node_memory_Buffers_bytes",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Buffers",
|
||||
"refId": "C",
|
||||
"step": 1
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Cached",
|
||||
"expr": "node_memory_Cached_bytes",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Cached",
|
||||
"refId": "D",
|
||||
@ -1046,27 +1046,27 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(irate(node_disk_bytes_read[1m]))",
|
||||
"expr": "sum(irate(node_disk_read_bytes_total[1m]))",
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "read",
|
||||
"metric": "node_disk_bytes_read",
|
||||
"metric": "node_disk_read_bytes_total",
|
||||
"refId": "A",
|
||||
"step": 1
|
||||
},
|
||||
{
|
||||
"expr": "sum(irate(node_disk_bytes_written[1m]))",
|
||||
"expr": "sum(irate(node_disk_written_bytes_total[1m]))",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "written",
|
||||
"metric": "node_disk_bytes_written",
|
||||
"metric": "node_disk_written_bytes_total",
|
||||
"refId": "B",
|
||||
"step": 1
|
||||
},
|
||||
{
|
||||
"expr": "sum(irate(node_disk_io_time_ms[1m]))",
|
||||
"expr": "sum(irate(node_disk_io_time_seconds_total[1m]))",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "io time",
|
||||
"metric": "node_disk_io_time_ms",
|
||||
"metric": "node_disk_io_time_seconds_total",
|
||||
"refId": "C",
|
||||
"step": 1
|
||||
}
|
||||
@ -1152,18 +1152,18 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "irate(node_network_receive_bytes{device!=\"lo\"}[1m])",
|
||||
"expr": "irate(node_network_receive_bytes_total{device!=\"lo\"}[1m])",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "In: {{ device }}",
|
||||
"metric": "node_network_receive_bytes",
|
||||
"metric": "node_network_receive_bytes_total",
|
||||
"refId": "A",
|
||||
"step": 1
|
||||
},
|
||||
{
|
||||
"expr": "irate(node_network_transmit_bytes{device!=\"lo\"}[1m])",
|
||||
"expr": "irate(node_network_transmit_bytes_total{device!=\"lo\"}[1m])",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Out: {{ device }}",
|
||||
"metric": "node_network_transmit_bytes",
|
||||
"metric": "node_network_transmit_bytes_total",
|
||||
"refId": "B",
|
||||
"step": 1
|
||||
}
|
||||
@ -1258,7 +1258,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_SwapTotal - node_memory_SwapFree",
|
||||
"expr": "node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes",
|
||||
"interval": "10s",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Used",
|
||||
@ -1266,7 +1266,7 @@
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_SwapFree",
|
||||
"expr": "node_memory_SwapFree_bytes",
|
||||
"interval": "10s",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Free",
|
||||
|
@ -392,7 +392,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_user_seconds_total{container_label_org_label_schema_group=\"monitoring\"}[1m]) * 100 / scalar(count(node_cpu{mode=\"user\"}))) by (name)",
|
||||
"expr": "sum(rate(container_cpu_user_seconds_total{container_label_org_label_schema_group=\"monitoring\"}[1m]) * 100 / scalar(count(node_cpu_seconds_total{mode=\"user\"}))) by (name)",
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "{{ name }}",
|
||||
"refId": "A",
|
||||
|
@ -312,7 +312,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu{mode=\"system\"}) * 100",
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "nginx",
|
||||
"refId": "A",
|
||||
|
@ -22,7 +22,7 @@ groups:
|
||||
description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: high_memory_load
|
||||
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached) ) / sum(node_memory_MemTotal) * 100 > 85
|
||||
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
@ -31,7 +31,7 @@ groups:
|
||||
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
|
||||
|
||||
- alert: high_storage_load
|
||||
expr: (node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} * 100 > 85
|
||||
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
@ -51,7 +51,7 @@ groups:
|
||||
description: "Jenkins container is down for more than 30 seconds."
|
||||
|
||||
- alert: jenkins_high_cpu
|
||||
expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu{mode="system"}) * 100 > 10
|
||||
expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
|
Loading…
x
Reference in New Issue
Block a user