From fc967aa992c8f77d1a3d786e0954428ba62160f2 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 01:26:42 +0800 Subject: [PATCH 01/19] Add mountpoint to NodeFilesystem alerts This helps to identify alerting filesystem. Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index f92d97dffe..6c215949a8 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -21,7 +21,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of space within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', }, }, { @@ -41,7 +41,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of space within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', }, }, { @@ -59,7 +59,7 @@ }, annotations: { summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' 
% $._config, - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, { @@ -77,7 +77,7 @@ }, annotations: { summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config, - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, { @@ -97,7 +97,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', }, }, { @@ -117,7 +117,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', }, }, { @@ -135,7 +135,7 @@ }, annotations: { summary: 'Filesystem has less than 5% inodes left.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', + 
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', }, }, { @@ -153,7 +153,7 @@ }, annotations: { summary: 'Filesystem has less than 3% inodes left.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', }, }, { From 0e0399d41ef56022f13b13c76991c195fc64f826 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 01:43:53 +0800 Subject: [PATCH 02/19] Decrease NodeFilesystem pending time to 15m 30m is too long and there is a risk of running out of disk space/inodes completely if something is filling up disk very fast (like log file). Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 6c215949a8..e8eba08912 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -53,7 +53,7 @@ node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) ||| % $._config, - 'for': '30m', + 'for': '15m', labels: { severity: 'warning', }, @@ -71,7 +71,7 @@ node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) ||| % $._config, - 'for': '30m', + 'for': '15m', labels: { severity: '%(nodeCriticalSeverity)s' % $._config, }, @@ -129,7 +129,7 @@ node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) ||| % $._config, - 'for': '1h', + 'for': '15m', labels: { severity: 'warning', }, @@ -147,7 +147,7 @@ 
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) ||| % $._config, - 'for': '1h', + 'for': '15m', labels: { severity: '%(nodeCriticalSeverity)s' % $._config, }, From fd2d62af63fa493001b858ab8aa5860591b33fe7 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 02:57:02 +0800 Subject: [PATCH 03/19] Add CPU and memory alerts Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index e8eba08912..4f732058c6 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -309,6 +309,34 @@ description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', }, }, + { + alert: 'NodeCPUHighUsage', + expr: ||| + sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) > 0.8 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'High CPU usage.', + description: 'CPU usage on {{ $labels.instance }} has been above 80% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', + }, + }, + { + alert: 'NodeMemoryHighUtilization', + expr: ||| + 100 - node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100 < 10 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Host is running out of memory', + description: 'Memory is filling up on {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', + }, + }, ], }, ], From 74794182a7a9cb736afd5b3e41dcb96258ab51f1 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 03:35:41 +0800 Subject: [PATCH 04/19] Add failed systemd 
service alert Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 4f732058c6..c1f98c9c6d 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -337,6 +337,20 @@ description: 'Memory is filling up on {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', }, }, + { + alert: 'NodeSystemdServiceFailed', + expr: ||| + node_systemd_unit_state{%(nodeExporterSelector)s, state="failed"} == 1 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Systemd service has entered failed state.', + description: 'Systemd service {{ $labels.name }} has entered failed state on {{ $labels.instance }}', + }, + }, ], }, ], From 3d8075da7dc86b4b5d1dd43ff886b13c1d4006ab Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 04:22:14 +0800 Subject: [PATCH 05/19] Decrease NodeNetwork*Errs pending period Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index c1f98c9c6d..03eebc3383 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -161,7 +161,7 @@ expr: ||| rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 ||| % $._config, - 'for': '1h', + 'for': '15m', labels: { severity: 'warning', }, @@ -175,7 +175,7 @@ expr: ||| rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 ||| % $._config, - 'for': '1h', + 'for': '15m', labels: { severity: 'warning', }, From 614030bb8027e9cbe7e39aec4bf7e1db3290b1fe Mon Sep 17 00:00:00 2001 From: Vitaly 
Zhuravlev Date: Tue, 28 Mar 2023 04:25:10 +0800 Subject: [PATCH 06/19] Set 'at' everywhere as preposition for instance Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 03eebc3383..6c0ef5b4bb 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -231,7 +231,7 @@ }, annotations: { summary: 'Clock skew detected.', - description: 'Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.', + description: 'Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.', }, }, { @@ -247,7 +247,7 @@ }, annotations: { summary: 'Clock not synchronising.', - description: 'Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.', + description: 'Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.', }, }, { @@ -261,7 +261,7 @@ }, annotations: { summary: 'RAID Array is degraded', - description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", + description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", }, }, { @@ -274,7 +274,7 @@ }, annotations: { summary: 'Failed device in RAID array', - description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", + description: "At least one device in RAID array at {{ $labels.instance }} failed. 
Array '{{ $labels.device }}' needs attention and possibly a disk swap.", }, }, { @@ -320,7 +320,7 @@ }, annotations: { summary: 'High CPU usage.', - description: 'CPU usage on {{ $labels.instance }} has been above 80% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', + description: 'CPU usage at {{ $labels.instance }} has been above 80% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', }, }, { @@ -334,7 +334,7 @@ }, annotations: { summary: 'Host is running out of memory', - description: 'Memory is filling up on {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', + description: 'Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', }, }, { @@ -348,7 +348,7 @@ }, annotations: { summary: 'Systemd service has entered failed state.', - description: 'Systemd service {{ $labels.name }} has entered failed state on {{ $labels.instance }}', + description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', }, }, ], From 94fc82e4183630050523535ff1683ab090349d53 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 05:26:30 +0800 Subject: [PATCH 07/19] Add NodeDiskIOSaturation alert Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 6c0ef5b4bb..bcd955f1e2 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -333,10 +333,27 @@ severity: 'warning', }, annotations: { - summary: 'Host is running out of memory', + summary: 'Host is running out of memory.', description: 'Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" 
$value }}%.', }, }, + { + alert: 'NodeDiskIOSaturation', + expr: ||| + rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s}[5m]) > 10 + ||| % $._config, + 'for': '30m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Disk IO queue is high.', + description: ||| + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + This symptom might indicate disk saturation., + |||, + }, + }, { alert: 'NodeSystemdServiceFailed', expr: ||| From 962de6c92119049918fd4290342d79b799838813 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 05:27:04 +0800 Subject: [PATCH 08/19] Add %(nodeExporterSelector)s to Network and conntrack alerts Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index bcd955f1e2..52f3baaf10 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -159,7 +159,7 @@ { alert: 'NodeNetworkReceiveErrs', expr: ||| - rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 ||| % $._config, 'for': '15m', labels: { @@ -173,7 +173,7 @@ { alert: 'NodeNetworkTransmitErrs', expr: ||| - rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + rate(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 ||| % $._config, 'for': '15m', labels: { @@ -187,7 +187,7 @@ { alert: 'NodeHighNumberConntrackEntriesUsed', expr: ||| - (node_nf_conntrack_entries / 
node_nf_conntrack_entries_limit) > 0.75 + (node_nf_conntrack_entries{%(nodeExporterSelector)s} / node_nf_conntrack_entries_limit) > 0.75 ||| % $._config, annotations: { summary: 'Number of conntrack are getting close to the limit.', From c3ec6e8af12efe28e3d9f6087a6f4e7d0b58ee24 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 06:44:02 +0800 Subject: [PATCH 09/19] Add diskDevice selector Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 52f3baaf10..15f5db8fcb 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -340,7 +340,7 @@ { alert: 'NodeDiskIOSaturation', expr: ||| - rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s}[5m]) > 10 + rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > 10 ||| % $._config, 'for': '30m', labels: { @@ -349,7 +349,7 @@ annotations: { summary: 'Disk IO queue is high.', description: ||| - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. 
This symptom might indicate disk saturation., |||, }, From e15e7d6a7b9e674d00357d471ff8fa332704231f Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 06:44:50 +0800 Subject: [PATCH 10/19] Fix NodeMemoryHighUtilization alert Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 15f5db8fcb..2ea61ba1a9 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -326,7 +326,7 @@ { alert: 'NodeMemoryHighUtilization', expr: ||| - 100 - node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100 < 10 + 100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > 90 ||| % $._config, 'for': '15m', labels: { @@ -334,7 +334,9 @@ }, annotations: { summary: 'Host is running out of memory.', - description: 'Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', + description: ||| + Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. 
+ |||, }, }, { From 580c497261dd03026a83266d2a383d7546fbbfd8 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Tue, 28 Mar 2023 06:58:17 +0800 Subject: [PATCH 11/19] Add NodeSystemSaturation and NodeMemoryMajorPagesFaults Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 37 ++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 2ea61ba1a9..071cfd8dbe 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -323,6 +323,41 @@ description: 'CPU usage at {{ $labels.instance }} has been above 80% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', }, }, + { + alert: 'NodeSystemSaturation', + expr: ||| + node_load1{%(nodeExporterSelector)s} + / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > 2 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'System saturated, load per core is very high.', + description: ||| + System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + This might indicate this instance resources saturation and can cause it becoming unresponsive. + |||, + }, + }, + { + alert: 'NodeMemoryMajorPagesFaults', + expr: ||| + rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > 500 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Memory major page faults are occurring at very high rate.', + description: ||| + Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Please check that there is enough memory available at this instance. 
+ |||, + }, + }, { alert: 'NodeMemoryHighUtilization', expr: ||| @@ -352,7 +387,7 @@ summary: 'Disk IO queue is high.', description: ||| Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation., + This symptom might indicate disk saturation. |||, }, }, From da32f8de1763ab42fc17f6b767561c4d6c22bb13 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Wed, 29 Mar 2023 19:29:58 +0800 Subject: [PATCH 12/19] Decrease NodeSystemdServiceFailed severity to warning Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 071cfd8dbe..5e82ff29c3 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -398,7 +398,7 @@ ||| % $._config, 'for': '5m', labels: { - severity: 'critical', + severity: 'warning', }, annotations: { summary: 'Systemd service has entered failed state.', From e48e7909f4c66f19350464da1b1f9f1eb164d3a2 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Wed, 5 Apr 2023 23:47:10 +0800 Subject: [PATCH 13/19] Extend alert description Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 5e82ff29c3..3f8586d5f9 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -204,7 +204,7 @@ ||| % $._config, annotations: { summary: 'Node Exporter text file collector failed to scrape.', - description: 'Node Exporter text file collector failed to scrape.', + description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.', }, labels: { severity: 
'warning', @@ -260,7 +260,7 @@ severity: 'critical', }, annotations: { - summary: 'RAID Array is degraded', + summary: 'RAID Array is degraded.', description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", }, }, @@ -273,7 +273,7 @@ severity: 'warning', }, annotations: { - summary: 'Failed device in RAID array', + summary: 'Failed device in RAID array.', description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", }, }, From 2111e70ac7f391bcd6945601da0c6a86a63b852f Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Wed, 5 Apr 2023 23:53:10 +0800 Subject: [PATCH 14/19] Add comma after 'mounted on' Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 3f8586d5f9..81ad3c64bc 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -21,7 +21,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of space within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', }, }, { @@ -41,7 +41,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of space within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available 
space left and is filling up fast.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', }, }, { @@ -59,7 +59,7 @@ }, annotations: { summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' % $._config, - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, { @@ -77,7 +77,7 @@ }, annotations: { summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config, - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, { @@ -97,7 +97,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', }, }, { @@ -117,7 +117,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ 
printf "%.2f" $value }}% available inodes left and is filling up fast.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', }, }, { @@ -135,7 +135,7 @@ }, annotations: { summary: 'Filesystem has less than 5% inodes left.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', }, }, { @@ -153,7 +153,7 @@ }, annotations: { summary: 'Filesystem has less than 3% inodes left.', - description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', }, }, { From 77ae769179acf92cee40fa5d27aa41d817496a7a Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Thu, 6 Apr 2023 00:21:50 +0800 Subject: [PATCH 15/19] Add thresholds for memory alerts Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 10 +++++----- docs/node-mixin/config.libsonnet | 8 ++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 81ad3c64bc..a51e6f2cab 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -344,7 +344,7 @@ { alert: 'NodeMemoryMajorPagesFaults', expr: ||| - rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > 500 + rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > 
%(memoryMajorPagesFaultsWarningThreshold)s ||| % $._config, 'for': '15m', labels: { @@ -353,15 +353,15 @@ annotations: { summary: 'Memory major page faults are occurring at very high rate.', description: ||| - Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsWarningThreshold)s major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. Please check that there is enough memory available at this instance. - |||, + ||| % $._config, }, }, { alert: 'NodeMemoryHighUtilization', expr: ||| - 100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > 90 + 100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)s ||| % $._config, 'for': '15m', labels: { @@ -370,7 +370,7 @@ annotations: { summary: 'Host is running out of memory.', description: ||| - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)s% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. |||, }, }, diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 86179c8f93..0e32ac158d 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -60,6 +60,14 @@ fsSpaceAvailableWarningThreshold: 5, fsSpaceAvailableCriticalThreshold: 3, + // Memory utilzation (%) level on which to trigger the + // 'NodeMemoryHighUtilization' alert. 
+ memoryHighUtilizationThreshold: 90, + + // Threshold for the rate of memory major page faults to trigger + // 'NodeMemoryMajorPagesFaults' alert. + memoryMajorPagesFaultsWarningThreshold: 500, + rateInterval: '5m', // Opt-in for multi-cluster support. showMultiCluster: false, From 6bdc1d9c98f237b6c165b5365c96a200fe3c667a Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Thu, 6 Apr 2023 00:56:00 +0800 Subject: [PATCH 16/19] Add thresholds for memory, disk and system alerts Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 22 +++++++++++----------- docs/node-mixin/config.libsonnet | 11 ++++++++++- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index a51e6f2cab..68455e44d3 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -327,7 +327,7 @@ alert: 'NodeSystemSaturation', expr: ||| node_load1{%(nodeExporterSelector)s} - / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > 2 + / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d ||| % $._config, 'for': '15m', labels: { @@ -336,15 +336,15 @@ annotations: { summary: 'System saturated, load per core is very high.', description: ||| - System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. This might indicate this instance resources saturation and can cause it becoming unresponsive. 
- |||, + ||| % $._config, }, }, { alert: 'NodeMemoryMajorPagesFaults', expr: ||| - rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsWarningThreshold)s + rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d ||| % $._config, 'for': '15m', labels: { @@ -353,7 +353,7 @@ annotations: { summary: 'Memory major page faults are occurring at very high rate.', description: ||| - Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsWarningThreshold)s major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. Please check that there is enough memory available at this instance. ||| % $._config, }, @@ -361,7 +361,7 @@ { alert: 'NodeMemoryHighUtilization', expr: ||| - 100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)s + 100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)d ||| % $._config, 'for': '15m', labels: { @@ -370,14 +370,14 @@ annotations: { summary: 'Host is running out of memory.', description: ||| - Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)s% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - |||, + Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. 
+ ||| % $._config, }, }, { alert: 'NodeDiskIOSaturation', expr: ||| - rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > 10 + rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d ||| % $._config, 'for': '30m', labels: { @@ -386,9 +386,9 @@ annotations: { summary: 'Disk IO queue is high.', description: ||| - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. This symptom might indicate disk saturation. - |||, + ||| % $._config, }, }, { diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 0e32ac158d..49ca6ff868 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -43,6 +43,11 @@ // just a warning for K8s nodes. nodeCriticalSeverity: 'critical', + + // Load average 1m (per core) on which to trigger the + // 'NodeSystemSaturation' alert. + systemSaturationPerCoreThreshold: 2, + // Available disk space (%) thresholds on which to trigger the // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk // usage grows in a way that it is predicted to run out in 4h or 1d @@ -66,7 +71,11 @@ // Threshold for the rate of memory major page faults to trigger // 'NodeMemoryMajorPagesFaults' alert. - memoryMajorPagesFaultsWarningThreshold: 500, + memoryMajorPagesFaultsThreshold: 500, + + // Disk IO queue level above which to trigger + // 'NodeDiskIOSaturation' alert. + diskIOSaturationThreshold: 10, rateInterval: '5m', // Opt-in for multi-cluster support. 
From b7dfb32bfc1e20bf8c7493427ac085d550589c7e Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Thu, 6 Apr 2023 02:30:53 +0800 Subject: [PATCH 17/19] Set severity to NodeCPUHighUsage to info Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 16 +++++++++------- docs/node-mixin/config.libsonnet | 4 +++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 68455e44d3..65794219d2 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -312,15 +312,17 @@ { alert: 'NodeCPUHighUsage', expr: ||| - sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) > 0.8 + sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d ||| % $._config, 'for': '15m', labels: { - severity: 'warning', + severity: 'info', }, annotations: { summary: 'High CPU usage.', - description: 'CPU usage at {{ $labels.instance }} has been above 80% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.', + description: ||| + CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. + ||| % $._config, }, }, { @@ -336,7 +338,7 @@ annotations: { summary: 'System saturated, load per core is very high.', description: ||| - System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. This might indicate this instance resources saturation and can cause it becoming unresponsive. 
||| % $._config, }, @@ -353,7 +355,7 @@ annotations: { summary: 'Memory major page faults are occurring at very high rate.', description: ||| - Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. Please check that there is enough memory available at this instance. ||| % $._config, }, @@ -370,7 +372,7 @@ annotations: { summary: 'Host is running out of memory.', description: ||| - Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. ||| % $._config, }, }, @@ -386,7 +388,7 @@ annotations: { summary: 'Disk IO queue is high.', description: ||| - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. This symptom might indicate disk saturation. ||| % $._config, }, diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 49ca6ff868..4427b59d14 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -43,7 +43,9 @@ // just a warning for K8s nodes. 
nodeCriticalSeverity: 'critical', - + // CPU utilization (%) on which to trigger the + // 'NodeCPUHighUsage' alert. + cpuHighUsageThreshold: 90, // Load average 1m (per core) on which to trigger the // 'NodeSystemSaturation' alert. systemSaturationPerCoreThreshold: 2, From 3e250a95a028ad4ed8c778148aac6d9ab89536ea Mon Sep 17 00:00:00 2001 From: Vitaly Date: Wed, 26 Apr 2023 22:52:40 +0800 Subject: [PATCH 18/19] Update NodeSystemSaturation severity Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 65794219d2..df4a3d14dd 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -333,7 +333,7 @@ ||| % $._config, 'for': '15m', labels: { - severity: 'critical', + severity: 'warning', }, annotations: { summary: 'System saturated, load per core is very high.', From e8d7f4e8b3ced919a16e3afb7056564833ba05ed Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Thu, 29 Jun 2023 23:24:03 +0800 Subject: [PATCH 19/19] Revert alerts pending durations Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/alerts/alerts.libsonnet | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index df4a3d14dd..1eaedd3d2e 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -53,7 +53,7 @@ node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) ||| % $._config, - 'for': '15m', + 'for': '30m', labels: { severity: 'warning', }, @@ -71,7 +71,7 @@ node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) ||| % $._config, - 'for': '15m', + 'for': '30m', labels: { severity: '%(nodeCriticalSeverity)s' % $._config, }, @@ -129,7 +129,7 @@
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) ||| % $._config, - 'for': '15m', + 'for': '1h', labels: { severity: 'warning', }, @@ -147,7 +147,7 @@ node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) ||| % $._config, - 'for': '15m', + 'for': '1h', labels: { severity: '%(nodeCriticalSeverity)s' % $._config, }, @@ -161,7 +161,7 @@ expr: ||| rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 ||| % $._config, - 'for': '15m', + 'for': '1h', labels: { severity: 'warning', }, @@ -175,7 +175,7 @@ expr: ||| rate(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 ||| % $._config, - 'for': '15m', + 'for': '1h', labels: { severity: 'warning', },