diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index f92d97dffe..1eaedd3d2e 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -21,7 +21,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of space within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', }, }, { @@ -41,7 +41,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of space within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', }, }, { @@ -59,7 +59,7 @@ }, annotations: { summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' % $._config, - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, { @@ -77,7 +77,7 @@ }, annotations: { summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config, - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, { @@ -97,7 +97,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', }, }, { @@ -117,7 +117,7 @@ }, annotations: { summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', }, }, { @@ -135,7 +135,7 @@ }, annotations: { summary: 'Filesystem has less than 5% inodes left.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', }, }, { @@ -153,13 +153,13 @@ }, annotations: { summary: 'Filesystem has less than 3% inodes left.', - description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', }, }, { alert: 'NodeNetworkReceiveErrs', expr: ||| - rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 ||| % $._config, 'for': '1h', labels: { @@ -173,7 +173,7 @@ { alert: 'NodeNetworkTransmitErrs', expr: ||| - rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + rate(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01 ||| % $._config, 'for': '1h', labels: { @@ -187,7 +187,7 @@ { alert: 'NodeHighNumberConntrackEntriesUsed', expr: ||| - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + (node_nf_conntrack_entries{%(nodeExporterSelector)s} / node_nf_conntrack_entries_limit) > 0.75 ||| % $._config, annotations: { summary: 'Number of conntrack are getting close to the limit.', @@ -204,7 +204,7 @@ ||| % $._config, annotations: { summary: 'Node Exporter text file collector failed to scrape.', - description: 'Node Exporter text file collector failed to scrape.', + description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.', }, labels: { severity: 'warning', @@ -231,7 +231,7 @@ }, annotations: { summary: 'Clock skew detected.', - description: 'Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.', + description: 'Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.', }, }, { @@ -247,7 +247,7 @@ }, annotations: { summary: 'Clock not synchronising.', - description: 'Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.', + description: 'Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.', }, }, { @@ -260,8 +260,8 @@ severity: 'critical', }, annotations: { - summary: 'RAID Array is degraded', - description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", + summary: 'RAID Array is degraded.', + description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", }, }, { @@ -273,8 +273,8 @@ severity: 'warning', }, annotations: { - summary: 'Failed device in RAID array', - description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", + summary: 'Failed device in RAID array.', + description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", }, }, { @@ -309,6 +309,104 @@ description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', }, }, + { + alert: 'NodeCPUHighUsage', + expr: ||| + sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d + ||| % $._config, + 'for': '15m', + labels: { + severity: 'info', + }, + annotations: { + summary: 'High CPU usage.', + description: ||| + CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. + ||| % $._config, + }, + }, + { + alert: 'NodeSystemSaturation', + expr: ||| + node_load1{%(nodeExporterSelector)s} + / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'System saturated, load per core is very high.', + description: ||| + System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + This might indicate this instance resources saturation and can cause it becoming unresponsive. + ||| % $._config, + }, + }, + { + alert: 'NodeMemoryMajorPagesFaults', + expr: ||| + rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Memory major page faults are occurring at very high rate.', + description: ||| + Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + Please check that there is enough memory available at this instance. + ||| % $._config, + }, + }, + { + alert: 'NodeMemoryHighUtilization', + expr: ||| + 100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)d + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Host is running out of memory.', + description: ||| + Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. + ||| % $._config, + }, + }, + { + alert: 'NodeDiskIOSaturation', + expr: ||| + rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d + ||| % $._config, + 'for': '30m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Disk IO queue is high.', + description: ||| + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + This symptom might indicate disk saturation. + ||| % $._config, + }, + }, + { + alert: 'NodeSystemdServiceFailed', + expr: ||| + node_systemd_unit_state{%(nodeExporterSelector)s, state="failed"} == 1 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Systemd service has entered failed state.', + description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', + }, + }, ], }, ], diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 86179c8f93..4427b59d14 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -43,6 +43,13 @@ // just a warning for K8s nodes. nodeCriticalSeverity: 'critical', + // CPU utilization (%) on which to trigger the + // 'NodeCPUHighUsage' alert. + cpuHighUsageThreshold: 90, + // Load average 1m (per core) on which to trigger the + // 'NodeSystemSaturation' alert. + systemSaturationPerCoreThreshold: 2, + // Available disk space (%) thresholds on which to trigger the // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk // usage grows in a way that it is predicted to run out in 4h or 1d @@ -60,6 +67,18 @@ fsSpaceAvailableWarningThreshold: 5, fsSpaceAvailableCriticalThreshold: 3, + // Memory utilzation (%) level on which to trigger the + // 'NodeMemoryHighUtilization' alert. + memoryHighUtilizationThreshold: 90, + + // Threshold for the rate of memory major page faults to trigger + // 'NodeMemoryMajorPagesFaults' alert. + memoryMajorPagesFaultsThreshold: 500, + + // Disk IO queue level above which to trigger + // 'NodeDiskIOSaturation' alert. + diskIOSaturationThreshold: 10, + rateInterval: '5m', // Opt-in for multi-cluster support. showMultiCluster: false,