-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4b111d9
commit 1cea1b6
Showing
4 changed files
with
365 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,290 @@ | ||
groups: | ||
- name: ethpillar.rules | ||
rules: | ||
# Fires when MemAvailable is below 10% of MemTotal for 2 minutes. | ||
# The trailing join on node_uname_info copies the nodename label onto the result. | ||
- alert: HostOutOfMemory | ||
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host out of memory (instance {{ $labels.instance }}) | ||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when the per-second rate of node_vmstat_pgmajfault (1m window) exceeds 1000 for 2 minutes. | ||
- alert: HostMemoryUnderMemoryPressure | ||
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host memory under memory pressure (instance {{ $labels.instance }}) | ||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires (severity info) when memory usage, computed as 100 minus the 30m average of MemAvailable | ||
# as a percentage of MemTotal, stays below 20 for 1 week. | ||
- alert: HostMemoryIsUnderutilized | ||
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 1w | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Host Memory is underutilized (instance {{ $labels.instance }}) | ||
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when the receive byte rate (2m window), summed per instance and divided by 1024 twice, | ||
# stays above 100 for 5 minutes. | ||
- alert: HostUnusualNetworkThroughputIn | ||
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 5m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host unusual network throughput in (instance {{ $labels.instance }}) | ||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when the transmit byte rate (2m window), summed per instance and divided by 1024 twice, | ||
# stays above 100 for 5 minutes. | ||
- alert: HostUnusualNetworkThroughputOut | ||
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 5m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host unusual network throughput out (instance {{ $labels.instance }}) | ||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when the disk read byte rate (2m window), summed per instance and divided by 1024 twice, | ||
# stays above 50 for 5 minutes. | ||
- alert: HostUnusualDiskReadRate | ||
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 5m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host unusual disk read rate (instance {{ $labels.instance }}) | ||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when the disk written byte rate (2m window), summed per instance and divided by 1024 twice, | ||
# stays above 50 for 2 minutes. | ||
# NOTE(review): the read-rate rule in this group waits 5m while this one waits 2m - confirm this is intended. | ||
- alert: HostUnusualDiskWriteRate | ||
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host unusual disk write rate (instance {{ $labels.instance }}) | ||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when a filesystem that is not read-only (node_filesystem_readonly == 0) has less than 10% | ||
# of its size available for 2 minutes. | ||
- alert: HostOutOfDiskSpace | ||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host out of disk space (instance {{ $labels.instance }}) | ||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when a filesystem that is not read-only is under 10% available AND a linear projection of the | ||
# last hour of available bytes (tmpfs excluded) drops below zero within 24 * 3600 seconds, for 2 minutes. | ||
- alert: HostDiskWillFillIn24Hours | ||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) | ||
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when a filesystem that is not read-only (fstype msdosfs excluded) has less than 10% of | ||
# node_filesystem_files still free for 2 minutes. | ||
- alert: HostOutOfInodes | ||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host out of inodes (instance {{ $labels.instance }}) | ||
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires (severity critical) when node_filesystem_device_error equals 1 for 2 minutes. | ||
# This expression has no node_uname_info join, so no nodename label is added to the alert. | ||
- alert: HostFilesystemDeviceError | ||
expr: 'node_filesystem_device_error == 1' | ||
for: 2m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Host filesystem device error (instance {{ $labels.instance }}) | ||
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when a filesystem that is not read-only (fstype msdosfs excluded) has under 10% of its files free | ||
# AND a linear projection of the last hour of free files drops below zero within 24 * 3600 seconds, for 2 minutes. | ||
- alert: HostInodesWillFillIn24Hours | ||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) | ||
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when read time per completed read (ratio of the two 1m rates) exceeds 0.1 for 2 minutes. | ||
# The second clause restricts the result to devices whose completed-read rate is above zero. | ||
- alert: HostUnusualDiskReadLatency | ||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host unusual disk read latency (instance {{ $labels.instance }}) | ||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when write time per completed write (ratio of the two 1m rates) exceeds 0.1 for 2 minutes. | ||
# The second clause restricts the result to devices whose completed-write rate is above zero. | ||
- alert: HostUnusualDiskWriteLatency | ||
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host unusual disk write latency (instance {{ $labels.instance }}) | ||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when non-idle CPU time (2m rate), averaged per mode and instance and then summed per instance, | ||
# stays above 0.8 for 10 minutes. | ||
- alert: HostHighCpuLoad | ||
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 10m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host high CPU load (instance {{ $labels.instance }}) | ||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires (severity info) when CPU usage, computed as 100 minus the idle percentage, stays below 20 for 1 week. | ||
# The idle rate is averaged per instance so the rule evaluates once per host; without the aggregation | ||
# it would evaluate per CPU series and a single idle core could fire it on an otherwise busy host. | ||
- alert: HostCpuIsUnderutilized | ||
expr: '(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[30m])) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 1w | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Host CPU is underutilized (instance {{ $labels.instance }}) | ||
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires immediately (for: 0m) when steal-mode CPU time (5m rate), averaged per instance and | ||
# multiplied by 100, exceeds 10. | ||
- alert: HostCpuStealNoisyNeighbor | ||
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) | ||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires immediately (for: 0m) when iowait-mode CPU time (5m rate), averaged per instance and | ||
# multiplied by 100, exceeds 10. | ||
- alert: HostCpuHighIowait | ||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host CPU high iowait (instance {{ $labels.instance }}) | ||
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when the 1m rate of node_disk_io_time_seconds_total stays above 0.5 for 5 minutes. | ||
- alert: HostUnusualDiskIo | ||
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 5m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host unusual disk IO (instance {{ $labels.instance }}) | ||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when used swap, computed as (1 - SwapFree / SwapTotal) * 100, stays above 80 for 2 minutes. | ||
- alert: HostSwapIsFillingUp | ||
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host swap is filling up (instance {{ $labels.instance }}) | ||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires immediately (for: 0m) when a node_systemd_unit_state series with state="failed" has value 1. | ||
- alert: HostSystemdServiceCrashed | ||
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host systemd service crashed (instance {{ $labels.instance }}) | ||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when node_hwmon_temp_celsius, joined to node_hwmon_sensor_label (label "tctl" excluded), | ||
# stays above 75 for 5 minutes. | ||
# NOTE(review): the inner group_left lists instance, job, node and sensor as labels to copy - confirm this list is intended. | ||
- alert: HostPhysicalComponentTooHot | ||
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 5m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host physical component too hot (instance {{ $labels.instance }}) | ||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires immediately (for: 0m, severity critical) when node_hwmon_temp_crit_alarm_celsius equals 1. | ||
- alert: HostNodeOvertemperatureAlarm | ||
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 0m | ||
labels: | ||
severity: critical | ||
annotations: | ||
summary: Host node overtemperature alarm (instance {{ $labels.instance }}) | ||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires immediately (for: 0m) when node_vmstat_oom_kill increased within the last 1m window. | ||
- alert: HostOomKillDetected | ||
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host OOM kill detected (instance {{ $labels.instance }}) | ||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires immediately (for: 0m, severity info) when node_edac_correctable_errors_total increased | ||
# within the last 1m window. The description states the same one-minute window the expression uses. | ||
- alert: HostEdacCorrectableErrorsDetected | ||
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 0m | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) | ||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires immediately (for: 0m) whenever node_edac_uncorrectable_errors_total is above 0. | ||
# The expression compares the raw counter with no time window, so $value is the cumulative total; | ||
# the description reports it as such rather than as a count over a recent interval. | ||
- alert: HostEdacUncorrectableErrorsDetected | ||
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 0m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) | ||
description: "Host {{ $labels.instance }} has a cumulative total of {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when the ratio of receive errors to received packets (both 2m rates) stays above 0.01 for 2 minutes. | ||
# $value is that ratio, not an error count, so the description renders it with humanizePercentage. | ||
- alert: HostNetworkReceiveErrors | ||
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host Network Receive Errors (instance {{ $labels.instance }}) | ||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has had {{ $value | humanizePercentage }} of its received packets in error over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when the ratio of transmit errors to transmitted packets (both 2m rates) stays above 0.01 for 2 minutes. | ||
# $value is that ratio, not an error count, so the description renders it with humanizePercentage. | ||
- alert: HostNetworkTransmitErrors | ||
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host Network Transmit Errors (instance {{ $labels.instance }}) | ||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has had {{ $value | humanizePercentage }} of its transmitted packets in error over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when (receive + transmit byte rate, 1m windows) divided by node_network_speed_bytes is above 0.8 | ||
# and below 10000 for 1 minute. Devices whose names start with tap, vnet, veth or tun are excluded. | ||
- alert: HostNetworkInterfaceSaturated | ||
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 1m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host Network Interface Saturated (instance {{ $labels.instance }}) | ||
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when node_timex_offset_seconds is above 0.05 with a non-negative 5m derivative, or below -0.05 | ||
# with a non-positive 5m derivative (the offset is not moving back toward zero), for 10 minutes. | ||
- alert: HostClockSkew | ||
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 10m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host clock skew (instance {{ $labels.instance }}) | ||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires when the minimum of node_timex_sync_status over the last minute is 0 and | ||
# node_timex_maxerror_seconds is at least 16, for 2 minutes. | ||
- alert: HostClockNotSynchronising | ||
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: Host clock not synchronising (instance {{ $labels.instance }}) | ||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" | ||
|
||
# Fires (severity info) when node_reboot_required has been above 0 for 4 hours. | ||
- alert: HostRequiresReboot | ||
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' | ||
for: 4h | ||
labels: | ||
severity: info | ||
annotations: | ||
summary: Host requires reboot (instance {{ $labels.instance }}) | ||
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.