Skip to content

Commit

Permalink
Improved alerting rules
Browse files Browse the repository at this point in the history
  • Loading branch information
bastianeicher committed Jul 14, 2021
1 parent dedce7d commit 104a402
Showing 1 changed file with 32 additions and 45 deletions.
77 changes: 32 additions & 45 deletions charts/generic-service/templates/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,63 +11,54 @@ spec:
groups:
- name: {{ include "generic-service.fullname" . }}.rules
rules:
- alert: ReplicasUnavailable
expr: kube_deployment_status_replicas_unavailable{namespace="{{ .Release.Namespace }}", deployment="{{ include "generic-service.fullname" . }}"} > 0
for: 10m
- alert: Down
expr: kube_deployment_status_replicas_available{namespace="{{ .Release.Namespace }}", deployment="{{ include "generic-service.fullname" . }}"} == 0
for: '{{ if .Values.ingress.enabled }}1m{{ else }}5m{{ end }}'
labels:
component: {{ include "generic-service.fullname" . }}
severity: warning
{{- if .Values.alerting.labels }}
{{ toYaml .Values.alerting.labels | indent 12 }}
{{- end }}
severity: critical
{{- if .Values.alerting.labels }}{{ toYaml .Values.alerting.labels | nindent 12 }}{{ end }}
annotations:
summary: Some {{ include "generic-service.fullname" . }} pods are unavailable
description: Some {{ include "generic-service.fullname" . }} pods were unavailable for the last 10 minutes
summary: '{{ include "generic-service.fullname" . }} is down.'
description: 'All replicas for {{ include "generic-service.fullname" . }} are down.'

{{- if .Values.ingress.enabled }}
- alert: ServiceUnavailable
expr: kube_deployment_status_replicas_available{namespace="{{ .Release.Namespace }}", deployment="{{ include "generic-service.fullname" . }}"} == 0
for: 1m
{{- if gt .Values.replicas 1.0 }}
- alert: ReplicasDown
expr: kube_deployment_status_replicas_unavailable{namespace="{{ .Release.Namespace }}", deployment="{{ include "generic-service.fullname" . }}"} > 0
for: 10m
labels:
component: {{ include "generic-service.fullname" . }}
severity: critical
{{- if .Values.alerting.labels }}
{{ toYaml .Values.alerting.labels | indent 12 }}
{{- end }}
severity: warning
{{- if .Values.alerting.labels }}{{ toYaml .Values.alerting.labels | nindent 12 }}{{ end }}
annotations:
summary: Service {{ include "generic-service.fullname" . }} unavailable
description: The service {{ include "generic-service.fullname" . }} was unavailable for the last 1 minute
{{- end }}
summary: '{{ include "generic-service.fullname" . }} replicas are down.'
description: '{{"{{$value}}"}} replicas for {{ include "generic-service.fullname" . }} are down.'
{{- end }}

- alert: HighMemoryUsage
expr: >
max(container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container_name="{{ include "generic-service.fullname" . }}"}) /
min(kube_pod_container_resource_limits_memory_bytes{namespace="{{ .Release.Namespace }}", container="{{ include "generic-service.fullname" . }}"}) >
{{ .Values.alerting.memoryUsage.thresholdFactor }}
for: 5m
max(container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container="{{ include "generic-service.fullname" . }}"}) /
min(kube_pod_container_resource_limits{namespace="{{ .Release.Namespace }}", container="{{ include "generic-service.fullname" . }}", resource="memory"})
* 100 > {{ .Values.alerting.memoryUsage.thresholdFactor }} * 100
labels:
component: {{ include "generic-service.fullname" . }}
severity: warning
{{- if .Values.alerting.labels }}
{{ toYaml .Values.alerting.labels | indent 12 }}
{{- end }}
{{- if .Values.alerting.labels }}{{ toYaml .Values.alerting.labels | nindent 12 }}{{ end }}
annotations:
summary: {{ include "generic-service.fullname" . }} memory usage is high
description: {{ include "generic-service.fullname" . }} is using more than {{ .Values.alerting.memoryUsage.thresholdFactor }} of its available memory
summary: '{{ include "generic-service.fullname" . }} memory usage is high.'
description: '{{ include "generic-service.fullname" . }} is using {{"{{$value}}"}}% of its available memory.'

{{ if .Values.ingress.istio.enabled }}
- alert: Http5xxOccurred
- alert: Http5xxResponses
expr: >
sum(increase(istio_requests_total{ {{ include "generic-service.istio-filter" . }}, response_code=~"5.." }[1m])) > 0
labels:
component: {{ include "generic-service.fullname" . }}
severity: warning
{{- if .Values.alerting.labels }}
{{ toYaml .Values.alerting.labels | indent 12 }}
{{- end }}
{{- if .Values.alerting.labels }}{{ toYaml .Values.alerting.labels | nindent 12 }}{{ end }}
annotations:
summary: {{ include "generic-service.fullname" . }} HTTP 5xx responses occurred
description: {{ include "generic-service.fullname" . }} HTTP 5xx responses occurred in the last minute
summary: '{{ include "generic-service.fullname" . }} gave HTTP 5xx responses.'
description: '{{ include "generic-service.fullname" . }} gave {{"{{$value}}"}} HTTP 5xx responses in the last minute.'

- alert: HighHttp4xxRatio
expr: >
Expand All @@ -78,12 +69,10 @@ spec:
labels:
component: {{ include "generic-service.fullname" . }}
severity: info
{{- if .Values.alerting.labels }}
{{ toYaml .Values.alerting.labels | indent 12 }}
{{- end }}
{{- if .Values.alerting.labels }}{{ toYaml .Values.alerting.labels | nindent 12 }}{{ end }}
annotations:
summary: Ratio of {{ include "generic-service.fullname" . }} HTTP 4xx responses is high
description: Ratio of {{ include "generic-service.fullname" . }} HTTP responses with 4xx status codes in the last {{ .Values.alerting.http4xxRatio.sampleInterval }} is higher than in the last {{ .Values.alerting.http4xxRatio.referenceInterval }}
summary: '{{ include "generic-service.fullname" . }} gave more HTTP 4xx responses than usual.'
description: '{{ include "generic-service.fullname" . }} gave {{"{{$value}}"}}x more HTTP 4xx responses in the last {{ .Values.alerting.http4xxRatio.sampleInterval }} than in the last {{ .Values.alerting.http4xxRatio.referenceInterval }}.'

- alert: SlowResponseTime
expr: >
Expand All @@ -94,12 +83,10 @@ spec:
labels:
component: {{ include "generic-service.fullname" . }}
severity: info
{{- if .Values.alerting.labels }}
{{ toYaml .Values.alerting.labels | indent 12 }}
{{- end }}
{{- if .Values.alerting.labels }}{{ toYaml .Values.alerting.labels | nindent 12 }}{{ end }}
annotations:
summary: Slow {{ include "generic-service.fullname" . }} HTTP repsonses
description: {{ include "generic-service.fullname" . }} HTTP responses in the last {{ .Values.alerting.responseTime.sampleInterval }} were slower than in the last {{ .Values.alerting.responseTime.referenceInterval }}
summary: '{{ include "generic-service.fullname" . }} HTTP responses are slower than usual.'
description: '{{ include "generic-service.fullname" . }} HTTP responses are {{"{{$value}}"}}x slower in the last {{ .Values.alerting.responseTime.sampleInterval }} than in the last {{ .Values.alerting.responseTime.referenceInterval }}.'
{{- end }}

{{- end }}

0 comments on commit 104a402

Please sign in to comment.