From 1c2d3b7583d149164f507c065decbf468f0de128 Mon Sep 17 00:00:00 2001 From: Lukas Kolletzki Date: Fri, 21 Jun 2024 19:22:24 +0200 Subject: [PATCH] Add separate alert for HTTP gateway timeout responses --- charts/generic-service/README.md | 3 ++- charts/generic-service/templates/alerts.yaml | 11 ++++++++++- charts/generic-service/values.schema.json | 7 ++++++- charts/generic-service/values.yaml | 1 + 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/charts/generic-service/README.md b/charts/generic-service/README.md index 87da826..5b63840 100644 --- a/charts/generic-service/README.md +++ b/charts/generic-service/README.md @@ -157,7 +157,8 @@ app: | `alerting.http.referenceInterval` | `1w` | The time interval to to compare with the sample interval to detect changes | | `alerting.http.maxSlowdown` | `2.5` | The maximum HTTP response slowdown in the sample interval compared to the reference interval | | `alerting.http.max4xxRatio` | `2.5` | The maximum HTTP 4xx ratio increase in the sample interval compared to the reference interval | -| `alerting.http.max5xxCount` | `0` | The maximum number of HTTP 5xx responses in the sample interval | +| `alerting.http.max5xxCount` | `0` | The maximum number of HTTP 5xx responses (except 504) in the sample interval | +| `alerting.http.maxTimeoutCount` | `0` | The maximum number of HTTP gateway timeout responses (504) in the sample interval | | `alerting.grpc.requestsMetric` | `grpc_server_handled_total` | The name of the Prometheus metric counting gRPC requests | | `alerting.grpc.sampleInterval` | `20m` | The time interval in which to measure gRPC responses | | `alerting.grpc.referenceInterval` | `1w` | The time interval to to compare with the sample interval to detect changes | diff --git a/charts/generic-service/templates/alerts.yaml b/charts/generic-service/templates/alerts.yaml index 2ae2fc2..412dba2 100644 --- a/charts/generic-service/templates/alerts.yaml +++ b/charts/generic-service/templates/alerts.yaml @@ 
-178,12 +178,21 @@ spec: - alert: Http5xx expr: | - sum(round(increase({{ include "generic-service.request-code-count-metric" . }}"5.."}[{{ .Values.alerting.http.sampleInterval }}]))) + sum(round(increase({{ include "generic-service.request-code-count-metric" . }}"5(0[^4]|[^0].)"}[{{ .Values.alerting.http.sampleInterval }}]))) > {{ .Values.alerting.http.max5xxCount }} labels: {{- include "generic-service.alert-labels" . | nindent 12 }} critical topic: ingress annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} HTTP 5xx responses description: '{{ include "generic-service.fullname" . }} gave {{"{{ $value }}"}} HTTP 5xx responses in the last {{ .Values.alerting.http.sampleInterval }}.' + + - alert: HttpTimeout + expr: | + sum(round(increase({{ include "generic-service.request-code-count-metric" . }}"504"}[{{ .Values.alerting.http.sampleInterval }}]))) + > {{ .Values.alerting.http.maxTimeoutCount }} + labels: {{- include "generic-service.alert-labels" . | nindent 12 }} critical + topic: ingress + annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} HTTP gateway timeout responses + description: '{{ include "generic-service.fullname" . }} gave {{"{{ $value }}"}} HTTP gateway timeout responses in the last {{ .Values.alerting.http.sampleInterval }}.' 
{{- end }} {{- if or (eq .Values.ingress.protocol "grpc") (eq .Values.ingress.protocol "grpcs") }} diff --git a/charts/generic-service/values.schema.json b/charts/generic-service/values.schema.json index e365b16..bef951c 100644 --- a/charts/generic-service/values.schema.json +++ b/charts/generic-service/values.schema.json @@ -889,7 +889,12 @@ "max5xxCount": { "type": "number", "default": 0, - "description": "The maximum number of HTTP 5xx responses in the sample interval" + "description": "The maximum number of HTTP 5xx responses (except 504) in the sample interval" + }, + "maxTimeoutCount": { + "type": "number", + "default": 0, + "description": "The maximum number of HTTP gateway timeout responses (504) in the sample interval" } }, "additionalProperties": false diff --git a/charts/generic-service/values.yaml b/charts/generic-service/values.yaml index 6a93f77..ebed30b 100644 --- a/charts/generic-service/values.yaml +++ b/charts/generic-service/values.yaml @@ -173,6 +173,7 @@ alerting: maxSlowdown: 2.5 max4xxRatio: 2.5 max5xxCount: 0 + maxTimeoutCount: 0 grpc: requestsMetric: grpc_server_handled_total sampleInterval: 20m