From 51b8f371364e8ea8d35c2f7ebe53525dbef6281c Mon Sep 17 00:00:00 2001 From: Thomas <12561498+tn819@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:34:33 +0000 Subject: [PATCH 1/8] adjust to include a 499 error and remove 499 from 4xx error --- charts/generic-service/templates/alerts.yaml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/charts/generic-service/templates/alerts.yaml b/charts/generic-service/templates/alerts.yaml index e694bca..fb23b28 100644 --- a/charts/generic-service/templates/alerts.yaml +++ b/charts/generic-service/templates/alerts.yaml @@ -165,15 +165,20 @@ spec: topic: ingress annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} HTTP responses slower than usual description: '{{ include "generic-service.fullname" . }} HTTP responses are {{"{{ $value }}"}}x slower in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' - - alert: Http4xx expr: | - (sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.sampleInterval }}])) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.sampleInterval }}]))) / - (sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.referenceInterval }}])) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.referenceInterval }}]))) - > {{ .Values.alerting.http.max4xxRatio }} + (sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.sampleInterval }}])) - sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.sampleInterval }}]))) / + (sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.referenceInterval }}])) - sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.referenceInterval }}])))) > {{ .Values.alerting.http.max4xxRatio }} labels: {{- include "generic-service.alert-labels" . | nindent 12 }} warning annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP 4xx rate description: '{{ include "generic-service.fullname" . }} gave {{"{{ $value }}"}}x more HTTP 4xx responses per request in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' + - alert: ClientConnectionClosed499 + expr: | + (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.sampleInterval }}]))) / + (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.referenceInterval }}])))) > {{ .Values.alerting.http.max4xxRatio }} + labels: {{- include "generic-service.alert-labels" . | nindent 12 }} warning + annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP 499 rate + description: '{{ include "generic-service.fullname" . }} gave {{"{{ $value }}"}}x more HTTP 499 responses per request in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' {{- end }} - alert: Http5xx From 0a5f742eae382c034b8b5e9b4d7bdfc89d8479d9 Mon Sep 17 00:00:00 2001 From: Thomas <12561498+tn819@users.noreply.github.com> Date: Wed, 24 Jul 2024 09:27:49 +0200 Subject: [PATCH 2/8] Update charts/generic-service/templates/alerts.yaml Co-authored-by: Bastian Eicher --- charts/generic-service/templates/alerts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/generic-service/templates/alerts.yaml b/charts/generic-service/templates/alerts.yaml index fb23b28..0cd1e38 100644 --- a/charts/generic-service/templates/alerts.yaml +++ b/charts/generic-service/templates/alerts.yaml @@ -172,7 +172,7 @@ spec: labels: {{- include "generic-service.alert-labels" . | nindent 12 }} warning annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP 4xx rate description: '{{ include "generic-service.fullname" . }} gave {{"{{ $value }}"}}x more HTTP 4xx responses per request in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' - - alert: ClientConnectionClosed499 + - alert: HttpClientCancelled expr: | (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.sampleInterval }}]))) / (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.referenceInterval }}])))) > {{ .Values.alerting.http.max4xxRatio }} From 0df2cdf4b6e950e2f6139e909f2114454e32ab88 Mon Sep 17 00:00:00 2001 From: Thomas <12561498+tn819@users.noreply.github.com> Date: Wed, 24 Jul 2024 09:28:17 +0200 Subject: [PATCH 3/8] Update charts/generic-service/templates/alerts.yaml Co-authored-by: Bastian Eicher --- charts/generic-service/templates/alerts.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/generic-service/templates/alerts.yaml b/charts/generic-service/templates/alerts.yaml index 0cd1e38..09858c7 100644 --- a/charts/generic-service/templates/alerts.yaml +++ b/charts/generic-service/templates/alerts.yaml @@ -177,8 +177,8 @@ spec: (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.sampleInterval }}]))) / (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.referenceInterval }}])))) > {{ .Values.alerting.http.max4xxRatio }} labels: {{- include "generic-service.alert-labels" . | nindent 12 }} warning - annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP 499 rate - description: '{{ include "generic-service.fullname" . }} gave {{"{{ $value }}"}}x more HTTP 499 responses per request in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' + annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP client cancellation rate + description: '{{ include "generic-service.fullname" . }} gave a {{"{{ $value }}"}}x higher percentage of HTTP request cancelled by the client in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' {{- end }} - alert: Http5xx From c841478ce0b03767b3800624eab87fd80026f951 Mon Sep 17 00:00:00 2001 From: Thomas <12561498+tn819@users.noreply.github.com> Date: Wed, 24 Jul 2024 07:37:00 +0000 Subject: [PATCH 4/8] introduce maxClientCancellationRatio --- charts/generic-service/README.md | 4 +++- charts/generic-service/templates/alerts.yaml | 2 +- charts/generic-service/values.schema.json | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/charts/generic-service/README.md b/charts/generic-service/README.md index def47a2..a972b47 100644 --- a/charts/generic-service/README.md +++ b/charts/generic-service/README.md @@ -156,7 +156,9 @@ app: | `alerting.http.sampleInterval` | `20m` | The time interval in which to measure HTTP responses for triggering alerts | | `alerting.http.referenceInterval` | `1w` | The time interval to compare with the sample interval to detect changes | | `alerting.http.maxSlowdown` | `2.5` | The maximum HTTP response slowdown in the sample interval compared to the reference interval | -| `alerting.http.max4xxRatio` | `2.5` | The maximum HTTP 4xx ratio increase in the sample interval compared to the reference interval | +| `alerting.http.max4xxRatio` | `2.5` | The maximum HTTP 4xx ratio increase (except 499) in the sample interval compared to the reference interval | +| `alerting.http.maxClientCancellationRatio` | `0` | The maximum client cancellation (HTTP 499) ratio increase in the sample interval compared to the reference interval | +| `alerting.http.maxTimeoutCount` | `0` | The maximum number of HTTP gateway timeout responses (504) in the sample interval | `alerting.http.max5xxCount` | `0` | The maximum number of HTTP 5xx responses (except 504) in the sample interval | | `alerting.http.maxTimeoutCount` | `0` | The maximum number of HTTP gateway timeout responses (504) in the sample interval | | `alerting.grpc.requestsMetric` | `grpc_server_handled_total` | The name of the Prometheus metric counting gRPC requests | diff --git a/charts/generic-service/templates/alerts.yaml b/charts/generic-service/templates/alerts.yaml index 09858c7..72b24d7 100644 --- a/charts/generic-service/templates/alerts.yaml +++ b/charts/generic-service/templates/alerts.yaml @@ -175,7 +175,7 @@ spec: - alert: HttpClientCancelled expr: | (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.sampleInterval }}]))) / - (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.referenceInterval }}])))) > {{ .Values.alerting.http.max4xxRatio }} + (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"[{{ .Values.alerting.http.referenceInterval }}])))) > {{ .Values.alerting.http.maxClientCancellationRatio }} labels: {{- include "generic-service.alert-labels" . | nindent 12 }} warning annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP client cancellation rate description: '{{ include "generic-service.fullname" . }} gave a {{"{{ $value }}"}}x higher percentage of HTTP request cancelled by the client in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' diff --git a/charts/generic-service/values.schema.json b/charts/generic-service/values.schema.json index f10b3ad..0ad6070 100644 --- a/charts/generic-service/values.schema.json +++ b/charts/generic-service/values.schema.json @@ -886,6 +886,11 @@ "default": 2.5, "description": "The maximum HTTP 4xx ratio increase in the sample interval compared to the reference interval" }, + "maxClientCancellationRatio": { + "type": "number", + "default": 2.5, + "description": "The maximum client cancellation (HTTP 499) ratio increase in the sample interval compared to the reference interval" + }, "max5xxCount": { "type": "number", "default": 0, From 3977f6383dfd1730ba4a74a53ce8977edc106b69 Mon Sep 17 00:00:00 2001 From: Thomas <12561498+tn819@users.noreply.github.com> Date: Mon, 29 Jul 2024 09:45:37 +0200 Subject: [PATCH 5/8] Update charts/generic-service/README.md Co-authored-by: Bastian Eicher --- charts/generic-service/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/charts/generic-service/README.md b/charts/generic-service/README.md index a972b47..39799a4 100644 --- a/charts/generic-service/README.md +++ b/charts/generic-service/README.md @@ -158,7 +158,6 @@ app: | `alerting.http.maxSlowdown` | `2.5` | The maximum HTTP response slowdown in the sample interval compared to the reference interval | | `alerting.http.max4xxRatio` | `2.5` | The maximum HTTP 4xx ratio increase (except 499) in the sample interval compared to the reference interval | | `alerting.http.maxClientCancellationRatio` | `0` | The maximum client cancellation (HTTP 499) ratio increase in the sample interval compared to the reference interval | -| `alerting.http.maxTimeoutCount` | `0` | The maximum number of HTTP gateway timeout responses (504) in the sample interval | `alerting.http.max5xxCount` | `0` | The maximum number of HTTP 5xx responses (except 504) in the sample interval | | `alerting.http.maxTimeoutCount` | `0` | The maximum number of HTTP gateway timeout responses (504) in the sample interval | | `alerting.grpc.requestsMetric` | `grpc_server_handled_total` | The name of the Prometheus metric counting gRPC requests | From a82373eeb162a6ed7026aa01bcbd3fbbc54541e5 Mon Sep 17 00:00:00 2001 From: Thomas <12561498+tn819@users.noreply.github.com> Date: Mon, 29 Jul 2024 09:45:59 +0200 Subject: [PATCH 6/8] Update charts/generic-service/README.md Co-authored-by: Bastian Eicher --- charts/generic-service/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/generic-service/README.md b/charts/generic-service/README.md index 39799a4..c5bb2e0 100644 --- a/charts/generic-service/README.md +++ b/charts/generic-service/README.md @@ -156,8 +156,8 @@ app: | `alerting.http.sampleInterval` | `20m` | The time interval in which to measure HTTP responses for triggering alerts | | `alerting.http.referenceInterval` | `1w` | The time interval to compare with the sample interval to detect changes | | `alerting.http.maxSlowdown` | `2.5` | The maximum HTTP response slowdown in the sample interval compared to the reference interval | -| `alerting.http.max4xxRatio` | `2.5` | The maximum HTTP 4xx ratio increase (except 499) in the sample interval compared to the reference interval | -| `alerting.http.maxClientCancellationRatio` | `0` | The maximum client cancellation (HTTP 499) ratio increase in the sample interval compared to the reference interval | +| `alerting.http.max4xxRatio` | `2.5` | The maximum HTTP 4xx ratio increase in the sample interval compared to the reference interval | +| `alerting.http.maxClientCancellationRatio` | `2.5` | The maximum client cancellation ratio increase in the sample interval compared to the reference interval | | `alerting.http.max5xxCount` | `0` | The maximum number of HTTP 5xx responses (except 504) in the sample interval | | `alerting.http.maxTimeoutCount` | `0` | The maximum number of HTTP gateway timeout responses (504) in the sample interval | | `alerting.grpc.requestsMetric` | `grpc_server_handled_total` | The name of the Prometheus metric counting gRPC requests | From 385bd6aaa4164f9b4d94a70daffe10eae4383763 Mon Sep 17 00:00:00 2001 From: Thomas <12561498+tn819@users.noreply.github.com> Date: Mon, 29 Jul 2024 09:46:06 +0200 Subject: [PATCH 7/8] Update charts/generic-service/values.schema.json Co-authored-by: Bastian Eicher --- charts/generic-service/values.schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/generic-service/values.schema.json b/charts/generic-service/values.schema.json index 0ad6070..27390bb 100644 --- a/charts/generic-service/values.schema.json +++ b/charts/generic-service/values.schema.json @@ -889,7 +889,7 @@ "maxClientCancellationRatio": { "type": "number", "default": 2.5, - "description": "The maximum client cancellation (HTTP 499) ratio increase in the sample interval compared to the reference interval" + "description": "The maximum client cancellation ratio increase in the sample interval compared to the reference interval" }, "max5xxCount": { "type": "number", From c68095aafa1e6e5dce2812fd4f20c2c7e490704b Mon Sep 17 00:00:00 2001 From: Thomas <12561498+tn819@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:33:06 +0000 Subject: [PATCH 8/8] add to values.yaml --- charts/generic-service/values.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/charts/generic-service/values.yaml b/charts/generic-service/values.yaml index 3d52b02..3c2b733 100644 --- a/charts/generic-service/values.yaml +++ b/charts/generic-service/values.yaml @@ -172,6 +172,7 @@ alerting: referenceInterval: 1w maxSlowdown: 2.5 max4xxRatio: 2.5 + maxClientCancellationRatio: 2.5 max5xxCount: 0 maxTimeoutCount: 0 grpc: