diff --git a/charts/generic-service/README.md b/charts/generic-service/README.md index def47a2..c5bb2e0 100644 --- a/charts/generic-service/README.md +++ b/charts/generic-service/README.md @@ -157,6 +157,7 @@ app: | `alerting.http.referenceInterval` | `1w` | The time interval to compare with the sample interval to detect changes | | `alerting.http.maxSlowdown` | `2.5` | The maximum HTTP response slowdown in the sample interval compared to the reference interval | | `alerting.http.max4xxRatio` | `2.5` | The maximum HTTP 4xx ratio increase in the sample interval compared to the reference interval | +| `alerting.http.maxClientCancellationRatio` | `2.5` | The maximum client cancellation ratio increase in the sample interval compared to the reference interval | | `alerting.http.max5xxCount` | `0` | The maximum number of HTTP 5xx responses (except 504) in the sample interval | | `alerting.http.maxTimeoutCount` | `0` | The maximum number of HTTP gateway timeout responses (504) in the sample interval | | `alerting.grpc.requestsMetric` | `grpc_server_handled_total` | The name of the Prometheus metric counting gRPC requests | diff --git a/charts/generic-service/templates/alerts.yaml b/charts/generic-service/templates/alerts.yaml index e694bca..72b24d7 100644 --- a/charts/generic-service/templates/alerts.yaml +++ b/charts/generic-service/templates/alerts.yaml @@ -165,15 +165,20 @@ spec: topic: ingress annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} HTTP responses slower than usual description: '{{ include "generic-service.fullname" . }} HTTP responses are {{"{{ $value }}"}}x slower in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' - - alert: Http4xx expr: | - (sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.sampleInterval }}])) / sum(rate({{ include "generic-service.request-count-metric" . 
}}[{{ .Values.alerting.http.sampleInterval }}]))) / - (sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.referenceInterval }}])) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.referenceInterval }}]))) - > {{ .Values.alerting.http.max4xxRatio }} + ((sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.sampleInterval }}])) - sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"}[{{ .Values.alerting.http.sampleInterval }}]))) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.sampleInterval }}]))) / + ((sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.referenceInterval }}])) - sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"}[{{ .Values.alerting.http.referenceInterval }}]))) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.referenceInterval }}]))) > {{ .Values.alerting.http.max4xxRatio }} labels: {{- include "generic-service.alert-labels" . | nindent 12 }} warning annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP 4xx rate description: '{{ include "generic-service.fullname" . }} gave {{"{{ $value }}"}}x more HTTP 4xx responses per request in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' + - alert: HttpClientCancelled + expr: | + (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"}[{{ .Values.alerting.http.sampleInterval }}])) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.sampleInterval }}]))) / + (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"}[{{ .Values.alerting.http.referenceInterval }}])) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.referenceInterval }}]))) > {{ .Values.alerting.http.maxClientCancellationRatio }} + labels: {{- include "generic-service.alert-labels" . | nindent 12 }} warning + annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP client cancellation rate + description: '{{ include "generic-service.fullname" . 
}} gave a {{"{{ $value }}"}}x higher percentage of HTTP requests cancelled by the client in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.' {{- end }} - alert: Http5xx diff --git a/charts/generic-service/values.schema.json b/charts/generic-service/values.schema.json index f10b3ad..27390bb 100644 --- a/charts/generic-service/values.schema.json +++ b/charts/generic-service/values.schema.json @@ -886,6 +886,11 @@ "default": 2.5, "description": "The maximum HTTP 4xx ratio increase in the sample interval compared to the reference interval" }, + "maxClientCancellationRatio": { + "type": "number", + "default": 2.5, + "description": "The maximum client cancellation ratio increase in the sample interval compared to the reference interval" + }, "max5xxCount": { "type": "number", "default": 0, diff --git a/charts/generic-service/values.yaml b/charts/generic-service/values.yaml index 3d52b02..3c2b733 100644 --- a/charts/generic-service/values.yaml +++ b/charts/generic-service/values.yaml @@ -172,6 +172,7 @@ alerting: referenceInterval: 1w maxSlowdown: 2.5 max4xxRatio: 2.5 + maxClientCancellationRatio: 2.5 max5xxCount: 0 maxTimeoutCount: 0 grpc: