Skip to content

Commit

Permalink
Add 499 Alert and exclude from 4xx
Browse files Browse the repository at this point in the history
  • Loading branch information
tn819 authored Jul 31, 2024
1 parent 4998b5b commit 3470c1a
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 4 deletions.
1 change: 1 addition & 0 deletions charts/generic-service/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ app:
| `alerting.http.referenceInterval` | `1w` | The time interval to compare with the sample interval to detect changes |
| `alerting.http.maxSlowdown` | `2.5` | The maximum HTTP response slowdown in the sample interval compared to the reference interval |
| `alerting.http.max4xxRatio` | `2.5` | The maximum increase of the HTTP 4xx ratio (except 499) in the sample interval compared to the reference interval |
| `alerting.http.maxClientCancellationRatio` | `2.5` | The maximum increase of the HTTP client cancellation (499) ratio in the sample interval compared to the reference interval |
| `alerting.http.max5xxCount` | `0` | The maximum number of HTTP 5xx responses (except 504) in the sample interval |
| `alerting.http.maxTimeoutCount` | `0` | The maximum number of HTTP gateway timeout responses (504) in the sample interval |
| `alerting.grpc.requestsMetric` | `grpc_server_handled_total` | The name of the Prometheus metric counting gRPC requests |
Expand Down
13 changes: 9 additions & 4 deletions charts/generic-service/templates/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -165,15 +165,20 @@ spec:
topic: ingress
annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} HTTP responses slower than usual
description: '{{ include "generic-service.fullname" . }} HTTP responses are {{"{{ $value }}"}}x slower in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.'

# Http4xx: warns when the share of HTTP 4xx responses per request in the sample
# interval is more than max4xxRatio times the share in the reference interval.
# Client cancellations (499) are subtracted so they do not count as 4xx here.
#
# - Every code selector is closed with `}` after the quoted code pattern; the
#   helper only opens the selector, as the `"4.."}` usage shows.
# - The 499 term is wrapped in `or vector(0)`: subtracting an empty vector
#   yields no data, which would silence this alert for services that never
#   report a 499.
# - Both intervals are divided by the total request rate so that a change in
#   traffic volume alone does not fire the alert ("per request" below).
# - NOTE(review): `description` is assumed to be a sibling of the keys rendered
#   by the annotations helper, hence column 12 to match `nindent 12` - confirm
#   against the helper definitions.
- alert: Http4xx
  expr: |
    ((sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.sampleInterval }}])) - (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"}[{{ .Values.alerting.http.sampleInterval }}])) or vector(0))) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.sampleInterval }}]))) /
    ((sum(rate({{ include "generic-service.request-code-count-metric" . }}"4.."}[{{ .Values.alerting.http.referenceInterval }}])) - (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"}[{{ .Values.alerting.http.referenceInterval }}])) or vector(0))) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.referenceInterval }}])))
    > {{ .Values.alerting.http.max4xxRatio }}
  labels: {{- include "generic-service.alert-labels" . | nindent 12 }} warning
  annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP 4xx rate
            description: '{{ include "generic-service.fullname" . }} gave {{"{{ $value }}"}}x more HTTP 4xx responses per request in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.'
# HttpClientCancelled: warns when the share of requests cancelled by the client
# (HTTP 499) in the sample interval is more than maxClientCancellationRatio
# times the share in the reference interval.
#
# - The `"499"` selectors are closed with `}`; the helper only opens the
#   selector, so leaving it out produces an unparsable expression.
# - Parentheses are balanced (the previous form had one surplus `)`).
# - Both intervals are divided by the total request rate, so the value is a
#   ratio of percentages as the description states, not of absolute rates.
# - If the reference interval contains no 499 responses the expression returns
#   no data and the alert stays silent.
# - NOTE(review): `description` is assumed to be a sibling of the keys rendered
#   by the annotations helper, hence column 12 to match `nindent 12` - confirm
#   against the helper definitions.
- alert: HttpClientCancelled
  expr: |
    (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"}[{{ .Values.alerting.http.sampleInterval }}])) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.sampleInterval }}]))) /
    (sum(rate({{ include "generic-service.request-code-count-metric" . }}"499"}[{{ .Values.alerting.http.referenceInterval }}])) / sum(rate({{ include "generic-service.request-count-metric" . }}[{{ .Values.alerting.http.referenceInterval }}])))
    > {{ .Values.alerting.http.maxClientCancellationRatio }}
  labels: {{- include "generic-service.alert-labels" . | nindent 12 }} warning
  annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} higher HTTP client cancellation rate
            description: '{{ include "generic-service.fullname" . }} gave a {{"{{ $value }}"}}x higher percentage of HTTP request cancelled by the client in the last {{ .Values.alerting.http.sampleInterval }} than in the last {{ .Values.alerting.http.referenceInterval }}.'
{{- end }}

- alert: Http5xx
Expand Down
5 changes: 5 additions & 0 deletions charts/generic-service/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,11 @@
"default": 2.5,
"description": "The maximum increase of the HTTP 4xx ratio (except 499) in the sample interval compared to the reference interval"
},
"maxClientCancellationRatio": {
"type": "number",
"default": 2.5,
"description": "The maximum increase of the HTTP client cancellation (499) ratio in the sample interval compared to the reference interval"
},
"max5xxCount": {
"type": "number",
"default": 0,
Expand Down
1 change: 1 addition & 0 deletions charts/generic-service/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ alerting:
referenceInterval: 1w
maxSlowdown: 2.5
max4xxRatio: 2.5
maxClientCancellationRatio: 2.5
max5xxCount: 0
maxTimeoutCount: 0
grpc:
Expand Down

0 comments on commit 3470c1a

Please sign in to comment.