Skip to content

Commit

Permalink
Improved memory usage alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
bastianeicher committed Jul 28, 2021
1 parent ce551ce commit 50301b3
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 7 deletions.
2 changes: 1 addition & 1 deletion charts/generic-service/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ app:
| `monitoring.interval` | `1m` | The interval at which monitoring data is scraped |
| `alerting.enabled` | `true` | Applies default alert rules like unavailable pods or memory use (if `monitoring.enabled` is `true`) |
| `alerting.labels` | `{}` | Labels to apply to generic alert rules in addition to `component` and `severity` |
| `alerting.memoryUsage.thresholdFactor` | `0.9` | The maximum factor (between `0` and `1`) of memory usage allowed before alerting |
| `alerting.memoryUsage.thresholdFactor` | `0.9` | The maximum usage factor of `resources.limits.memory` allowed before alerting (between `0` and `1`) |
| `alerting.http4xxRatio.sampleInterval` | `5m` | The time interval in which to measure the ratio of HTTP 4xx responses for the current state |
| `alerting.http4xxRatio.referenceInterval` | `1d` | The time interval in which to measure the ratio of HTTP 4xx responses as a reference for the normal state |
| `alerting.http4xxRatio.thresholdFactor` | `1.5` | The maximum factor between the current state and the normal state of HTTP 4xx response ratio allowed |
Expand Down
24 changes: 19 additions & 5 deletions charts/generic-service/templates/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,32 @@ spec:
description: '{{"{{ $value }}"}} replicas for {{ include "generic-service.fullname" . }} are down.'
{{- end }}

- alert: HighMemoryUsage
expr: >
# Alert: memory usage above the requested amount (severity: warning).
# The expression subtracts the lowest memory request from the highest
# working-set memory, both selected by the release namespace and by the
# container named after the chart's fullname. The rule fires once that
# difference has stayed above zero for 10 minutes.
# The description reports the excess via humanize1024 and appends the
# configured `resources.requests.memory` value for reference.
- alert: MemoryUsageAboveRequest
expr: |
max(container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container="{{ include "generic-service.fullname" . }}"}) -
min(kube_pod_container_resource_requests{namespace="{{ .Release.Namespace }}", container="{{ include "generic-service.fullname" . }}", resource="memory"})
> 0
for: 10m
labels:
component: {{ include "generic-service.fullname" . }}
severity: warning
{{- if .Values.alerting.labels }}{{ toYaml .Values.alerting.labels | nindent 12 }}{{ end }}
annotations:
summary: '{{ include "generic-service.fullname" . }} using more memory than requested.'
description: '{{ include "generic-service.fullname" . }} is using {{"{{ $value | humanize1024 }}"}}B more than its requested memory ({{ .Values.resources.requests.memory }}B).'

# Alert: memory usage close to the configured limit.
# The expression divides the highest working-set memory by the lowest memory
# limit, both selected by the release namespace and by the container named
# after the chart's fullname. The rule fires when that ratio exceeds
# `alerting.memoryUsage.thresholdFactor`. No `for:` duration is set here.
# NOTE(review): this span carries two `severity` values and two
# summary/description pairs. They look like the removed and added sides of a
# diff shown without +/- markers - confirm which lines are current.
- alert: MemoryUsageCloseToLimit
expr: |
max(container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container="{{ include "generic-service.fullname" . }}"}) /
min(kube_pod_container_resource_limits{namespace="{{ .Release.Namespace }}", container="{{ include "generic-service.fullname" . }}", resource="memory"})
> {{ .Values.alerting.memoryUsage.thresholdFactor }}
labels:
component: {{ include "generic-service.fullname" . }}
severity: warning
severity: critical
{{- if .Values.alerting.labels }}{{ toYaml .Values.alerting.labels | nindent 12 }}{{ end }}
annotations:
summary: '{{ include "generic-service.fullname" . }} memory usage is high.'
description: '{{ include "generic-service.fullname" . }} is using {{"{{ $value | humanizePercentage }}"}} of its available memory.'
summary: '{{ include "generic-service.fullname" . }} memory usage is close to its limit.'
description: '{{ include "generic-service.fullname" . }} is using {{"{{ $value | humanizePercentage }}"}} of its memory limit ({{ .Values.resources.limits.memory }}B).'

{{ if .Values.ingress.istio.enabled }}
- alert: Http5xxResponses
Expand Down
2 changes: 1 addition & 1 deletion charts/generic-service/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@
"thresholdFactor": {
"type": "number",
"default": "0.9",
"description": "The maximum factor (between 0 and 1) of memory usage allowed before alerting"
"description": "The maximum usage factor of resources.limits.memory allowed before alerting (between 0 and 1)"
}
}
},
Expand Down

0 comments on commit 50301b3

Please sign in to comment.