diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index 9631acc35..6699e94b8 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -17,6 +17,7 @@ - [Disks](./topics/disks/disks.md) - [OS Disk](./topics/disks/os-disk.md) - [Data Disks](./topics/disks/data-disks.md) + - [Machine Health Checks](./topics/health-checking.md) - [Development](./developers/development.md) - [Releasing](./developers/releasing.md) - [Reference](./reference/reference.md) diff --git a/docs/src/topics/backups.md b/docs/src/topics/backups.md index 015cf560e..fa5f5a4cc 100644 --- a/docs/src/topics/backups.md +++ b/docs/src/topics/backups.md @@ -16,7 +16,7 @@ clusterctl generate cluster $CLUSTER_NAME \ --flavor etcd-backup-restore \ | kubectl apply -f - ``` -For more fine-grain control and to know more about etcd backups, refere [backups.md](../topics/etcd.md) +For more fine-grained control and to learn more about etcd backups, refer to [the backups section of the etcd page](../topics/etcd.md#etcd-backups) ## Object Storage diff --git a/docs/src/topics/etcd.md b/docs/src/topics/etcd.md index 788baf045..8b18662bc 100644 --- a/docs/src/topics/etcd.md +++ b/docs/src/topics/etcd.md @@ -23,8 +23,9 @@ Users can also enable SSE (Server-side encryption) by passing a SSE AES-256 Key [here](https://github.com/linode/cluster-api-provider-linode/blob/main/templates/addons/etcd-backup-restore/etcd-backup-restore.yaml) on the pod can be controlled during the provisioning process. 
-> [!WARNING] -> This is currently under development and will be available for use once the upstream [PR](https://github.com/gardener/etcd-backup-restore/pull/719) is merged and an official image is made available +```admonish warning +This is currently under development and will be available for use once the upstream [PR](https://github.com/gardener/etcd-backup-restore/pull/719) is merged and an official image is made available +``` For eg: ```sh diff --git a/docs/src/topics/health-checking.md b/docs/src/topics/health-checking.md new file mode 100644 index 000000000..9fcbe5810 --- /dev/null +++ b/docs/src/topics/health-checking.md @@ -0,0 +1,25 @@ +# Machine Health Checks + +CAPL supports auto-remediation of workload cluster Nodes considered to be unhealthy +via [`MachineHealthChecks`](https://cluster-api.sigs.k8s.io/tasks/automated-machine-management/healthchecking). + +## Enabling Machine Health Checks + +While it is possible to manually create and apply a `MachineHealthCheck` resource into the management cluster, +using the `self-healing` flavor is the quickest way to get started: +```sh +clusterctl generate cluster $CLUSTER_NAME \ + --kubernetes-version v1.29.1 \ + --infrastructure linode:0.0.0 \ + --flavor self-healing \ + | kubectl apply -f - +``` + +This flavor deploys a `MachineHealthCheck` for the workers and another `MachineHealthCheck` for the control plane +of the cluster. It also configures the remediation strategy of the kubeadm control plane to prevent unnecessary load +on the infrastructure provider. + +## Configuring Machine Health Checks + +Refer to the [Cluster API documentation](https://cluster-api.sigs.k8s.io/tasks/automated-machine-management/healthchecking) +for further information on configuring and using `MachineHealthChecks`. 
diff --git a/templates/addons/machine-health-check/kustomization.yaml b/templates/addons/machine-health-check/kustomization.yaml new file mode 100644 index 000000000..512714447 --- /dev/null +++ b/templates/addons/machine-health-check/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - machinehealthcheck.yaml diff --git a/templates/addons/machine-health-check/machinehealthcheck.yaml b/templates/addons/machine-health-check/machinehealthcheck.yaml new file mode 100644 index 000000000..759b2bd61 --- /dev/null +++ b/templates/addons/machine-health-check/machinehealthcheck.yaml @@ -0,0 +1,46 @@ +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachineHealthCheck +metadata: + name: ${CLUSTER_NAME}-node-unhealthy-5m +spec: + clusterName: ${CLUSTER_NAME} + # (Optional) maxUnhealthy prevents further remediation if the cluster is already partially unhealthy + maxUnhealthy: 40% + # (Optional) nodeStartupTimeout determines how long a MachineHealthCheck should wait for + # a Node to join the cluster, before considering a Machine unhealthy. + # Defaults to 10 minutes if not specified. + # Set to 0 to disable the node startup timeout. + # Disabling this timeout will prevent a Machine from being considered unhealthy when + # the Node it created has not yet registered with the cluster. This can be useful when + # Nodes take a long time to start up or when you only want condition based checks for + # Machine health. 
+ nodeStartupTimeout: 10m + # selector picks the Machines to check; unhealthyConditions lists the Node conditions to evaluate. If any condition is matched for the duration of its timeout, the Machine is considered unhealthy. + selector: + matchLabels: + cluster.x-k8s.io/deployment-name: ${CLUSTER_NAME}-md-0 + unhealthyConditions: + - type: Ready + status: Unknown + timeout: 300s + - type: Ready + status: "False" + timeout: 300s +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachineHealthCheck +metadata: + name: ${CLUSTER_NAME}-kcp-unhealthy-5m +spec: + clusterName: ${CLUSTER_NAME} + maxUnhealthy: 100% + selector: + matchLabels: + cluster.x-k8s.io/control-plane: "" + unhealthyConditions: + - type: Ready + status: Unknown + timeout: 300s + - type: Ready + status: "False" + timeout: 300s diff --git a/templates/flavors/self-healing/kustomization.yaml b/templates/flavors/self-healing/kustomization.yaml new file mode 100644 index 000000000..1235a1028 --- /dev/null +++ b/templates/flavors/self-healing/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ../default + - ../../addons/machine-health-check +patches: + - target: + group: controlplane.cluster.x-k8s.io + version: v1beta1 + kind: KubeadmControlPlane + patch: |- + apiVersion: controlplane.cluster.x-k8s.io/v1beta1 + kind: KubeadmControlPlane + metadata: + name: ${CLUSTER_NAME}-control-plane + spec: + remediationStrategy: + maxRetry: 5 + retryPeriod: 2m + minHealthyPeriod: 2h