diff --git a/CHANGELOG.md b/CHANGELOG.md index a571c84..d229409 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- CAPA: use lower heartbeat timeout to allow spot instances to terminate more quickly + ## [1.27.3] - 2024-11-13 ### Changed diff --git a/pkg/clusterbuilder/providers/capa/values/cluster_values.yaml b/pkg/clusterbuilder/providers/capa/values/cluster_values.yaml index 80db645..e9617a8 100644 --- a/pkg/clusterbuilder/providers/capa/values/cluster_values.yaml +++ b/pkg/clusterbuilder/providers/capa/values/cluster_values.yaml @@ -19,6 +19,15 @@ global: maxSize: 5 minSize: 2 rootVolumeSizeGB: 25 + + # With spot instances, aws-node-termination-handler may not receive any ASG lifecycle hook events + # and we don't want to wait for the default 30 minutes of heartbeat timeout before instances + # terminate. That would fail the tests. This can be fixed once heartbeats are implemented + # (https://github.com/aws/aws-node-termination-handler/issues/493), since we could then reduce + # cluster-aws's defaults to a low value, e.g. `heartbeatTimeout: 5m` and `globalTimeout: 30m`. + awsNodeTerminationHandler: + heartbeatTimeoutSeconds: 100 + spotInstances: enabled: true maxPrice: 0.2960 diff --git a/pkg/clusterbuilder/providers/capa/values/private-cluster_values.yaml b/pkg/clusterbuilder/providers/capa/values/private-cluster_values.yaml index 1a274e8..d0f096e 100644 --- a/pkg/clusterbuilder/providers/capa/values/private-cluster_values.yaml +++ b/pkg/clusterbuilder/providers/capa/values/private-cluster_values.yaml @@ -52,6 +52,15 @@ global: maxSize: 5 minSize: 2 rootVolumeSizeGB: 25 + + # With spot instances, aws-node-termination-handler may not receive any ASG lifecycle hook events + # and we don't want to wait for the default 30 minutes of heartbeat timeout before instances + # terminate. That would fail the tests. 
This can be fixed once heartbeats are implemented + # (https://github.com/aws/aws-node-termination-handler/issues/493), since we could then reduce + # cluster-aws's defaults to a low value, e.g. `heartbeatTimeout: 5m` and `globalTimeout: 30m`. + awsNodeTerminationHandler: + heartbeatTimeoutSeconds: 100 + spotInstances: enabled: true maxPrice: 0.2960