Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-1.27] Add flag to enable fast delete of failed VMSS #7407

Open
wants to merge 1 commit into
base: cluster-autoscaler-release-1.27
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ type Config struct {

UseFederatedWorkloadIdentityExtension bool `json:"useFederatedWorkloadIdentityExtension" yaml:"useFederatedWorkloadIdentityExtension"`
AADFederatedTokenFile string `json:"aadFederatedTokenFile" yaml:"aadFederatedTokenFile"`

// EnableFastDeleteOnFailedProvisioning defines whether to delete the experimental faster VMSS instance deletion on failed provisioning
EnableFastDeleteOnFailedProvisioning bool `json:"enableFastDeleteOnFailedProvisioning,omitempty" yaml:"enableFastDeleteOnFailedProvisioning,omitempty"`
}

// These are only here for backward compabitility. Their equivalent exists in providerazure.Config with a different name.
Expand Down Expand Up @@ -328,6 +331,12 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {

cfg.DeploymentParameters = parameters
}
if enableFastDeleteOnFailedProvisioning := os.Getenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING"); enableFastDeleteOnFailedProvisioning != "" {
cfg.EnableFastDeleteOnFailedProvisioning, err = strconv.ParseBool(enableFastDeleteOnFailedProvisioning)
if err != nil {
return nil, fmt.Errorf("failed to parse AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING: %q, %v", enableFastDeleteOnFailedProvisioning, err)
}
}
providerazureconfig.InitializeCloudProviderRateLimitConfig(&cfg.CloudProviderRateLimitConfig)

if err := cfg.validate(); err != nil {
Expand Down
10 changes: 7 additions & 3 deletions cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ const validAzureCfg = `{
"routeRateLimit": {
"cloudProviderRateLimit": true,
"cloudProviderRateLimitQPS": 3
}
},
"enableFastDeleteOnFailedProvisioning": true
}`

const validAzureCfgLegacy = `{
Expand Down Expand Up @@ -268,8 +269,9 @@ func TestCreateAzureManagerValidConfig(t *testing.T) {
},
},
},
VmssVmsCacheJitter: 120,
MaxDeploymentsCount: 8,
VmssVmsCacheJitter: 120,
MaxDeploymentsCount: 8,
EnableFastDeleteOnFailedProvisioning: true,
}

assert.NoError(t, err)
Expand Down Expand Up @@ -665,6 +667,7 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) {
t.Setenv("CLUSTER_NAME", "mycluster")
t.Setenv("ARM_CLUSTER_RESOURCE_GROUP", "myrg")
t.Setenv("ARM_BASE_URL_FOR_AP_CLIENT", "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local")
t.Setenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING", "true")

t.Run("environment variables correctly set", func(t *testing.T) {
manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient)
Expand Down Expand Up @@ -906,6 +909,7 @@ func TestCreateAzureManagerWithEnvOverridingConfig(t *testing.T) {
t.Setenv("CLUSTER_NAME", "mycluster")
t.Setenv("ARM_CLUSTER_RESOURCE_GROUP", "myrg")
t.Setenv("ARM_BASE_URL_FOR_AP_CLIENT", "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local")
t.Setenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING", "true")

t.Run("environment variables correctly set", func(t *testing.T) {
manager, err := createAzureManagerInternal(strings.NewReader(validAzureCfgForStandardVMType), cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient)
Expand Down
35 changes: 29 additions & 6 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ type ScaleSet struct {

// uses Azure Dedicated Host
dedicatedHost bool

enableFastDeleteOnFailedProvisioning bool
}

// NewScaleSet creates a new NewScaleSet.
Expand Down Expand Up @@ -128,6 +130,8 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, d
klog.V(2).Infof("enableDetailedCSEMessage: %t", scaleSet.enableDetailedCSEMessage)
}

scaleSet.enableFastDeleteOnFailedProvisioning = az.config.EnableFastDeleteOnFailedProvisioning

return scaleSet, nil
}

Expand Down Expand Up @@ -686,7 +690,7 @@ func (scaleSet *ScaleSet) buildScaleSetCacheForFlex() error {
return rerr.Error()
}

scaleSet.instanceCache = buildInstanceCacheForFlex(vms)
scaleSet.instanceCache = buildInstanceCacheForFlex(vms, scaleSet.enableFastDeleteOnFailedProvisioning)
scaleSet.lastInstanceRefresh = lastRefresh

return nil
Expand Down Expand Up @@ -744,21 +748,21 @@ func (scaleSet *ScaleSet) buildScaleSetCacheForUniform() error {
// Note that the GetScaleSetVms() results is not used directly because for the List endpoint,
// their resource ID format is not consistent with Get endpoint
// buildInstanceCacheForFlex used by orchestrationMode == compute.Flexible
func buildInstanceCacheForFlex(vms []compute.VirtualMachine) []cloudprovider.Instance {
func buildInstanceCacheForFlex(vms []compute.VirtualMachine, enableFastDeleteOnFailedProvisioning bool) []cloudprovider.Instance {
var instances []cloudprovider.Instance
for _, vm := range vms {
powerState := vmPowerStateRunning
if vm.InstanceView != nil && vm.InstanceView.Statuses != nil {
powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses)
}
addVMToCache(&instances, vm.ID, vm.ProvisioningState, powerState)
addVMToCache(&instances, vm.ID, vm.ProvisioningState, powerState, enableFastDeleteOnFailedProvisioning)
}

return instances
}

// addVMToCache used by orchestrationMode == compute.Flexible
func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *string, powerState string) {
func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) {
// The resource ID is empty string, which indicates the instance may be in deleting state.
if len(*id) == 0 {
return
Expand All @@ -773,13 +777,13 @@ func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *st

*instances = append(*instances, cloudprovider.Instance{
Id: azurePrefix + resourceID,
Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState),
Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState, enableFastDeleteOnFailedProvisioning),
})
}

// instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state to cloudprovider.InstanceStatus
// instanceStatusFromProvisioningStateAndPowerState used by orchestrationMode == compute.Flexible
func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string) *cloudprovider.InstanceStatus {
func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) *cloudprovider.InstanceStatus {
if provisioningState == nil {
return nil
}
Expand All @@ -792,6 +796,25 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisi
status.State = cloudprovider.InstanceDeleting
case string(compute.ProvisioningStateCreating):
status.State = cloudprovider.InstanceCreating
case string(compute.ProvisioningStateFailed):
if enableFastDeleteOnFailedProvisioning {
// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
// ProvisioningState represents the most recent provisioning state, therefore only report
// InstanceCreating errors when the power state indicates the instance has not yet started running
if !isRunningVmPowerState(powerState) {
klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", resourceID, powerState)
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
}
} else {
klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", resourceID, powerState)
status.State = cloudprovider.InstanceRunning
}
}
default:
status.State = cloudprovider.InstanceRunning
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package azure

import (
"fmt"
"strconv"
"sync"
"time"

Expand Down Expand Up @@ -211,20 +212,45 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe
}
return nil
}
powerState := vmPowerStateRunning
if vm.InstanceView != nil && vm.InstanceView.Statuses != nil {
powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses)
}

status := &cloudprovider.InstanceStatus{}
switch *vm.ProvisioningState {
case string(compute.ProvisioningStateDeleting):
status.State = cloudprovider.InstanceDeleting
case string(compute.ProvisioningStateCreating):
status.State = cloudprovider.InstanceCreating
case string(compute.ProvisioningStateFailed):
klog.V(3).Infof("VM %s reports failed provisioning state with power state: %s, eligible for fast delete: %s", to.String(vm.ID), powerState, strconv.FormatBool(scaleSet.enableFastDeleteOnFailedProvisioning))
if scaleSet.enableFastDeleteOnFailedProvisioning {
// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
// ProvisioningState represents the most recent provisioning state, therefore only report
// InstanceCreating errors when the power state indicates the instance has not yet started running
if !isRunningVmPowerState(powerState) {
klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", *vm.ID, powerState)
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
}
} else {
klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", *vm.ID, powerState)
status.State = cloudprovider.InstanceRunning
}
}
default:
status.State = cloudprovider.InstanceRunning
}

// Add vmssCSE Provisioning Failed Message in error info body for vmssCSE Extensions if enableDetailedCSEMessage is true
if scaleSet.enableDetailedCSEMessage && vm.InstanceView != nil {
if err, failed := scaleSet.cseErrors(vm.InstanceView.Extensions); failed {
klog.V(3).Infof("VM %s reports CSE failure: %v, with provisioning state %s, power state %s", to.String(vm.ID), err, to.String(vm.ProvisioningState), powerState)
errorInfo := &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: vmssExtensionProvisioningFailed,
Expand Down
18 changes: 10 additions & 8 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@ func newTestScaleSet(manager *AzureManager, name string) *ScaleSet {
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 1,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
manager: manager,
minSize: 1,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
enableFastDeleteOnFailedProvisioning: true,
}
}

Expand All @@ -55,10 +56,11 @@ func newTestScaleSetMinSizeZero(manager *AzureManager, name string) *ScaleSet {
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 0,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
manager: manager,
minSize: 0,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
enableFastDeleteOnFailedProvisioning: true,
}
}

Expand Down
Loading