diff --git a/cmd/katalyst-agent/app/options/dynamic/adminqos/reclaimedresource/reclaimedresource_base.go b/cmd/katalyst-agent/app/options/dynamic/adminqos/reclaimedresource/reclaimedresource_base.go index b28c237e8..40e2e2198 100644 --- a/cmd/katalyst-agent/app/options/dynamic/adminqos/reclaimedresource/reclaimedresource_base.go +++ b/cmd/katalyst-agent/app/options/dynamic/adminqos/reclaimedresource/reclaimedresource_base.go @@ -34,6 +34,7 @@ type ReclaimedResourceOptions struct { MinReclaimedResourceForReport general.ResourceList ReservedResourceForAllocate general.ResourceList ReservedResourceForReclaimedCores general.ResourceList + MaxNodeUtilizationPercent map[string]int64 *cpuheadroom.CPUHeadroomOptions *memoryheadroom.MemoryHeadroomOptions @@ -58,8 +59,9 @@ func NewReclaimedResourceOptions() *ReclaimedResourceOptions { v1.ResourceCPU: resource.MustParse("4"), v1.ResourceMemory: resource.MustParse("0"), }, - CPUHeadroomOptions: cpuheadroom.NewCPUHeadroomOptions(), - MemoryHeadroomOptions: memoryheadroom.NewMemoryHeadroomOptions(), + MaxNodeUtilizationPercent: map[string]int64{}, + CPUHeadroomOptions: cpuheadroom.NewCPUHeadroomOptions(), + MemoryHeadroomOptions: memoryheadroom.NewMemoryHeadroomOptions(), } } @@ -77,6 +79,8 @@ func (o *ReclaimedResourceOptions) AddFlags(fss *cliflag.NamedFlagSets) { "reserved reclaimed resource actually not allocate to reclaimed resource") fs.Var(&o.ReservedResourceForReclaimedCores, "reserved-resource-for-reclaimed-cores", "reserved resources for reclaimed_cores pods") + fs.StringToInt64Var(&o.MaxNodeUtilizationPercent, "max-node-utilization-percent", o.MaxNodeUtilizationPercent, + "node utilization resource limit for reclaimed pool") o.CPUHeadroomOptions.AddFlags(fss) o.MemoryHeadroomOptions.AddFlags(fss) @@ -91,6 +95,12 @@ func (o *ReclaimedResourceOptions) ApplyTo(c *reclaimedresource.ReclaimedResourc c.ReservedResourceForAllocate = v1.ResourceList(o.ReservedResourceForAllocate) c.MinReclaimedResourceForAllocate = v1.ResourceList(o.ReservedResourceForReclaimedCores) + maxNodeUtilizationPercent := make(map[v1.ResourceName]int64) + for resourceName, value := range o.MaxNodeUtilizationPercent { + maxNodeUtilizationPercent[v1.ResourceName(resourceName)] = value + } + c.MaxNodeUtilizationPercent = maxNodeUtilizationPercent + errList = append(errList, o.CPUHeadroomOptions.ApplyTo(c.CPUHeadroomConfiguration)) errList = append(errList, o.MemoryHeadroomOptions.ApplyTo(c.MemoryHeadroomConfiguration)) return errors.NewAggregate(errList) diff --git a/go.mod b/go.mod index 4ddb4c7ac..097126be6 100644 --- a/go.mod +++ b/go.mod @@ -152,6 +152,7 @@ require ( ) replace ( + github.com/kubewharf/katalyst-api => github.com/WangZzzhe/katalyst-api v0.0.0-20240521121144-d6e1535f8562 k8s.io/api => k8s.io/api v0.24.6 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6 k8s.io/apimachinery => k8s.io/apimachinery v0.24.6 diff --git a/go.sum b/go.sum index ea0528f5d..7ffa160c0 100644 --- a/go.sum +++ b/go.sum @@ -82,6 +82,8 @@ github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWX github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI= github.com/StackExchange/wmi v0.0.0-20180116203802-5d049714c4a6/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g= +github.com/WangZzzhe/katalyst-api v0.0.0-20240521121144-d6e1535f8562 h1:tbq/XrcRV1Ud9hteAtn9nO0cQkyVJiLTyd3hhQeseR4= +github.com/WangZzzhe/katalyst-api v0.0.0-20240521121144-d6e1535f8562/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k= github.com/afex/hystrix-go v0.0.0-20180502004556-fa1af6a1f4f5/go.mod h1:SkGFH1ia65gfNATL8TAiHDNxPzPdmEL5uirI2Uyuz6c= github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= @@ -554,8 +556,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubewharf/katalyst-api v0.5.1-0.20240508015926-0f4045cf6899 h1:r3u4VcGjiRGsPnkQRL1IfBRhSX1g1gwvmyCHo9PKUBs= -github.com/kubewharf/katalyst-api v0.5.1-0.20240508015926-0f4045cf6899/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k= github.com/kubewharf/kubelet v1.24.6-kubewharf.8 h1:2e89T/nZTgzaVhyRsZuwEdRk8V8kJXs4PRkgfeG4Ai4= github.com/kubewharf/kubelet v1.24.6-kubewharf.8/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8= diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpueviction/strategy/pressure_load.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpueviction/strategy/pressure_load.go index 13b9b5d80..be0444fbc 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpueviction/strategy/pressure_load.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpueviction/strategy/pressure_load.go @@ -25,6 +25,7 @@ import ( "time" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" @@ -618,5 +619,8 @@ func (p *CPUPressureLoadEviction) checkPressureWithAdvisedThreshold() bool { // for now, we consider ReservedResourceForAllocate as downgrading or manual intervention configuration, // when it's set to a value greater than zero, fall back to static threshold dynamicConfiguration := p.dynamicConf.GetDynamicConfiguration() - return dynamicConfiguration.EnableReclaim && dynamicConfiguration.ReservedResourceForAllocate.Cpu().Value() == 0 + reservedResourceForAllocate := dynamicConfiguration.GetReservedResourceForAllocate(v1.ResourceList{ + v1.ResourceCPU: *resource.NewQuantity(int64(p.metaServer.NumCPUs), resource.DecimalSI), + }) + return dynamicConfiguration.EnableReclaim && reservedResourceForAllocate.Cpu().Value() == 0 } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor_helper.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor_helper.go index f24c03a15..d30e8b886 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor_helper.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor_helper.go @@ -20,6 +20,7 @@ import ( "fmt" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" @@ -167,7 +168,10 @@ func (cra *cpuResourceAdvisor) updateNumasAvailableResource() { } func (cra *cpuResourceAdvisor) getNumasReservedForAllocate(numas machine.CPUSet) float64 { - reserved := cra.conf.GetDynamicConfiguration().ReservedResourceForAllocate[v1.ResourceCPU] + reserved := cra.conf.GetDynamicConfiguration().GetReservedResourceForAllocate(v1.ResourceList{ + v1.ResourceCPU: *resource.NewQuantity(int64(cra.metaServer.NumCPUs), resource.DecimalSI), + })[v1.ResourceCPU] + return float64(reserved.Value()*int64(numas.Size())) / float64(cra.metaServer.NumNUMANodes) } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_dedicated.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_dedicated.go index db9bbd6ec..bf47db642 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_dedicated.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_dedicated.go @@ -64,7 +64,9 @@ func NewHeadroomAssemblerDedicated(conf *config.Configuration, _ interface{}, re func (ha *HeadroomAssemblerDedicated) GetHeadroom() (resource.Quantity, error) { dynamicConfig := ha.conf.GetDynamicConfiguration() - reserved := ha.conf.GetDynamicConfiguration().ReservedResourceForAllocate[v1.ResourceCPU] + reserved := ha.conf.GetDynamicConfiguration().GetReservedResourceForAllocate(v1.ResourceList{ + v1.ResourceCPU: *resource.NewQuantity(int64(ha.metaServer.NumCPUs), resource.DecimalSI), + })[v1.ResourceCPU] // return zero when reclaim is disabled if !dynamicConfig.EnableReclaim { diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go index 1edd8b5db..ea487abae 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go @@ -205,8 +205,9 @@ func (ra *memoryResourceAdvisor) update() error { return fmt.Errorf("meta reader has not synced") } - reservedForAllocate := ra.conf.GetDynamicConfiguration(). - ReservedResourceForAllocate[v1.ResourceMemory] + reservedForAllocate := ra.conf.GetDynamicConfiguration().GetReservedResourceForAllocate(v1.ResourceList{ + v1.ResourceMemory: *resource.NewQuantity(int64(ra.metaServer.MemoryCapacity), resource.BinarySI), + })[v1.ResourceMemory] for _, headroomPolicy := range ra.headroomPolices { // capacity and reserved can both be adjusted dynamically during running process diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/memory_provisioner.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/memory_provisioner.go index 49ae8e89a..f7684f0a3 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/memory_provisioner.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/memory_provisioner.go @@ -23,6 +23,7 @@ import ( "sync" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" @@ -96,8 +97,9 @@ func (m *memoryProvisioner) initializeMemoryProvisioner() error { } func (m *memoryProvisioner) Reconcile(status *types.MemoryPressureStatus) (err error) { - reservedForAllocate := m.conf.GetDynamicConfiguration(). - ReservedResourceForAllocate[v1.ResourceMemory] + reservedForAllocate := m.conf.GetDynamicConfiguration().GetReservedResourceForAllocate(v1.ResourceList{ + v1.ResourceMemory: *resource.NewQuantity(int64(m.metaServer.MemoryCapacity), resource.BinarySI), + })[v1.ResourceMemory] m.policy.SetEssentials( types.ResourceEssentials{ EnableReclaim: m.conf.GetDynamicConfiguration().EnableReclaim, diff --git a/pkg/config/agent/dynamic/adminqos/reclaimedresource/reclaimedresource_base.go b/pkg/config/agent/dynamic/adminqos/reclaimedresource/reclaimedresource_base.go index e9f5f7e01..9b3db3789 100644 --- a/pkg/config/agent/dynamic/adminqos/reclaimedresource/reclaimedresource_base.go +++ b/pkg/config/agent/dynamic/adminqos/reclaimedresource/reclaimedresource_base.go @@ -18,10 +18,12 @@ package reclaimedresource import ( v1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic/adminqos/reclaimedresource/cpuheadroom" "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic/adminqos/reclaimedresource/memoryheadroom" "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic/crd" + "github.com/kubewharf/katalyst-core/pkg/util/native" ) type ReclaimedResourceConfiguration struct { @@ -30,6 +32,7 @@ type ReclaimedResourceConfiguration struct { MinReclaimedResourceForReport v1.ResourceList ReservedResourceForAllocate v1.ResourceList MinReclaimedResourceForAllocate v1.ResourceList + MaxNodeUtilizationPercent map[v1.ResourceName]int64 *cpuheadroom.CPUHeadroomConfiguration *memoryheadroom.MemoryHeadroomConfiguration @@ -72,8 +75,50 @@ func (c *ReclaimedResourceConfiguration) ApplyConfiguration(conf *crd.DynamicCon c.MinReclaimedResourceForAllocate[resourceName] = value } } + + if config.MaxNodeUtilizationPercent != nil { + for resourceName, value := range config.MaxNodeUtilizationPercent { + c.MaxNodeUtilizationPercent[resourceName] = value + } + } } c.CPUHeadroomConfiguration.ApplyConfiguration(conf) c.MemoryHeadroomConfiguration.ApplyConfiguration(conf) } + +func (c *ReclaimedResourceConfiguration) GetReservedResourceForAllocate(nodeResourceList v1.ResourceList) v1.ResourceList { + if len(c.MaxNodeUtilizationPercent) == 0 { + return c.ReservedResourceForAllocate + } + + res := v1.ResourceList{} + for resource, quantity := range c.ReservedResourceForAllocate { + nodeAllocatable, ok := nodeResourceList[resource] + if !ok { + res[resource] = quantity + continue + } + + maxUtil, ok := c.MaxNodeUtilizationPercent[resource] + if !ok { + res[resource] = quantity + continue + } + if maxUtil <= 0 || maxUtil > 100 { + klog.Warningf("unsupported MaxNodeUtilizationPercent, resourceName: %v, value: %v", resource, maxUtil) + res[resource] = quantity + continue + } + + nodeAllocatableCopy := nodeAllocatable.DeepCopy() + nodeAllocatableCopy.Sub(native.MultiplyResourceQuantity(resource, nodeAllocatableCopy, float64(maxUtil)/100.0)) + res[resource] = nodeAllocatableCopy + + klog.V(6).Infof("GetReservedResourceForAllocate resource: %v, nodeResource: %v, "+ + "MaxNodeUtilizationPercent: %v, ReservedResourceForAllocate: %v, res: %v", + resource, nodeAllocatable.String(), maxUtil, quantity.String(), nodeAllocatableCopy.String()) + } + + return res +} diff --git a/pkg/config/agent/dynamic/adminqos/reclaimedresource/reclaimedresource_base_test.go b/pkg/config/agent/dynamic/adminqos/reclaimedresource/reclaimedresource_base_test.go new file mode 100644 index 000000000..a9420ada2 --- /dev/null +++ b/pkg/config/agent/dynamic/adminqos/reclaimedresource/reclaimedresource_base_test.go @@ -0,0 +1,145 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package reclaimedresource + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" +) + +func TestGetReservedResourceForAllocate(t *testing.T) { + t.Parallel() + + for _, tc := range []struct { + name string + config *ReclaimedResourceConfiguration + nodeResourceList v1.ResourceList + expectRes v1.ResourceList + }{ + { + name: "MaxNodeUtilizationPercent not set", + config: &ReclaimedResourceConfiguration{ + ReservedResourceForAllocate: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + nodeResourceList: map[v1.ResourceName]resource.Quantity{}, + expectRes: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + { + name: "MaxNodeUtilizationPercent set, only get cpu", + config: &ReclaimedResourceConfiguration{ + ReservedResourceForAllocate: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + MaxNodeUtilizationPercent: map[v1.ResourceName]int64{ + v1.ResourceCPU: 60, + v1.ResourceMemory: 80, + }, + }, + nodeResourceList: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + }, + expectRes: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("12800m"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + { + name: "MaxNodeUtilizationPercent set, only get memory", + config: &ReclaimedResourceConfiguration{ + ReservedResourceForAllocate: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + MaxNodeUtilizationPercent: map[v1.ResourceName]int64{ + v1.ResourceCPU: 60, + v1.ResourceMemory: 80, + }, + }, + nodeResourceList: map[v1.ResourceName]resource.Quantity{ + v1.ResourceMemory: resource.MustParse("100Gi"), + }, + expectRes: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("20Gi"), + }, + }, + { + name: "MaxNodeUtilizationPercent only cpu set", + config: &ReclaimedResourceConfiguration{ + ReservedResourceForAllocate: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + MaxNodeUtilizationPercent: map[v1.ResourceName]int64{ + v1.ResourceMemory: 80, + }, + }, + nodeResourceList: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + }, + expectRes: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + { + name: "MaxNodeUtilizationPercent value not supported", + config: &ReclaimedResourceConfiguration{ + ReservedResourceForAllocate: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + MaxNodeUtilizationPercent: map[v1.ResourceName]int64{ + v1.ResourceCPU: 120, + v1.ResourceMemory: 0, + }, + }, + nodeResourceList: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourceMemory: resource.MustParse("100Gi"), + }, + expectRes: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + } { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + res := tc.config.GetReservedResourceForAllocate(tc.nodeResourceList) + assert.Equal(t, len(tc.expectRes), len(res)) + for resource, quantity := range res { + expectQuan, ok := tc.expectRes[resource] + assert.True(t, ok) + + assert.True(t, quantity.Equal(expectQuan)) + } + }) + } +}