Skip to content

Commit

Permalink
fix(qrm): add EnableReclaimNUMABinding config to support make all rec…
Browse files Browse the repository at this point in the history
…laim pods to non-RNB by default
  • Loading branch information
luomingmeng committed Nov 25, 2024
1 parent 5c1dceb commit 0d1210f
Show file tree
Hide file tree
Showing 8 changed files with 25 additions and 9 deletions.
4 changes: 4 additions & 0 deletions cmd/katalyst-agent/app/options/qrm/qrm_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ type GenericQRMPluginOptions struct {
UseKubeletReservedConfig bool
PodAnnotationKeptKeys []string
PodLabelKeptKeys []string
EnableReclaimNUMABinding bool
}

func NewGenericQRMPluginOptions() *GenericQRMPluginOptions {
Expand Down Expand Up @@ -57,6 +58,8 @@ func (o *GenericQRMPluginOptions) AddFlags(fss *cliflag.NamedFlagSets) {
o.PodAnnotationKeptKeys, "pod annotation keys will be kept in qrm state")
fs.StringSliceVar(&o.PodLabelKeptKeys, "pod-label-kept-keys",
o.PodLabelKeptKeys, "pod label keys will be kept in qrm state")
fs.BoolVar(&o.EnableReclaimNUMABinding, "enable-reclaim-numa-binding",
o.EnableReclaimNUMABinding, "if set true, reclaim pod will be allocated on a specific NUMA node best-effort, otherwise, reclaim pod will be allocated on multi NUMA nodes")
}

func (o *GenericQRMPluginOptions) ApplyTo(conf *qrmconfig.GenericQRMPluginConfiguration) error {
Expand All @@ -67,6 +70,7 @@ func (o *GenericQRMPluginOptions) ApplyTo(conf *qrmconfig.GenericQRMPluginConfig
conf.UseKubeletReservedConfig = o.UseKubeletReservedConfig
conf.PodAnnotationKeptKeys = append(conf.PodAnnotationKeptKeys, o.PodAnnotationKeptKeys...)
conf.PodLabelKeptKeys = append(conf.PodLabelKeptKeys, o.PodLabelKeptKeys...)
conf.EnableReclaimNUMABinding = o.EnableReclaimNUMABinding
return nil
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ type DynamicPolicy struct {
cpuAdvisorSocketAbsPath string
cpuPluginSocketAbsPath string
extraStateFileAbsPath string
enableReclaimNUMABinding bool
enableCPUIdle bool
enableSyncingCPUIdle bool
reclaimRelativeRootCgroupPath string
Expand Down Expand Up @@ -200,6 +201,7 @@ func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration
dynamicConfig: conf.DynamicAgentConfiguration,
cpuAdvisorSocketAbsPath: conf.CPUAdvisorSocketAbsPath,
cpuPluginSocketAbsPath: conf.CPUPluginSocketAbsPath,
enableReclaimNUMABinding: conf.EnableReclaimNUMABinding,
enableCPUAdvisor: conf.CPUQRMPluginConfig.EnableCPUAdvisor,
cpuNUMAHintPreferPolicy: conf.CPUQRMPluginConfig.CPUNUMAHintPreferPolicy,
cpuNUMAHintPreferLowThreshold: conf.CPUQRMPluginConfig.CPUNUMAHintPreferLowThreshold,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ func (p *DynamicPolicy) reclaimedCoresHintHandler(ctx context.Context,
return nil, fmt.Errorf("not support inplace update resize for reclaimed cores")
}

if qosutil.AnnotationsIndicateNUMABinding(req.Annotations) {
if qosutil.AnnotationsIndicateNUMABinding(req.Annotations) &&
p.enableReclaimNUMABinding {
return p.reclaimedCoresWithNUMABindingHintHandler(ctx, req)
}

Expand Down
1 change: 1 addition & 0 deletions pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ func getTestDynamicPolicyWithoutInitialization(topology *machine.CPUTopology, st
advisorValidator: validator.NewCPUAdvisorValidator(stateImpl, machineInfo),
reservedReclaimedCPUsSize: general.Max(reservedReclaimedCPUsSize, topology.NumNUMANodes),
reservedCPUs: reservedCPUs,
enableReclaimNUMABinding: true,
emitter: metrics.DummyMetrics{},
podDebugAnnoKeys: []string{podDebugAnnoKey},
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ type DynamicPolicy struct {
enableEvictingLogCache bool
logCacheEvictionManager logcache.Manager

enableReclaimNUMABinding bool
enableNonBindingShareCoresMemoryResourceCheck bool

enableNUMAAllocationReactor bool
Expand Down Expand Up @@ -234,6 +235,7 @@ func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration
enableOOMPriority: conf.EnableOOMPriority,
oomPriorityMapPinnedPath: conf.OOMPriorityPinnedMapAbsPath,
enableEvictingLogCache: conf.EnableEvictingLogCache,
enableReclaimNUMABinding: conf.EnableReclaimNUMABinding,
enableNonBindingShareCoresMemoryResourceCheck: conf.EnableNonBindingShareCoresMemoryResourceCheck,
enableNUMAAllocationReactor: conf.EnableNUMAAllocationReactor,
numaBindResultResourceAllocationAnnotationKey: conf.NUMABindResultResourceAllocationAnnotationKey,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ func (p *DynamicPolicy) reclaimedCoresHintHandler(ctx context.Context,
return nil, fmt.Errorf("not support inplace update resize for reclaimed cores")
}

if qosutil.AnnotationsIndicateNUMABinding(req.Annotations) {
if qosutil.AnnotationsIndicateNUMABinding(req.Annotations) &&
p.enableReclaimNUMABinding {
return p.reclaimedCoresWithNUMABindingHintHandler(ctx, req)
}

Expand Down
15 changes: 8 additions & 7 deletions pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,14 @@ func getTestDynamicPolicyWithInitialization(topology *machine.CPUTopology, machi
}

policyImplement := &DynamicPolicy{
topology: topology,
qosConfig: qosConfig,
state: stateImpl,
emitter: metrics.DummyMetrics{},
migratingMemory: make(map[string]map[string]bool),
stopCh: make(chan struct{}),
podDebugAnnoKeys: []string{podDebugAnnoKey},
topology: topology,
qosConfig: qosConfig,
state: stateImpl,
emitter: metrics.DummyMetrics{},
migratingMemory: make(map[string]map[string]bool),
stopCh: make(chan struct{}),
podDebugAnnoKeys: []string{podDebugAnnoKey},
enableReclaimNUMABinding: true,
enableNonBindingShareCoresMemoryResourceCheck: true,
numaBindResultResourceAllocationAnnotationKey: coreconsts.QRMResourceAnnotationKeyNUMABindResult,
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/config/agent/qrm/qrm_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ type GenericQRMPluginConfiguration struct {
PodAnnotationKeptKeys []string
// PodLabelKeptKeys indicates pod label keys will be kept in qrm state
PodLabelKeptKeys []string
// EnableReclaimNUMABinding indicates whether NUMA binding is enabled for reclaim pods.
// If set to true, a reclaim pod is allocated on a specific NUMA node on a best-effort
// basis; otherwise, a reclaim pod is allocated across multiple NUMA nodes.
EnableReclaimNUMABinding bool
}

type QRMPluginsConfiguration struct {
Expand Down

0 comments on commit 0d1210f

Please sign in to comment.