diff --git a/api/numaresourcesoperator/v1/helper/nodegroup/nodegroup.go b/api/numaresourcesoperator/v1/helper/nodegroup/nodegroup.go index 125afbbf2..029de20ef 100644 --- a/api/numaresourcesoperator/v1/helper/nodegroup/nodegroup.go +++ b/api/numaresourcesoperator/v1/helper/nodegroup/nodegroup.go @@ -59,31 +59,42 @@ func (ttr Tree) Clone() Tree { // One of the key design assumptions in NROP is the 1:1 mapping between NodeGroups and MCPs. // This historical accident should be fixed in future versions. func FindTrees(mcps *mcov1.MachineConfigPoolList, nodeGroups []nropv1.NodeGroup) ([]Tree, error) { + // Node groups are validated by the controller before reaching this phase, so all node groups are guaranteed to be valid at this point. + // A valid node group sets either PoolName or MachineConfigPoolSelector, never both. Reaching this code also means the operator is deployed on OpenShift, hence MCPs are being processed. var result []Tree for idx := range nodeGroups { nodeGroup := &nodeGroups[idx] // shortcut + treeMCPs := []*mcov1.MachineConfigPool{} - if nodeGroup.MachineConfigPoolSelector == nil { - continue - } - selector, err := metav1.LabelSelectorAsSelector(nodeGroup.MachineConfigPoolSelector) - if err != nil { - klog.Errorf("bad node group machine config pool selector %q", nodeGroup.MachineConfigPoolSelector.String()) - continue + if nodeGroup.PoolName != nil { + for i := range mcps.Items { + mcp := &mcps.Items[i] + if mcp.Name == *nodeGroup.PoolName { + treeMCPs = append(treeMCPs, mcp) + // MCO ensures there are no mcp name duplications + break + } + } } - treeMCPs := []*mcov1.MachineConfigPool{} - for i := range mcps.Items { - mcp := &mcps.Items[i] // shortcut - mcpLabels := labels.Set(mcp.Labels) - if !selector.Matches(mcpLabels) { + if nodeGroup.MachineConfigPoolSelector != nil { + selector, err := metav1.LabelSelectorAsSelector(nodeGroup.MachineConfigPoolSelector) + if err != nil { + klog.Errorf("bad node group machine config pool selector %q", nodeGroup.MachineConfigPoolSelector.String()) continue } - treeMCPs = append(treeMCPs, mcp) - } + for i := range mcps.Items { + mcp := &mcps.Items[i] // shortcut + mcpLabels := labels.Set(mcp.Labels) + if !selector.Matches(mcpLabels) { + continue + } + treeMCPs = append(treeMCPs, mcp) + } + } if len(treeMCPs) == 0 { - return nil, fmt.Errorf("failed to find MachineConfigPool for the node group with the selector %q", nodeGroup.MachineConfigPoolSelector.String()) + return nil, fmt.Errorf("failed to find MachineConfigPool for the node group %+v", nodeGroup) } result = append(result, Tree{ diff --git a/api/numaresourcesoperator/v1/helper/nodegroup/nodegroup_test.go b/api/numaresourcesoperator/v1/helper/nodegroup/nodegroup_test.go index d43d3eb5a..1812839d9 100644 --- a/api/numaresourcesoperator/v1/helper/nodegroup/nodegroup_test.go +++ b/api/numaresourcesoperator/v1/helper/nodegroup/nodegroup_test.go @@ -23,6 +23,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" mcov1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" @@ -185,6 +186,47 @@ func TestFindTrees(t *testing.T) { }, }, }, + { + name: "node group with PoolName and MachineConfigPoolSelector in another node group", + mcps: &mcpList, + ngs: []nropv1.NodeGroup{ + { + PoolName: &mcpList.Items[0].Name, + }, + { + MachineConfigPoolSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "mcp-label-3": "test3", + }, + }, + }, + }, + expected: []Tree{ + { + MachineConfigPools: 
[]*mcov1.MachineConfigPool{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mcp1", + }, + }, + }, + }, + { + MachineConfigPools: []*mcov1.MachineConfigPool{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mcp3", + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mcp5", + }, + }, + }, + }, + }, + }, } for _, tt := range testCases { t.Run(tt.name, func(t *testing.T) { @@ -203,9 +245,10 @@ func TestFindTrees(t *testing.T) { if err != nil { t.Errorf("unexpected error checking backward compat: %v", err) } - compatNames := mcpNamesFromList(gotMcps) - if !reflect.DeepEqual(gotNames, compatNames) { - t.Errorf("Trees mismatch (non backward compatible): got=%v compat=%v", gotNames, compatNames) + compatibleNames := mcpNamesFromList(gotMcps) + gotSet := sets.New[string](gotNames...) + if !gotSet.HasAll(compatibleNames...) { + t.Errorf("Trees mismatch (non backward compatible): got=%v compat=%v", gotNames, compatibleNames) } }) } diff --git a/api/numaresourcesoperator/v1/numaresourcesoperator_types.go b/api/numaresourcesoperator/v1/numaresourcesoperator_types.go index 01c70ff9e..1f26bab5a 100644 --- a/api/numaresourcesoperator/v1/numaresourcesoperator_types.go +++ b/api/numaresourcesoperator/v1/numaresourcesoperator_types.go @@ -112,7 +112,7 @@ type NodeGroupConfig struct { } // NodeGroup defines group of nodes that will run resource topology exporter daemon set -// You can choose the group of node by MachineConfigPoolSelector or by NodeSelector +// You can choose the group of nodes by MachineConfigPoolSelector or by PoolName type NodeGroup struct { // MachineConfigPoolSelector defines label selector for the machine config pool // +optional @@ -120,6 +120,29 @@ type NodeGroup struct { // Config defines the RTE behavior for this NodeGroup // +optional Config *NodeGroupConfig `json:"config,omitempty"` + // PoolName defines the pool name to which the nodes belong that the config of this node group will be applied to + // +optional + PoolName *string `json:"poolName,omitempty"` +} + +// NodeGroupStatus reports the status of a NodeGroup once it matches an actual set of nodes and is correctly processed +// by the system. In other words, it is not possible to have a NodeGroupStatus which does not represent a valid NodeGroup +// which in turn unambiguously references a set of nodes in the cluster. +// Hence, if a NodeGroupStatus is published, its `PoolName` must be present, because it refers back to a NodeGroup whose +// config was correctly processed in the Spec. And its DaemonSet will be nonempty, because it correctly matches a set +// of nodes in the cluster. The Config is always represented on a best-effort basis, possibly reflecting the system defaults. +// If the system cannot process a NodeGroup correctly from the Spec, it will report a Degraded state in the top-level +// condition, and will provide details in the status conditions. 
+type NodeGroupStatus struct { + // DaemonSet of the configured RTEs, for this node group + //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="RTE DaemonSets" + DaemonSet NamespacedName `json:"daemonsets"` + // NodeGroupConfig represents the latest available configuration applied to this NodeGroup + //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="Optional configuration enforced on this NodeGroup" + Config NodeGroupConfig `json:"config"` + // PoolName represents the pool name to which the nodes belong that the config of this node group is applied to + //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="Pool name of nodes in this node group" + PoolName string `json:"selector"` } // NUMAResourcesOperatorStatus defines the observed state of NUMAResourcesOperator @@ -130,6 +153,10 @@ type NUMAResourcesOperatorStatus struct { // MachineConfigPools resolved from configured node groups //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="RTE MCPs from node groups" MachineConfigPools []MachineConfigPool `json:"machineconfigpools,omitempty"` + // NodeGroups report the observed status of the configured NodeGroups, matching by their name + // +optional + //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="Node groups observed status" + NodeGroups []NodeGroupStatus `json:"nodeGroups,omitempty"` // Conditions show the current state of the NUMAResourcesOperator Operator //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="Condition reported" Conditions []metav1.Condition `json:"conditions,omitempty"` diff --git a/api/numaresourcesoperator/v1/zz_generated.deepcopy.go b/api/numaresourcesoperator/v1/zz_generated.deepcopy.go index 0e404ffc7..6e2b74042 100644 --- a/api/numaresourcesoperator/v1/zz_generated.deepcopy.go +++ b/api/numaresourcesoperator/v1/zz_generated.deepcopy.go @@ -156,6 +156,13 @@ func (in *NUMAResourcesOperatorStatus) DeepCopyInto(out *NUMAResourcesOperatorSt (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.NodeGroups != nil { + in, out := &in.NodeGroups, &out.NodeGroups + *out = make([]NodeGroupStatus, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) @@ -345,6 +352,11 @@ func (in *NodeGroup) DeepCopyInto(out *NodeGroup) { *out = new(NodeGroupConfig) (*in).DeepCopyInto(*out) } + if in.PoolName != nil { + in, out := &in.PoolName, &out.PoolName + *out = new(string) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeGroup. @@ -399,6 +411,23 @@ func (in *NodeGroupConfig) DeepCopy() *NodeGroupConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeGroupStatus) DeepCopyInto(out *NodeGroupStatus) { + *out = *in + out.DaemonSet = in.DaemonSet + in.Config.DeepCopyInto(&out.Config) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeGroupStatus. +func (in *NodeGroupStatus) DeepCopy() *NodeGroupStatus { + if in == nil { + return nil + } + out := new(NodeGroupStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *ResourceSpecParams) DeepCopyInto(out *ResourceSpecParams) { *out = *in diff --git a/bundle/manifests/nodetopology.openshift.io_numaresourcesoperators.yaml b/bundle/manifests/nodetopology.openshift.io_numaresourcesoperators.yaml index 1d5c14f01..6ca90fb1e 100644 --- a/bundle/manifests/nodetopology.openshift.io_numaresourcesoperators.yaml +++ b/bundle/manifests/nodetopology.openshift.io_numaresourcesoperators.yaml @@ -62,7 +62,7 @@ spec: items: description: |- NodeGroup defines group of nodes that will run resource topology exporter daemon set - You can choose the group of node by MachineConfigPoolSelector or by NodeSelector + You can choose the group of node by MachineConfigPoolSelector or by PoolName properties: config: description: Config defines the RTE behavior for this NodeGroup @@ -182,6 +182,11 @@ spec: type: object type: object x-kubernetes-map-type: atomic + poolName: + description: PoolName defines the pool name to which the nodes + belong that the config of this node group will be applied + to + type: string type: object type: array podExcludes: @@ -401,6 +406,114 @@ spec: - name type: object type: array + nodeGroups: + description: NodeGroups report the observed status of the configured + NodeGroups, matching by their name + items: + description: |- + NodeGroupStatus reports the status of a NodeGroup once matches an actual set of nodes and it is correctly processed + by the system. In other words, is not possible to have a NodeGroupStatus which does not represent a valid NodeGroup + which in turn correctly references unambiguously a set of nodes in the cluster. + Hence, if a NodeGroupStatus is published, its `Name` must be present, because it refers back to a NodeGroup whose + config was correctly processed in the Spec. And its DaemonSet will be nonempty, because matches correctly a set + of nodes in the cluster. The Config is best-effort always represented, possibly reflecting the system defaults. + If the system cannot process a NodeGroup correctly from the Spec, it will report Degraded state in the top-level + condition, and will provide details using the aforementioned conditions. + properties: + config: + description: NodeGroupConfig represents the latest available + configuration applied to this NodeGroup + properties: + infoRefreshMode: + description: InfoRefreshMode sets the mechanism which will + be used to refresh the topology info. + enum: + - Periodic + - Events + - PeriodicAndEvents + type: string + infoRefreshPause: + description: InfoRefreshPause defines if updates to NRTs + are paused for the machines belonging to this group + enum: + - Disabled + - Enabled + type: string + infoRefreshPeriod: + description: InfoRefreshPeriod sets the topology info refresh + period. Use explicit 0 to disable. + type: string + podsFingerprinting: + description: PodsFingerprinting defines if pod fingerprint + should be reported for the machines belonging to this + group + enum: + - Disabled + - Enabled + - EnabledExclusiveResources + type: string + tolerations: + description: |- + Tolerations overrides tolerations to be set into RTE daemonsets for this NodeGroup. If not empty, the tolerations will be the one set here. + Leave empty to make the system use the default tolerations. + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. 
+ When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + type: object + daemonsets: + description: DaemonSet of the configured RTEs, for this node + group + properties: + name: + type: string + namespace: + type: string + type: object + selector: + description: PoolName represents the pool name to which the + nodes belong that the config of this node group is be applied + to + type: string + required: + - config + - daemonsets + - selector + type: object + type: array relatedObjects: description: RelatedObjects list of objects of interest for this operator items: diff --git a/bundle/manifests/numaresources-operator.clusterserviceversion.yaml b/bundle/manifests/numaresources-operator.clusterserviceversion.yaml index 428bdea68..028b4fb0a 100644 --- a/bundle/manifests/numaresources-operator.clusterserviceversion.yaml +++ b/bundle/manifests/numaresources-operator.clusterserviceversion.yaml @@ -62,7 +62,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2024-10-14T13:06:40Z" + createdAt: "2024-10-16T11:10:10Z" olm.skipRange: '>=4.17.0 <4.18.0' operators.operatorframework.io/builder: operator-sdk-v1.36.1 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 @@ -149,6 +149,21 @@ spec: applied to this MachineConfigPool displayName: Optional configuration enforced on this NodeGroup path: machineconfigpools[0].config + - description: NodeGroups report the observed status of the configured NodeGroups, + matching by their name + displayName: Node groups observed status + path: nodeGroups + - description: NodeGroupConfig represents the latest available configuration + applied to this NodeGroup + displayName: Optional configuration enforced on this NodeGroup + path: nodeGroups[0].config + - description: DaemonSet of the configured RTEs, for this node group + displayName: RTE DaemonSets + path: nodeGroups[0].daemonsets + - description: PoolName represents the pool name to which the nodes belong that + the config of this node group is be applied to + displayName: Pool name of nodes in this node group + path: nodeGroups[0].selector - description: RelatedObjects list of objects of interest for this operator displayName: Related Objects path: relatedObjects diff --git a/config/crd/bases/nodetopology.openshift.io_numaresourcesoperators.yaml b/config/crd/bases/nodetopology.openshift.io_numaresourcesoperators.yaml index 9960e356c..fa308d5b8 100644 --- 
a/config/crd/bases/nodetopology.openshift.io_numaresourcesoperators.yaml +++ b/config/crd/bases/nodetopology.openshift.io_numaresourcesoperators.yaml @@ -62,7 +62,7 @@ spec: items: description: |- NodeGroup defines group of nodes that will run resource topology exporter daemon set - You can choose the group of node by MachineConfigPoolSelector or by NodeSelector + You can choose the group of node by MachineConfigPoolSelector or by PoolName properties: config: description: Config defines the RTE behavior for this NodeGroup @@ -182,6 +182,11 @@ spec: type: object type: object x-kubernetes-map-type: atomic + poolName: + description: PoolName defines the pool name to which the nodes + belong that the config of this node group will be applied + to + type: string type: object type: array podExcludes: @@ -401,6 +406,114 @@ spec: - name type: object type: array + nodeGroups: + description: NodeGroups report the observed status of the configured + NodeGroups, matching by their name + items: + description: |- + NodeGroupStatus reports the status of a NodeGroup once matches an actual set of nodes and it is correctly processed + by the system. In other words, is not possible to have a NodeGroupStatus which does not represent a valid NodeGroup + which in turn correctly references unambiguously a set of nodes in the cluster. + Hence, if a NodeGroupStatus is published, its `Name` must be present, because it refers back to a NodeGroup whose + config was correctly processed in the Spec. And its DaemonSet will be nonempty, because matches correctly a set + of nodes in the cluster. The Config is best-effort always represented, possibly reflecting the system defaults. + If the system cannot process a NodeGroup correctly from the Spec, it will report Degraded state in the top-level + condition, and will provide details using the aforementioned conditions. + properties: + config: + description: NodeGroupConfig represents the latest available + configuration applied to this NodeGroup + properties: + infoRefreshMode: + description: InfoRefreshMode sets the mechanism which will + be used to refresh the topology info. + enum: + - Periodic + - Events + - PeriodicAndEvents + type: string + infoRefreshPause: + description: InfoRefreshPause defines if updates to NRTs + are paused for the machines belonging to this group + enum: + - Disabled + - Enabled + type: string + infoRefreshPeriod: + description: InfoRefreshPeriod sets the topology info refresh + period. Use explicit 0 to disable. + type: string + podsFingerprinting: + description: PodsFingerprinting defines if pod fingerprint + should be reported for the machines belonging to this + group + enum: + - Disabled + - Enabled + - EnabledExclusiveResources + type: string + tolerations: + description: |- + Tolerations overrides tolerations to be set into RTE daemonsets for this NodeGroup. If not empty, the tolerations will be the one set here. + Leave empty to make the system use the default tolerations. + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. 
+ If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + type: object + daemonsets: + description: DaemonSet of the configured RTEs, for this node + group + properties: + name: + type: string + namespace: + type: string + type: object + selector: + description: PoolName represents the pool name to which the + nodes belong that the config of this node group is be applied + to + type: string + required: + - config + - daemonsets + - selector + type: object + type: array relatedObjects: description: RelatedObjects list of objects of interest for this operator items: diff --git a/config/manifests/bases/numaresources-operator.clusterserviceversion.yaml b/config/manifests/bases/numaresources-operator.clusterserviceversion.yaml index 409120760..d66b6c55a 100644 --- a/config/manifests/bases/numaresources-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/numaresources-operator.clusterserviceversion.yaml @@ -88,6 +88,21 @@ spec: applied to this MachineConfigPool displayName: Optional configuration enforced on this NodeGroup path: machineconfigpools[0].config + - description: NodeGroups report the observed status of the configured NodeGroups, + matching by their name + displayName: Node groups observed status + path: nodeGroups + - description: NodeGroupConfig represents the latest available configuration + applied to this NodeGroup + displayName: Optional configuration enforced on this NodeGroup + path: nodeGroups[0].config + - description: DaemonSet of the configured RTEs, for this node group + displayName: RTE DaemonSets + path: nodeGroups[0].daemonsets + - description: PoolName represents the pool name to which the nodes belong that + the config of this node group is be applied to + displayName: Pool name of nodes in this node group + path: nodeGroups[0].selector - description: RelatedObjects list of objects of interest for this operator displayName: Related Objects path: relatedObjects diff --git a/controllers/kubeletconfig_controller_test.go b/controllers/kubeletconfig_controller_test.go index 019842a5b..7c6f07c76 100644 --- a/controllers/kubeletconfig_controller_test.go +++ b/controllers/kubeletconfig_controller_test.go @@ -65,9 +65,12 @@ var _ = Describe("Test KubeletConfig Reconcile", func() { "test1": "test1", } mcp1 = testobjs.NewMachineConfigPool("test1", label1, &metav1.LabelSelector{MatchLabels: label1}, &metav1.LabelSelector{MatchLabels: label1}) - nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, []*metav1.LabelSelector{ - {MatchLabels: label1}, - }) + ng := nropv1.NodeGroup{ + MachineConfigPoolSelector: 
&metav1.LabelSelector{ + MatchLabels: label1, + }, + } + nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng) kubeletConfig := &kubeletconfigv1beta1.KubeletConfiguration{} mcoKc1 = testobjs.NewKubeletConfig("test1", label1, mcp1.Spec.MachineConfigSelector, kubeletConfig) }) diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index dca6203f2..96edc2dde 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -225,38 +225,39 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con // It can take a while. mcpStatuses, allMCPsUpdated := syncMachineConfigPoolsStatuses(instance.Name, trees, r.ForwardMCPConds, mcpUpdatedFunc) instance.Status.MachineConfigPools = mcpStatuses + if !allMCPsUpdated { // the Machine Config Pool still did not apply the machine config, wait for one minute return true, ctrl.Result{RequeueAfter: numaResourcesRetryPeriod}, status.ConditionProgressing, nil } - instance.Status.MachineConfigPools = syncMachineConfigPoolNodeGroupConfigStatuses(instance.Status.MachineConfigPools, trees) + return false, ctrl.Result{}, "", nil } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, string, error) { - daemonSetsInfo, err := r.syncNUMAResourcesOperatorResources(ctx, instance, trees) +func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (map[string]nropv1.NamespacedName, bool, ctrl.Result, string, error) { + daemonSetsInfoPerMCP, err := r.syncNUMAResourcesOperatorResources(ctx, instance, trees) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedRTECreate", "Failed to create Resource-Topology-Exporter DaemonSets: %v", err) - return true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedRTESync: %w", err) + return nil, true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedRTESync: %w", err) } - if len(daemonSetsInfo) == 0 { - return false, ctrl.Result{}, "", nil + if len(daemonSetsInfoPerMCP) == 0 { + return nil, false, ctrl.Result{}, "", nil } r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulRTECreate", "Created Resource-Topology-Exporter DaemonSets") - dsStatuses, allDSsUpdated, err := r.syncDaemonSetsStatuses(ctx, r.Client, daemonSetsInfo) - instance.Status.DaemonSets = dsStatuses - instance.Status.RelatedObjects = relatedobjects.ResourceTopologyExporter(r.Namespace, dsStatuses) + dssWithReadyStatus, allDSsUpdated, err := r.syncDaemonSetsStatuses(ctx, r.Client, daemonSetsInfoPerMCP) + instance.Status.DaemonSets = dssWithReadyStatus + instance.Status.RelatedObjects = relatedobjects.ResourceTopologyExporter(r.Namespace, dssWithReadyStatus) if err != nil { - return true, ctrl.Result{}, status.ConditionDegraded, err + return nil, true, ctrl.Result{}, status.ConditionDegraded, err } if !allDSsUpdated { - return true, ctrl.Result{RequeueAfter: 5 * time.Second}, status.ConditionProgressing, nil + return nil, true, ctrl.Result{RequeueAfter: 5 * time.Second}, status.ConditionProgressing, nil } - return false, ctrl.Result{}, "", nil + return daemonSetsInfoPerMCP, false, ctrl.Result{}, "", nil } func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees 
[]nodegroupv1.Tree) (ctrl.Result, string, error) { @@ -270,15 +271,20 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, } } - if done, res, cond, err := r.reconcileResourceDaemonSet(ctx, instance, trees); done { + dsPerMCP, done, res, cond, err := r.reconcileResourceDaemonSet(ctx, instance, trees) + if done { return res, cond, err } + // all fields of NodeGroupStatus are required so publish the status only when all daemonset and MCPs are updated which + // is a certain thing if we got to this point otherwise the function would have returned already + instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerMCP) + return ctrl.Result{}, status.ConditionAvailable, nil } -func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []nropv1.NamespacedName) ([]nropv1.NamespacedName, bool, error) { - dsStatuses := []nropv1.NamespacedName{} +func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo map[string]nropv1.NamespacedName) ([]nropv1.NamespacedName, bool, error) { + dssWithReadyStatus := []nropv1.NamespacedName{} for _, nname := range daemonSetsInfo { ds := appsv1.DaemonSet{} dsKey := client.ObjectKey{ @@ -287,15 +293,28 @@ func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Con } err := rd.Get(ctx, dsKey, &ds) if err != nil { - return dsStatuses, false, err + return dssWithReadyStatus, false, err } if !isDaemonSetReady(&ds) { - return dsStatuses, false, nil + return dssWithReadyStatus, false, nil + } + dssWithReadyStatus = append(dssWithReadyStatus, nname) + } + return dssWithReadyStatus, true, nil +} + +func syncNodeGroupsStatus(instance *nropv1.NUMAResourcesOperator, dsPerMCP map[string]nropv1.NamespacedName) []nropv1.NodeGroupStatus { + ngStatuses := []nropv1.NodeGroupStatus{} + for _, mcp := range instance.Status.MachineConfigPools { + status := nropv1.NodeGroupStatus{ + PoolName: mcp.Name, + Config: *mcp.Config, + DaemonSet: dsPerMCP[mcp.Name], } - dsStatuses = append(dsStatuses, nname) + ngStatuses = append(ngStatuses, status) } - return dsStatuses, true, nil + return ngStatuses } func (r *NUMAResourcesOperatorReconciler) syncNodeResourceTopologyAPI(ctx context.Context) (bool, error) { @@ -425,7 +444,7 @@ func getMachineConfigPoolStatusByName(mcpStatuses []nropv1.MachineConfigPool, na return nropv1.MachineConfigPool{Name: name} } -func (r *NUMAResourcesOperatorReconciler) syncNUMAResourcesOperatorResources(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]nropv1.NamespacedName, error) { +func (r *NUMAResourcesOperatorReconciler) syncNUMAResourcesOperatorResources(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (map[string]nropv1.NamespacedName, error) { klog.V(4).InfoS("RTESync start", "trees", len(trees)) defer klog.V(4).Info("RTESync stop") @@ -440,35 +459,47 @@ func (r *NUMAResourcesOperatorReconciler) syncNUMAResourcesOperatorResources(ctx } var err error - var daemonSetsNName []nropv1.NamespacedName + dssByMCP := make(map[string]nropv1.NamespacedName) err = rteupdate.DaemonSetUserImageSettings(r.RTEManifests.DaemonSet, instance.Spec.ExporterImage, r.Images.Preferred(), r.ImagePullPolicy) if err != nil { - return daemonSetsNName, err + return dssByMCP, err } err = rteupdate.DaemonSetPauseContainerSettings(r.RTEManifests.DaemonSet) if err != nil { - return daemonSetsNName, err + return dssByMCP, err } err = 
loglevel.UpdatePodSpec(&r.RTEManifests.DaemonSet.Spec.Template.Spec, manifests.ContainerNameRTE, instance.Spec.LogLevel) if err != nil { - return daemonSetsNName, err + return dssByMCP, err } // ConfigMap should be provided by the kubeletconfig reconciliation loop if r.RTEManifests.ConfigMap != nil { cmHash, err := hash.ComputeCurrentConfigMap(ctx, r.Client, r.RTEManifests.ConfigMap) if err != nil { - return daemonSetsNName, err + return dssByMCP, err } rteupdate.DaemonSetHashAnnotation(r.RTEManifests.DaemonSet, cmHash) } rteupdate.SecurityContextConstraint(r.RTEManifests.SecurityContextConstraint, annotations.IsCustomPolicyEnabled(instance.Annotations)) + processor := func(mcpName string, gdm *rtestate.GeneratedDesiredManifest) error { + err := daemonsetUpdater(mcpName, gdm) + if err != nil { + return err + } + dssByMCP[mcpName] = nropv1.NamespacedName{ + Namespace: gdm.DaemonSet.GetNamespace(), + Name: gdm.DaemonSet.GetName(), + } + return nil + } + existing := rtestate.FromClient(ctx, r.Client, r.Platform, r.RTEManifests, instance, trees, r.Namespace) - for _, objState := range existing.State(r.RTEManifests, daemonsetUpdater, annotations.IsCustomPolicyEnabled(instance.Annotations)) { + for _, objState := range existing.State(r.RTEManifests, processor, annotations.IsCustomPolicyEnabled(instance.Annotations)) { if objState.Error != nil { // We are likely in the bootstrap scenario. In this case, which is expected once, everything is fine. // If it happens past bootstrap, still carry on. We know what to do, and we do want to enforce the desired state. @@ -482,16 +513,12 @@ func (r *NUMAResourcesOperatorReconciler) syncNUMAResourcesOperatorResources(ctx if err != nil { return nil, fmt.Errorf("failed to set controller reference to %s %s: %w", objState.Desired.GetNamespace(), objState.Desired.GetName(), err) } - obj, _, err := apply.ApplyObject(ctx, r.Client, objState) + _, _, err = apply.ApplyObject(ctx, r.Client, objState) if err != nil { return nil, fmt.Errorf("failed to apply (%s) %s/%s: %w", objState.Desired.GetObjectKind().GroupVersionKind(), objState.Desired.GetNamespace(), objState.Desired.GetName(), err) } - - if nname, ok := rtestate.DaemonSetNamespacedNameFromObject(obj); ok { - daemonSetsNName = append(daemonSetsNName, nname) - } } - return daemonSetsNName, nil + return dssByMCP, nil } func (r *NUMAResourcesOperatorReconciler) deleteUnusedDaemonSets(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) []error { diff --git a/controllers/numaresourcesoperator_controller_test.go b/controllers/numaresourcesoperator_controller_test.go index 47ef6c0ca..18efbd75a 100644 --- a/controllers/numaresourcesoperator_controller_test.go +++ b/controllers/numaresourcesoperator_controller_test.go @@ -107,29 +107,86 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() { Context("with unexpected NRO CR name", func() { It("should updated the CR condition to degraded", func() { - nro := testobjs.NewNUMAResourcesOperator("test", nil) + nro := testobjs.NewNUMAResourcesOperator("test") verifyDegradedCondition(nro, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName) }) }) - Context("with NRO empty machine config pool selector node group", func() { - It("should updated the CR condition to degraded", func() { - nro := testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, []*metav1.LabelSelector{nil}) + Context("with NRO empty selectors node group", func() { + It("should update the CR condition to degraded", func() { + ng 
:= nropv1.NodeGroup{} + nro := testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng) verifyDegradedCondition(nro, validation.NodeGroupsError) }) }) - Context("without available machine config pools", func() { - It("should updated the CR condition to degraded", func() { - nro := testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, []*metav1.LabelSelector{ - { + Context("with NRO mutiple pool specifiers set on same node group", func() { + It("should update the CR condition to degraded", func() { + pn := "pn-1" + ng := nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{ MatchLabels: map[string]string{"test": "test"}, }, - }) + PoolName: &pn, + } + nro := testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng) verifyDegradedCondition(nro, validation.NodeGroupsError) }) }) + Context("without available machine config pools", func() { + It("should update the CR condition to degraded when MachineConfigPoolSelector is set", func() { + ng := nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"test": "test"}}, + } + nro := testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng) + verifyDegradedCondition(nro, validation.NodeGroupsError) + }) + It("should update the CR condition to degraded when PoolName set", func() { + pn := "pn-1" + ng := nropv1.NodeGroup{ + PoolName: &pn, + } + nro := testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng) + verifyDegradedCondition(nro, validation.NodeGroupsError) + }) + }) + + Context("with two node groups each with different pool specifier type and both point to same MCP", func() { + It("should update the CR condition to degraded", func() { + mcpName := "test1" + label1 := map[string]string{ + "test1": "test1", + } + + ng1 := nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"test": "test"}, + }, + } + ng2 := nropv1.NodeGroup{ + PoolName: &mcpName, + } + nro := testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng1, ng2) + + mcp1 := testobjs.NewMachineConfigPool(mcpName, label1, &metav1.LabelSelector{MatchLabels: label1}, &metav1.LabelSelector{MatchLabels: label1}) + + var err error + reconciler, err := NewFakeNUMAResourcesOperatorReconciler(platform.OpenShift, defaultOCPVersion, nro, mcp1) + Expect(err).ToNot(HaveOccurred()) + + key := client.ObjectKeyFromObject(nro) + result, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) + Expect(err).ToNot(HaveOccurred()) + Expect(result).To(Equal(reconcile.Result{})) + + Expect(reconciler.Client.Get(context.TODO(), key, nro)).ToNot(HaveOccurred()) + degradedCondition := getConditionByType(nro.Status.Conditions, status.ConditionDegraded) + Expect(degradedCondition.Status).To(Equal(metav1.ConditionTrue)) + Expect(degradedCondition.Reason).To(Equal(validation.NodeGroupsError)) + }) + }) + Context("with correct NRO and more than one NodeGroup", func() { var nro *nropv1.NUMAResourcesOperator var mcp1 *machineconfigv1.MachineConfigPool @@ -137,6 +194,7 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() { var reconciler *NUMAResourcesOperatorReconciler var label1, label2 map[string]string + var ng1, ng2 nropv1.NodeGroup BeforeEach(func() { label1 = map[string]string{ @@ -146,10 +204,17 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() { "test2": "test2", } - nro = 
testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, []*metav1.LabelSelector{ - {MatchLabels: label1}, - {MatchLabels: label2}, - }) + ng1 = nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{ + MatchLabels: label1, + }, + } + ng2 = nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{ + MatchLabels: label2, + }, + } + nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng1, ng2) nro.Annotations = map[string]string{annotations.SELinuxPolicyConfigAnnotation: annotations.SELinuxPolicyCustom} mcp1 = testobjs.NewMachineConfigPool("test1", label1, &metav1.LabelSelector{MatchLabels: label1}, &metav1.LabelSelector{MatchLabels: label1}) @@ -336,7 +401,55 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() { }) }) }) + When("add PoolName on existing node group with another specifier already exist", func() { + It("should update the CR condition to degraded", func(ctx context.Context) { + pn := "pool-1" + ng1WithNodeSelector := ng1.DeepCopy() + ng1WithNodeSelector.PoolName = &pn + key := client.ObjectKeyFromObject(nro) + Eventually(func() error { + nroUpdated := &nropv1.NUMAResourcesOperator{} + Expect(reconciler.Client.Get(ctx, key, nroUpdated)) + nroUpdated.Spec.NodeGroups[0] = *ng1WithNodeSelector + return reconciler.Client.Update(context.TODO(), nroUpdated) + }).WithPolling(1 * time.Second).WithTimeout(30 * time.Second).ShouldNot(HaveOccurred()) + verifyDegradedCondition(nro, validation.NodeGroupsError) + }) + }) }) + + Context("with correct NRO and PoolName set", func() { + It("should create RTE daemonset", func(ctx context.Context) { + mcpName := "test1" + label := map[string]string{ + "test1": "test1", + } + ng := nropv1.NodeGroup{ + PoolName: &mcpName, + } + + mcp := testobjs.NewMachineConfigPool(mcpName, label, &metav1.LabelSelector{MatchLabels: label}, &metav1.LabelSelector{MatchLabels: label}) + nro := testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng) + nro.Annotations = map[string]string{annotations.SELinuxPolicyConfigAnnotation: annotations.SELinuxPolicyCustom} + + reconciler := reconcileObjects(nro, mcp) + + mcpDSKey := client.ObjectKey{ + Name: objectnames.GetComponentName(nro.Name, mcp.Name), + Namespace: testNamespace, + } + ds := &appsv1.DaemonSet{} + Expect(reconciler.Client.Get(context.TODO(), mcpDSKey, ds)).ToNot(HaveOccurred()) + + nroUpdated := &nropv1.NUMAResourcesOperator{} + Expect(reconciler.Client.Get(context.TODO(), client.ObjectKeyFromObject(nro), nroUpdated)).ToNot(HaveOccurred()) + + Expect(len(nroUpdated.Status.MachineConfigPools)).To(Equal(1)) + Expect(nroUpdated.Status.MachineConfigPools[0].Name).To(Equal(mcp.Name)) + // TODO check the actual returned config and the daemonset in status, we need to force the NRO condition as Available for this. 
+ }) + }) + Context("with correct NRO CR", func() { var nro *nropv1.NUMAResourcesOperator var mcp1 *machineconfigv1.MachineConfigPool @@ -353,10 +466,17 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() { "test2": "test2", } - nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, []*metav1.LabelSelector{ - {MatchLabels: label1}, - {MatchLabels: label2}, - }) + ng1 := nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{ + MatchLabels: label1, + }, + } + ng2 := nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{ + MatchLabels: label2, + }, + } + nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng1, ng2) nro.Annotations = map[string]string{annotations.SELinuxPolicyConfigAnnotation: annotations.SELinuxPolicyCustom} mcp1 = testobjs.NewMachineConfigPool("test1", label1, &metav1.LabelSelector{MatchLabels: label1}, &metav1.LabelSelector{MatchLabels: label1}) @@ -1134,20 +1254,26 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() { var mcp2 *machineconfigv1.MachineConfigPool var reconciler *NUMAResourcesOperatorReconciler - var label1, label2 map[string]string BeforeEach(func() { - label1 = map[string]string{ + label1 := map[string]string{ "test1": "test1", } - label2 = map[string]string{ + label2 := map[string]string{ "test2": "test2", } - nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, []*metav1.LabelSelector{ - {MatchLabels: label1}, - {MatchLabels: label2}, - }) + ng1 := nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{ + MatchLabels: label1, + }, + } + ng2 := nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{ + MatchLabels: label2, + }, + } + nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng1, ng2) // reconciling NRO object with custom policy, emulates the old behavior version nro.Annotations = map[string]string{annotations.SELinuxPolicyConfigAnnotation: annotations.SELinuxPolicyCustom} @@ -1273,7 +1399,14 @@ func reconcileObjects(nro *nropv1.NUMAResourcesOperator, mcp *machineconfigv1.Ma Status: corev1.ConditionTrue, }, } + mcName := objectnames.GetMachineConfigName(nro.Name, mcp.Name) + mcp.Status.Configuration.Source[0].Name = mcName + Expect(reconciler.Client.Update(context.TODO(), mcp)).Should(Succeed()) + Expect(reconciler.Client.Get(context.TODO(), client.ObjectKeyFromObject(mcp), mcp)).ToNot(HaveOccurred()) + Expect(mcp.Status.Conditions[0].Type).To(Equal(machineconfigv1.MachineConfigPoolUpdated)) + Expect(mcp.Status.Conditions[0].Status).To(Equal(corev1.ConditionTrue)) + Expect(mcp.Status.Configuration.Source[0].Name).To(Equal(mcName)) var secondLoopResult reconcile.Result secondLoopResult, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) diff --git a/internal/objects/objects.go b/internal/objects/objects.go index 4ca53b351..1e8eb66f7 100644 --- a/internal/objects/objects.go +++ b/internal/objects/objects.go @@ -30,14 +30,7 @@ import ( "github.com/openshift-kni/numaresources-operator/internal/api/annotations" ) -func NewNUMAResourcesOperator(name string, labelSelectors []*metav1.LabelSelector) *nropv1.NUMAResourcesOperator { - var nodeGroups []nropv1.NodeGroup - for _, selector := range labelSelectors { - nodeGroups = append(nodeGroups, nropv1.NodeGroup{ - MachineConfigPoolSelector: selector, - }) - } - +func NewNUMAResourcesOperator(name string, nodeGroups ...nropv1.NodeGroup) 
*nropv1.NUMAResourcesOperator { return &nropv1.NUMAResourcesOperator{ TypeMeta: metav1.TypeMeta{ Kind: "NUMAResourcesOperator", @@ -189,3 +182,12 @@ func NamespaceLabels() map[string]string { "security.openshift.io/scc.podSecurityLabelSync": "false", } } + +func GetDaemonSetListFromNodeGroupStatuses(groups []nropv1.NodeGroupStatus) []nropv1.NamespacedName { + dss := []nropv1.NamespacedName{} + for _, group := range groups { + // if NodeGroupStatus is set then it must have all the fields set including the daemonset + dss = append(dss, group.DaemonSet) + } + return dss +} diff --git a/internal/objects/objects_test.go b/internal/objects/objects_test.go index 7f19adb79..f5052f6e9 100644 --- a/internal/objects/objects_test.go +++ b/internal/objects/objects_test.go @@ -17,23 +17,25 @@ package objects import ( + "reflect" "testing" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1" ) func TestNewNUMAResourcesOperator(t *testing.T) { name := "test-nrop" - labelSelectors := []*metav1.LabelSelector{ - { - MatchLabels: map[string]string{ - "unit-test-nrop-obj": "foobar", - }, + ng := nropv1.NodeGroup{ + MachineConfigPoolSelector: &metav1.LabelSelector{MatchLabels: map[string]string{ + "unit-test-nrop-obj": "foobar", + }, }, } - obj := NewNUMAResourcesOperator(name, labelSelectors) + obj := NewNUMAResourcesOperator(name, ng) if obj == nil { t.Fatalf("null object") @@ -93,3 +95,42 @@ func TestNewNamespace(t *testing.T) { } } } + +func TestGetDaemonSetListFromNodeGroupStatuses(t *testing.T) { + ngs := []nropv1.NodeGroupStatus{ + { + PoolName: "pool1", + DaemonSet: nropv1.NamespacedName{ + Name: "daemonset-1", + }, + }, + { + PoolName: "pool2", + DaemonSet: nropv1.NamespacedName{ + Name: "daemonset-2", + }, + }, + { + PoolName: "pool3", + DaemonSet: nropv1.NamespacedName{ + Name: "daemonset-1", // duplicates should not exist, if they do it's a bug and we don't want to ignore it by eliminate the duplicates + }, + }, + } + expectedOutput := []nropv1.NamespacedName{ + { + Name: "daemonset-1", + }, + { + Name: "daemonset-2", + }, + { + Name: "daemonset-1", + }, + } + + got := GetDaemonSetListFromNodeGroupStatuses(ngs) + if !reflect.DeepEqual(got, expectedOutput) { + t.Errorf("unexpected daemonsets list:\n\t%v\n\tgot:\n\t%v", expectedOutput, got) + } +} diff --git a/internal/wait/numaresources.go b/internal/wait/numaresources.go index 06c0bf927..adf5fccf5 100644 --- a/internal/wait/numaresources.go +++ b/internal/wait/numaresources.go @@ -18,11 +18,13 @@ package wait import ( "context" + "reflect" k8swait "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1" + testobjs "github.com/openshift-kni/numaresources-operator/internal/objects" ) func (wt Waiter) ForNUMAResourcesOperatorDeleted(ctx context.Context, nrop *nropv1.NUMAResourcesOperator) error { @@ -62,6 +64,13 @@ func (wt Waiter) ForDaemonsetInNUMAResourcesOperatorStatus(ctx context.Context, klog.Warningf("failed to get the DaemonSet from NUMAResourcesOperator %s", key.String()) return false, nil } + + dssFromNodeGroupStatus := testobjs.GetDaemonSetListFromNodeGroupStatuses(updatedNRO.Status.NodeGroups) + if !reflect.DeepEqual(updatedNRO.Status.DaemonSets, dssFromNodeGroupStatus) { + klog.Warningf("daemonset list mismatch: from NodeGroupStatus'es:\n%+v\n from operator status:\n%+v\n", dssFromNodeGroupStatus, updatedNRO.Status.NodeGroups) + return false, nil + } + 
klog.Infof("Daemonset info %s/%s ready in NUMAResourcesOperator %s", updatedNRO.Status.DaemonSets[0].Namespace, updatedNRO.Status.DaemonSets[0].Name, key.String()) return true, nil }) diff --git a/pkg/status/status_test.go b/pkg/status/status_test.go index 42e947bac..3c8a0c6a2 100644 --- a/pkg/status/status_test.go +++ b/pkg/status/status_test.go @@ -36,7 +36,7 @@ func TestUpdate(t *testing.T) { t.Errorf("nropv1.AddToScheme() failed with: %v", err) } - nro := testobjs.NewNUMAResourcesOperator("test-nro", nil) + nro := testobjs.NewNUMAResourcesOperator("test-nro") fakeClient := fake.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(nro).Build() nro.Status.Conditions, _ = GetUpdatedConditions(nro.Status.Conditions, ConditionProgressing, "testReason", "test message") @@ -64,7 +64,7 @@ func TestUpdateIfNeeded(t *testing.T) { t.Errorf("nropv1.AddToScheme() failed with: %v", err) } - nro := testobjs.NewNUMAResourcesOperator("test-nro", nil) + nro := testobjs.NewNUMAResourcesOperator("test-nro") var ok bool nro.Status.Conditions, ok = GetUpdatedConditions(nro.Status.Conditions, ConditionAvailable, "", "") diff --git a/pkg/validation/validation.go b/pkg/validation/validation.go index 74ccde82a..08847320b 100644 --- a/pkg/validation/validation.go +++ b/pkg/validation/validation.go @@ -33,7 +33,6 @@ const ( ) // MachineConfigPoolDuplicates validates selected MCPs for duplicates -// TODO: move it under the validation webhook once we will have one func MachineConfigPoolDuplicates(trees []nodegroupv1.Tree) error { duplicates := map[string]int{} for _, tree := range trees { @@ -57,13 +56,16 @@ func MachineConfigPoolDuplicates(trees []nodegroupv1.Tree) error { } // NodeGroups validates the node groups for nil values and duplicates. -// TODO: move it under the validation webhook once we will have one func NodeGroups(nodeGroups []nropv1.NodeGroup) error { - if err := nodeGroupsMachineConfigPoolSelector(nodeGroups); err != nil { + if err := nodeGroupPools(nodeGroups); err != nil { return err } - if err := nodeGroupsDuplicates(nodeGroups); err != nil { + if err := nodeGroupsDuplicatesByMCPSelector(nodeGroups); err != nil { + return err + } + + if err := nodeGroupsDuplicatesByPoolName(nodeGroups); err != nil { return err } @@ -74,19 +76,20 @@ func NodeGroups(nodeGroups []nropv1.NodeGroup) error { return nil } -// TODO: move it under the validation webhook once we will have one -func nodeGroupsMachineConfigPoolSelector(nodeGroups []nropv1.NodeGroup) error { - for _, nodeGroup := range nodeGroups { - if nodeGroup.MachineConfigPoolSelector == nil { - return fmt.Errorf("one of the node groups does not have machineConfigPoolSelector") +func nodeGroupPools(nodeGroups []nropv1.NodeGroup) error { + for idx, nodeGroup := range nodeGroups { + if nodeGroup.MachineConfigPoolSelector == nil && nodeGroup.PoolName == nil { + return fmt.Errorf("node group %d missing any pool specifier", idx) + } + if nodeGroup.MachineConfigPoolSelector != nil && nodeGroup.PoolName != nil { + return fmt.Errorf("node group %d must have only a single specifier set: either PoolName or MachineConfigPoolSelector", idx) } } return nil } -// TODO: move it under the validation webhook once we will have one -func nodeGroupsDuplicates(nodeGroups []nropv1.NodeGroup) error { +func nodeGroupsDuplicatesByMCPSelector(nodeGroups []nropv1.NodeGroup) error { duplicates := map[string]int{} for _, nodeGroup := range nodeGroups { if nodeGroup.MachineConfigPoolSelector == nil { @@ -114,7 +117,34 @@ func nodeGroupsDuplicates(nodeGroups 
[]nropv1.NodeGroup) error { return nil } -// TODO: move it under the validation webhook once we will have one +func nodeGroupsDuplicatesByPoolName(nodeGroups []nropv1.NodeGroup) error { + duplicates := map[string]int{} + for _, nodeGroup := range nodeGroups { + if nodeGroup.PoolName == nil { + continue + } + + key := *nodeGroup.PoolName + if _, ok := duplicates[key]; !ok { + duplicates[key] = 0 + } + duplicates[key] += 1 + } + + var duplicateErrors []string + for name, count := range duplicates { + if count > 1 { + duplicateErrors = append(duplicateErrors, fmt.Sprintf("the pool name %q has duplicates", name)) + } + } + + if len(duplicateErrors) > 0 { + return errors.New(strings.Join(duplicateErrors, "; ")) + } + + return nil +} + func nodeGroupMachineConfigPoolSelector(nodeGroups []nropv1.NodeGroup) error { var selectorsErrors []string for _, nodeGroup := range nodeGroups { diff --git a/pkg/validation/validation_test.go b/pkg/validation/validation_test.go index 37349ed38..89422e3d7 100644 --- a/pkg/validation/validation_test.go +++ b/pkg/validation/validation_test.go @@ -25,7 +25,6 @@ import ( nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1" nodegroupv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1/helper/nodegroup" - testobjs "github.com/openshift-kni/numaresources-operator/internal/objects" ) @@ -90,12 +89,14 @@ func TestNodeGroupsSanity(t *testing.T) { expectedErrorMessage string } + poolName := "poolname-test" testCases := []testCase{ { - name: "nil MCP selector", + name: "both source pools are nil", nodeGroups: []nropv1.NodeGroup{ { MachineConfigPoolSelector: nil, + PoolName: nil, }, { MachineConfigPoolSelector: &metav1.LabelSelector{ @@ -106,10 +107,25 @@ func TestNodeGroupsSanity(t *testing.T) { }, }, expectedError: true, - expectedErrorMessage: "one of the node groups does not have machineConfigPoolSelector", + expectedErrorMessage: "missing any pool specifier", + }, + { + name: "both source pools are set", + nodeGroups: []nropv1.NodeGroup{ + { + MachineConfigPoolSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "test": "test", + }, + }, + PoolName: &poolName, + }, + }, + expectedError: true, + expectedErrorMessage: "must have only a single specifier set", }, { - name: "with duplicates", + name: "with duplicates - mcp selector", nodeGroups: []nropv1.NodeGroup{ { MachineConfigPoolSelector: &metav1.LabelSelector{ @@ -129,6 +145,19 @@ func TestNodeGroupsSanity(t *testing.T) { expectedError: true, expectedErrorMessage: "has duplicates", }, + { + name: "with duplicates - pool name", + nodeGroups: []nropv1.NodeGroup{ + { + PoolName: &poolName, + }, + { + PoolName: &poolName, + }, + }, + expectedError: true, + expectedErrorMessage: "has duplicates", + }, { name: "bad MCP selector", nodeGroups: []nropv1.NodeGroup{ @@ -165,6 +194,9 @@ func TestNodeGroupsSanity(t *testing.T) { }, }, }, + { + PoolName: &poolName, + }, }, }, } diff --git a/test/e2e/rte/rte_test.go b/test/e2e/rte/rte_test.go index 8b614de58..9172be26b 100644 --- a/test/e2e/rte/rte_test.go +++ b/test/e2e/rte/rte_test.go @@ -46,6 +46,7 @@ import ( nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1" nodegroupv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1/helper/nodegroup" "github.com/openshift-kni/numaresources-operator/internal/machineconfigpools" + testobjs "github.com/openshift-kni/numaresources-operator/internal/objects" 
"github.com/openshift-kni/numaresources-operator/internal/podlist" "github.com/openshift-kni/numaresources-operator/internal/remoteexec" "github.com/openshift-kni/numaresources-operator/pkg/loglevel" @@ -144,6 +145,8 @@ var _ = ginkgo.Describe("with a running cluster with all the components", func() err := clients.Client.Get(context.TODO(), client.ObjectKey{Name: objectnames.DefaultNUMAResourcesOperatorCrName}, nropObj) gomega.Expect(err).ToNot(gomega.HaveOccurred()) gomega.Expect(nropObj.Status.DaemonSets).ToNot(gomega.BeEmpty()) + dssFromNodeGroupStatus := testobjs.GetDaemonSetListFromNodeGroupStatuses(nropObj.Status.NodeGroups) + gomega.Expect(reflect.DeepEqual(nropObj.Status.DaemonSets, dssFromNodeGroupStatus)).To(gomega.BeTrue()) klog.Infof("NRO %q", nropObj.Name) // NROP guarantees all the daemonsets are in the same namespace, @@ -191,6 +194,8 @@ var _ = ginkgo.Describe("with a running cluster with all the components", func() err := clients.Client.Get(context.TODO(), client.ObjectKey{Name: objectnames.DefaultNUMAResourcesOperatorCrName}, nropObj) gomega.Expect(err).ToNot(gomega.HaveOccurred()) gomega.Expect(nropObj.Status.DaemonSets).ToNot(gomega.BeEmpty()) + dssFromNodeGroupStatus := testobjs.GetDaemonSetListFromNodeGroupStatuses(nropObj.Status.NodeGroups) + gomega.Expect(reflect.DeepEqual(nropObj.Status.DaemonSets, dssFromNodeGroupStatus)).To(gomega.BeTrue()) klog.Infof("NRO %q", nropObj.Name) // NROP guarantees all the daemonsets are in the same namespace,