diff --git a/Taskfile.yaml b/Taskfile.yaml index 8af220c0a..76a6fd0de 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -28,6 +28,9 @@ vars: target: "" VALIDATION_FILES: "tools/validation/{main,messages,diff,no_cyrillic,doc_changes,copyright}.go" + DLV_IMAGE: + sh: if [ -z $DLV_IMAGE ]; then echo "ttl.sh/$(uuidgen | awk '{print tolower($0)}'):10m" ; else echo $DLV_IMAGE ; fi + tasks: check-werf: cmds: @@ -148,6 +151,40 @@ tasks: - task: cve:bin - tools/cve/scan-main.sh {{.REPORT_FILE_NAME}} + dlv:virt-controller:build: + desc: "Build image virt-controller with dlv" + cmd: docker build -f ./images/virt-controller/debug/dlv.Dockerfile -t "{{ .DLV_IMAGE }}" . + + dlv:virt-controller:build-push: + desc: "Build and Push image virt-controller with dlv" + cmds: + - task: dlv:virt-controller:build + - docker push "{{ .DLV_IMAGE }}" + - task: dlv:virt-controller:print + + dlv:virt-controller:print: + desc: "Print commands for debug" + env: + IMAGE: "{{ .DLV_IMAGE }}" + cmd: | + cat < 0 { ++ errs = append(errs, tErrs...) ++ continue ++ } ++ if match { ++ return true, nil ++ } ++ } ++ return false, errors.Flatten(errors.NewAggregate(errs)) ++} ++ ++func isEmptyNodeSelectorTerm(term *v1.NodeSelectorTerm) bool { ++ return len(term.MatchExpressions) == 0 && len(term.MatchFields) == 0 ++} ++ ++func extractNodeFields(n *v1.Node) fields.Set { ++ f := make(fields.Set) ++ if len(n.Name) > 0 { ++ f["metadata.name"] = n.Name ++ } ++ return f ++} ++ ++type nodeSelectorTerm struct { ++ matchLabels labels.Selector ++ matchFields fields.Selector ++ parseErrs []error ++} ++ ++func newNodeSelectorTerm(term *v1.NodeSelectorTerm, path *field.Path) nodeSelectorTerm { ++ var parsedTerm nodeSelectorTerm ++ var errs []error ++ if len(term.MatchExpressions) != 0 { ++ p := path.Child("matchExpressions") ++ parsedTerm.matchLabels, errs = nodeSelectorRequirementsAsSelector(term.MatchExpressions, p) ++ if errs != nil { ++ parsedTerm.parseErrs = append(parsedTerm.parseErrs, errs...) ++ } ++ } ++ if len(term.MatchFields) != 0 { ++ p := path.Child("matchFields") ++ parsedTerm.matchFields, errs = nodeSelectorRequirementsAsFieldSelector(term.MatchFields, p) ++ if errs != nil { ++ parsedTerm.parseErrs = append(parsedTerm.parseErrs, errs...) ++ } ++ } ++ return parsedTerm ++} ++ ++func (t *nodeSelectorTerm) match(nodeLabels labels.Set, nodeFields fields.Set) (bool, []error) { ++ if t.parseErrs != nil { ++ return false, t.parseErrs ++ } ++ if t.matchLabels != nil && !t.matchLabels.Matches(nodeLabels) { ++ return false, nil ++ } ++ if t.matchFields != nil && len(nodeFields) > 0 && !t.matchFields.Matches(nodeFields) { ++ return false, nil ++ } ++ return true, nil ++} ++ ++var validSelectorOperators = []v1.NodeSelectorOperator{ ++ v1.NodeSelectorOpIn, ++ v1.NodeSelectorOpNotIn, ++ v1.NodeSelectorOpExists, ++ v1.NodeSelectorOpDoesNotExist, ++ v1.NodeSelectorOpGt, ++ v1.NodeSelectorOpLt, ++} ++ ++// nodeSelectorRequirementsAsSelector converts the []NodeSelectorRequirement api type into a struct that implements ++// labels.Selector. 
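A quick, self-contained illustration of the conversion described in the comment above: a single `NodeSelectorRequirement`-style expression is turned into a `labels.Requirement` and matched against node labels. Only `k8s.io/apimachinery` is used; the key and values are invented for the example.

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/selection"
)

func main() {
	// Equivalent of a NodeSelectorRequirement: key "kubernetes.io/arch", operator In, values ["amd64"].
	req, err := labels.NewRequirement("kubernetes.io/arch", selection.In, []string{"amd64"})
	if err != nil {
		panic(err)
	}
	selector := labels.NewSelector().Add(*req)

	// Node labels are matched as a labels.Set, just as nodeSelectorRequirementsAsSelector does.
	nodeLabels := labels.Set{"kubernetes.io/arch": "amd64", "kubernetes.io/os": "linux"}
	fmt.Println(selector.Matches(nodeLabels)) // true
}
```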
++func nodeSelectorRequirementsAsSelector(nsm []v1.NodeSelectorRequirement, path *field.Path) (labels.Selector, []error) { ++ if len(nsm) == 0 { ++ return labels.Nothing(), nil ++ } ++ var errs []error ++ selector := labels.NewSelector() ++ for i, expr := range nsm { ++ p := path.Index(i) ++ var op selection.Operator ++ switch expr.Operator { ++ case v1.NodeSelectorOpIn: ++ op = selection.In ++ case v1.NodeSelectorOpNotIn: ++ op = selection.NotIn ++ case v1.NodeSelectorOpExists: ++ op = selection.Exists ++ case v1.NodeSelectorOpDoesNotExist: ++ op = selection.DoesNotExist ++ case v1.NodeSelectorOpGt: ++ op = selection.GreaterThan ++ case v1.NodeSelectorOpLt: ++ op = selection.LessThan ++ default: ++ errs = append(errs, field.NotSupported(p.Child("operator"), expr.Operator, validSelectorOperators)) ++ continue ++ } ++ r, err := labels.NewRequirement(expr.Key, op, expr.Values, field.WithPath(p)) ++ if err != nil { ++ errs = append(errs, err) ++ } else { ++ selector = selector.Add(*r) ++ } ++ } ++ if len(errs) != 0 { ++ return nil, errs ++ } ++ return selector, nil ++} ++ ++var validFieldSelectorOperators = []v1.NodeSelectorOperator{ ++ v1.NodeSelectorOpIn, ++ v1.NodeSelectorOpNotIn, ++} ++ ++// nodeSelectorRequirementsAsFieldSelector converts the []NodeSelectorRequirement core type into a struct that implements ++// fields.Selector. ++func nodeSelectorRequirementsAsFieldSelector(nsr []v1.NodeSelectorRequirement, path *field.Path) (fields.Selector, []error) { ++ if len(nsr) == 0 { ++ return fields.Nothing(), nil ++ } ++ var errs []error ++ ++ var selectors []fields.Selector ++ for i, expr := range nsr { ++ p := path.Index(i) ++ switch expr.Operator { ++ case v1.NodeSelectorOpIn: ++ if len(expr.Values) != 1 { ++ errs = append(errs, field.Invalid(p.Child("values"), expr.Values, "must have one element")) ++ } else { ++ selectors = append(selectors, fields.OneTermEqualSelector(expr.Key, expr.Values[0])) ++ } ++ ++ case v1.NodeSelectorOpNotIn: ++ if len(expr.Values) != 1 { ++ errs = append(errs, field.Invalid(p.Child("values"), expr.Values, "must have one element")) ++ } else { ++ selectors = append(selectors, fields.OneTermNotEqualSelector(expr.Key, expr.Values[0])) ++ } ++ ++ default: ++ errs = append(errs, field.NotSupported(p.Child("operator"), expr.Operator, validFieldSelectorOperators)) ++ } ++ } ++ ++ if len(errs) != 0 { ++ return nil, errs ++ } ++ return fields.AndSelectors(selectors...), nil ++} ++ ++type RequiredNodeAffinity struct { ++ labelSelector labels.Selector ++ nodeSelector *LazyErrorNodeSelector ++} ++ ++// GetRequiredNodeAffinity returns the parsing result of pod's nodeSelector and nodeAffinity. ++func GetRequiredNodeAffinity(pod *v1.Pod) RequiredNodeAffinity { ++ var selector labels.Selector ++ if len(pod.Spec.NodeSelector) > 0 { ++ selector = labels.SelectorFromSet(pod.Spec.NodeSelector) ++ } ++ // Use LazyErrorNodeSelector for backwards compatibility of parsing errors. ++ var affinity *LazyErrorNodeSelector ++ if pod.Spec.Affinity != nil && ++ pod.Spec.Affinity.NodeAffinity != nil && ++ pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil { ++ affinity = NewLazyErrorNodeSelector(pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution) ++ } ++ return RequiredNodeAffinity{labelSelector: selector, nodeSelector: affinity} ++} ++ ++// Match checks whether the pod is schedulable onto nodes according to ++// the requirements in both nodeSelector and nodeAffinity. 
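`GetRequiredNodeAffinity` and its `Match` method are the entry points the VMI controller uses later in this patch. A minimal usage sketch, assuming the patched KubeVirt tree is importable under its usual module path; the pod and node contents are invented for illustration:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"kubevirt.io/kubevirt/pkg/util/affinity"
)

func main() {
	// A pod whose nodeSelector no longer fits the node it runs on.
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "virt-launcher-example"},
		Spec:       v1.PodSpec{NodeSelector: map[string]string{"zone": "a"}},
	}
	node := &v1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "node-1", Labels: map[string]string{"zone": "b"}},
	}

	required := affinity.GetRequiredNodeAffinity(pod)
	match, err := required.Match(node)
	fmt.Println(match, err) // false <nil>: the nodeSelector does not match the node labels
}
```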
++func (s RequiredNodeAffinity) Match(node *v1.Node) (bool, error) { ++ if s.labelSelector != nil { ++ if !s.labelSelector.Matches(labels.Set(node.Labels)) { ++ return false, nil ++ } ++ } ++ if s.nodeSelector != nil { ++ return s.nodeSelector.Match(node) ++ } ++ return true, nil ++} +diff --git a/pkg/util/affinity/podaffinity.go b/pkg/util/affinity/podaffinity.go +new file mode 100644 +index 0000000000..b16c2f365f +--- /dev/null ++++ b/pkg/util/affinity/podaffinity.go +@@ -0,0 +1,125 @@ ++/* ++Copyright 2015 The Kubernetes Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++*/ ++ ++package affinity ++ ++import ( ++ v1 "k8s.io/api/core/v1" ++ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ++ "k8s.io/apimachinery/pkg/labels" ++ "k8s.io/apimachinery/pkg/util/sets" ++) ++ ++// AffinityTerm is a processed version of v1.PodAffinityTerm. ++type AffinityTerm struct { ++ Namespaces sets.Set[string] ++ Selector labels.Selector ++ TopologyKey string ++ NamespaceSelector labels.Selector ++} ++ ++// Matches returns true if the pod matches the label selector and namespaces or namespace selector. ++func (at *AffinityTerm) Matches(pod *v1.Pod, nsLabels labels.Set) bool { ++ if at.Namespaces.Has(pod.Namespace) || at.NamespaceSelector.Matches(nsLabels) { ++ return at.Selector.Matches(labels.Set(pod.Labels)) ++ } ++ return false ++} ++ ++func newAffinityTerm(pod *v1.Pod, term *v1.PodAffinityTerm) (*AffinityTerm, error) { ++ selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector) ++ if err != nil { ++ return nil, err ++ } ++ ++ namespaces := getNamespacesFromPodAffinityTerm(pod, term) ++ nsSelector, err := metav1.LabelSelectorAsSelector(term.NamespaceSelector) ++ if err != nil { ++ return nil, err ++ } ++ ++ return &AffinityTerm{Namespaces: namespaces, Selector: selector, TopologyKey: term.TopologyKey, NamespaceSelector: nsSelector}, nil ++} ++ ++// GetAffinityTerms receives a Pod and affinity terms and returns the namespaces and ++// selectors of the terms. ++func GetAffinityTerms(pod *v1.Pod, v1Terms []v1.PodAffinityTerm) ([]AffinityTerm, error) { ++ if v1Terms == nil { ++ return nil, nil ++ } ++ ++ var terms []AffinityTerm ++ for i := range v1Terms { ++ t, err := newAffinityTerm(pod, &v1Terms[i]) ++ if err != nil { ++ // We get here if the label selector failed to process ++ return nil, err ++ } ++ terms = append(terms, *t) ++ } ++ return terms, nil ++} ++ ++// returns a set of names according to the namespaces indicated in podAffinityTerm. ++// If namespaces is empty it considers the given pod's namespace. ++func getNamespacesFromPodAffinityTerm(pod *v1.Pod, podAffinityTerm *v1.PodAffinityTerm) sets.Set[string] { ++ names := sets.Set[string]{} ++ if len(podAffinityTerm.Namespaces) == 0 && podAffinityTerm.NamespaceSelector == nil { ++ names.Insert(pod.Namespace) ++ } else { ++ names.Insert(podAffinityTerm.Namespaces...) 
++ } ++ return names ++} ++ ++func GetPodAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm) { ++ if affinity != nil && affinity.PodAffinity != nil { ++ if len(affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 { ++ terms = affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution ++ } ++ } ++ return terms ++} ++ ++func GetPodAntiAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm) { ++ if affinity != nil && affinity.PodAntiAffinity != nil { ++ if len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 { ++ terms = affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution ++ } ++ } ++ return terms ++} ++ ++func MatchPodAffinityTerms(terms []AffinityTerm, pod *v1.Pod, namespaceLabels labels.Set) bool { ++ if len(terms) == 0 { ++ return true ++ } ++ for _, term := range terms { ++ if term.Matches(pod, namespaceLabels) { ++ return true ++ } ++ } ++ return false ++} ++ ++func MatchPodAntiAffinityTerms(terms []AffinityTerm, pod *v1.Pod, namespaceLabels labels.Set) bool { ++ for _, term := range terms { ++ if term.Matches(pod, namespaceLabels) { ++ return true ++ } ++ } ++ return false ++} +diff --git a/pkg/virt-controller/watch/application.go b/pkg/virt-controller/watch/application.go +index 4a8d20d7be..17711edba5 100644 +--- a/pkg/virt-controller/watch/application.go ++++ b/pkg/virt-controller/watch/application.go +@@ -656,6 +656,9 @@ func (vca *VirtControllerApp) initCommon() { + vca.cdiConfigInformer, + vca.clusterConfig, + topologyHinter, ++ vca.allPodInformer, ++ vca.namespaceInformer, ++ vca.nodeInformer, + ) + if err != nil { + panic(err) +diff --git a/pkg/virt-controller/watch/vmi.go b/pkg/virt-controller/watch/vmi.go +index 0c4bfca389..fa4e86ee17 100644 +--- a/pkg/virt-controller/watch/vmi.go ++++ b/pkg/virt-controller/watch/vmi.go +@@ -69,6 +69,10 @@ import ( + virtconfig "kubevirt.io/kubevirt/pkg/virt-config" + "kubevirt.io/kubevirt/pkg/virt-controller/services" + "kubevirt.io/kubevirt/pkg/virt-controller/watch/descheduler" ++ ++ "k8s.io/apimachinery/pkg/labels" ++ ++ "kubevirt.io/kubevirt/pkg/util/affinity" + ) + + const ( +@@ -92,6 +96,9 @@ func NewVMIController(templateService services.TemplateService, + cdiConfigInformer cache.SharedIndexInformer, + clusterConfig *virtconfig.ClusterConfig, + topologyHinter topology.Hinter, ++ allPodInformer cache.SharedIndexInformer, ++ namespaceInformer cache.SharedIndexInformer, ++ nodeInformer cache.SharedIndexInformer, + ) (*VMIController, error) { + + c := &VMIController{ +@@ -112,12 +119,17 @@ func NewVMIController(templateService services.TemplateService, + topologyHinter: topologyHinter, + cidsMap: newCIDsMap(), + backendStorage: backendstorage.NewBackendStorage(clientset, clusterConfig, storageClassInformer.GetStore(), storageProfileInformer.GetStore(), pvcInformer.GetIndexer()), ++ ++ allPodIndexer: allPodInformer.GetIndexer(), ++ namespaceIndexer: namespaceInformer.GetIndexer(), ++ nodeIndexer: nodeInformer.GetIndexer(), + } + + c.hasSynced = func() bool { + return vmInformer.HasSynced() && vmiInformer.HasSynced() && podInformer.HasSynced() && + dataVolumeInformer.HasSynced() && cdiConfigInformer.HasSynced() && cdiInformer.HasSynced() && +- pvcInformer.HasSynced() && storageClassInformer.HasSynced() && storageProfileInformer.HasSynced() ++ pvcInformer.HasSynced() && storageClassInformer.HasSynced() && storageProfileInformer.HasSynced() && ++ allPodInformer.HasSynced() && namespaceInformer.HasSynced() && nodeInformer.HasSynced() + } + + 
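Putting the podaffinity.go helpers together: the VMI controller later needs to know whether at least one pod already running on a node satisfies the template pod's required podAffinity terms. A hedged sketch of that check follows; the package and helper names are illustrative, not part of the patch.

```go
package example

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/labels"

	"kubevirt.io/kubevirt/pkg/util/affinity"
)

// anyPodSatisfiesAffinity reports whether at least one candidate pod on the node
// satisfies the template pod's required podAffinity terms. nsLabels maps a
// namespace name to its labels, mirroring what the namespace informer provides.
func anyPodSatisfiesAffinity(templatePod *v1.Pod, candidates []*v1.Pod, nsLabels map[string]labels.Set) (bool, error) {
	terms, err := affinity.GetAffinityTerms(templatePod, affinity.GetPodAffinityTerms(templatePod.Spec.Affinity))
	if err != nil {
		return false, err
	}
	if len(terms) == 0 {
		// No required podAffinity at all: any node is acceptable on this axis.
		return true, nil
	}
	for _, p := range candidates {
		if affinity.MatchPodAffinityTerms(terms, p, nsLabels[p.Namespace]) {
			return true, nil
		}
	}
	return false, nil
}
```

Anti-affinity is the mirror image: with `MatchPodAntiAffinityTerms`, a single matching pod is enough to reject the node.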
_, err := vmiInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ +@@ -221,6 +233,10 @@ type VMIController struct { + cidsMap *cidsMap + backendStorage *backendstorage.BackendStorage + hasSynced func() bool ++ ++ allPodIndexer cache.Indexer ++ namespaceIndexer cache.Indexer ++ nodeIndexer cache.Indexer + } + + func (c *VMIController) Run(threadiness int, stopCh <-chan struct{}) { +@@ -691,6 +707,10 @@ func (c *VMIController) updateStatus(vmi *virtv1.VirtualMachineInstance, pod *k8 + c.syncVolumesUpdate(vmiCopy) + } + ++ if err := c.syncNodePlacementCondition(vmiCopy, pod); err != nil { ++ return fmt.Errorf("failed to update condition %s: %w", virtv1.VirtualMachineInstanceNodePlacementNotMatched, err) ++ } ++ + case vmi.IsScheduled(): + if !vmiPodExists { + vmiCopy.Status.Phase = virtv1.Failed +@@ -2416,6 +2436,172 @@ func (c *VMIController) syncVolumesUpdate(vmi *virtv1.VirtualMachineInstance) { + vmiConditions.UpdateCondition(vmi, &condition) + } + ++func (c *VMIController) syncNodePlacementCondition(vmi *virtv1.VirtualMachineInstance, pod *k8sv1.Pod) error { ++ status := k8sv1.ConditionFalse ++ templatePod, err := c.templateService.RenderLaunchManifest(vmi) ++ if err != nil { ++ return fmt.Errorf("failed to render pod manifest: %w", err) ++ } ++ changed, err := c.isChangedNodePlacement(pod, templatePod) ++ if err != nil { ++ return fmt.Errorf("could not verify if NodePlacement update is required: %w", err) ++ } ++ if changed { ++ matched, err := c.nodePlacementIsMatched(pod, templatePod) ++ if err != nil { ++ return fmt.Errorf("failed to verify if NodePlacement update is matched: %w", err) ++ } ++ if !matched { ++ status = k8sv1.ConditionTrue ++ } ++ } ++ c.syncNodePlacementNotMatchedCondition(vmi, status) ++ return nil ++} ++ ++func (c *VMIController) isChangedNodePlacement(pod, templatePod *k8sv1.Pod) (bool, error) { ++ if pod == nil || templatePod == nil { ++ return false, nil ++ } ++ ++ // When the migration controller creates the target pod,
it carries an extra PodAntiAffinity term selecting on virtv1.CreatedByLabel; copy that term onto the template pod so it is not reported as a placement change. ++ { ++ var antiAffinityTerm *k8sv1.PodAffinityTerm ++ ++ if pod.Spec.Affinity != nil && ++ pod.Spec.Affinity.PodAntiAffinity != nil && ++ len(pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) > 0 { ++ for _, rd := range pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution { ++ if rd.LabelSelector != nil { ++ if _, found := rd.LabelSelector.MatchLabels[virtv1.CreatedByLabel]; found { ++ antiAffinityTerm = rd.DeepCopy() ++ } ++ } ++ } ++ } ++ if antiAffinityTerm != nil { ++ antiAffinityRule := &k8sv1.PodAntiAffinity{ ++ RequiredDuringSchedulingIgnoredDuringExecution: []k8sv1.PodAffinityTerm{*antiAffinityTerm}, ++ } ++ if templatePod.Spec.Affinity == nil { ++ templatePod.Spec.Affinity = &k8sv1.Affinity{ ++ PodAntiAffinity: antiAffinityRule, ++ } ++ } else if templatePod.Spec.Affinity.PodAntiAffinity == nil { ++ templatePod.Spec.Affinity.PodAntiAffinity = antiAffinityRule ++ } else { ++ templatePod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = append(templatePod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, *antiAffinityTerm) ++ } ++ } ++ } ++ ++ return !equality.Semantic.DeepEqual(pod.Spec.NodeSelector, templatePod.Spec.NodeSelector) || ++ !equality.Semantic.DeepEqual(pod.Spec.Affinity, templatePod.Spec.Affinity), nil ++} ++ ++func (c *VMIController) nodePlacementIsMatched(pod, templatePod *k8sv1.Pod) (bool, error) { ++ if pod == nil || templatePod == nil { ++ return false, fmt.Errorf("pod or templatePod must not be nil") ++ } ++ templatePod.Namespace = pod.Namespace ++ templatePod.Name = pod.Name ++ obj, exist, err := c.nodeIndexer.GetByKey(pod.Spec.NodeName) ++ if err != nil { ++ return false, err ++ } ++ node, ok := obj.(*k8sv1.Node) ++ if !exist || !ok || node == nil { ++ return false, fmt.Errorf("not found node %s", pod.Spec.NodeName) ++ } ++ ++ requiredNodeSelectorAndAffinity := affinity.GetRequiredNodeAffinity(templatePod) ++ match, err := requiredNodeSelectorAndAffinity.Match(node) ++ if err != nil { ++ return false, fmt.Errorf("failed to match required node selector and affinity: %w", err) ++ } ++ if !match { ++ return false, nil ++ } ++ ++ pods, err := c.listPodsByNode(pod.Spec.NodeName) ++ if err != nil { ++ return false, err ++ } ++ ++ podNamespaces := make(map[string]struct{}) ++ for _, p := range pods { ++ podNamespaces[p.GetNamespace()] = struct{}{} ++ } ++ allNamespaces := c.namespaceIndexer.List() ++ namespaceLabels := make(map[string]labels.Set, len(podNamespaces)) ++ for _, o := range allNamespaces { ++ ns := o.(*k8sv1.Namespace) ++ if _, ok := podNamespaces[ns.GetName()]; ok { ++ namespaceLabels[ns.GetName()] = ns.GetLabels() ++ } ++ } ++ ++ podAffinityTerms, err := affinity.GetAffinityTerms(templatePod, affinity.GetPodAffinityTerms(templatePod.Spec.Affinity)) ++ if err != nil { ++ return false, err ++ } ++ podAntiAffinityTerms, err := affinity.GetAffinityTerms(templatePod, affinity.GetPodAntiAffinityTerms(templatePod.Spec.Affinity)) ++ if err != nil { ++ return false, err ++ } ++ ++ var ( ++ podMatchedByPodAffinityFound bool ++ ) ++ ++ for _, p := range pods { ++ if p.GetUID() == pod.GetUID() { ++ continue ++ } ++ if p.Status.Phase == k8sv1.PodSucceeded || p.Status.Phase == k8sv1.PodFailed { ++ continue ++ } ++ nsLabels := namespaceLabels[p.GetNamespace()] ++ ++ // If at least one pod matches the podAffinity terms, the node placement is suitable.
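`nodePlacementIsMatched` calls `listPodsByNode`, which expects the shared pod informer to expose an index named `node`. That index is registered elsewhere in the controller setup; a minimal client-go sketch of what such a registration typically looks like (the helper name and wiring are assumptions for illustration, not part of this diff):

```go
package example

import (
	"fmt"

	k8sv1 "k8s.io/api/core/v1"
	"k8s.io/client-go/tools/cache"
)

// addNodeIndex registers a "node" index so that
// informer.GetIndexer().ByIndex("node", nodeName) returns the pods scheduled to that node.
func addNodeIndex(podInformer cache.SharedIndexInformer) error {
	return podInformer.AddIndexers(cache.Indexers{
		"node": func(obj interface{}) ([]string, error) {
			pod, ok := obj.(*k8sv1.Pod)
			if !ok {
				return nil, fmt.Errorf("unexpected object type %T", obj)
			}
			if pod.Spec.NodeName == "" {
				// Not scheduled yet; do not index.
				return nil, nil
			}
			return []string{pod.Spec.NodeName}, nil
		},
	})
}
```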
++ if !podMatchedByPodAffinityFound && affinity.MatchPodAffinityTerms(podAffinityTerms, p, nsLabels) { ++ podMatchedByPodAffinityFound = true ++ } ++ // If at least one matches the podAntiAffinity, then node placement is not suitable. return false ++ if affinity.MatchPodAntiAffinityTerms(podAntiAffinityTerms, p, nsLabels) { ++ return false, nil ++ } ++ } ++ ++ return podMatchedByPodAffinityFound, nil ++} ++ ++// listPodsByNode takes a node and returns all Pods from the pod cache which run on this node ++func (c *VMIController) listPodsByNode(node string) ([]*k8sv1.Pod, error) { ++ objs, err := c.allPodIndexer.ByIndex("node", node) ++ if err != nil { ++ return nil, err ++ } ++ pods := make([]*k8sv1.Pod, 0, len(objs)) ++ for _, obj := range objs { ++ pod, ok := obj.(*k8sv1.Pod) ++ if ok { ++ pods = append(pods, pod) ++ } ++ } ++ return pods, nil ++} ++ ++func (c *VMIController) syncNodePlacementNotMatchedCondition(vmi *virtv1.VirtualMachineInstance, status k8sv1.ConditionStatus) { ++ vmiConditions := controller.NewVirtualMachineInstanceConditionManager() ++ condition := virtv1.VirtualMachineInstanceCondition{ ++ Type: virtv1.VirtualMachineInstanceNodePlacementNotMatched, ++ Status: status, ++ LastTransitionTime: v1.Now(), ++ } ++ vmiConditions.UpdateCondition(vmi, &condition) ++} ++ + func (c *VMIController) aggregateDataVolumesConditions(vmiCopy *virtv1.VirtualMachineInstance, dvs []*cdiv1.DataVolume) { + if len(dvs) == 0 { + return +diff --git a/pkg/virt-controller/watch/workload-updater/workload-updater.go b/pkg/virt-controller/watch/workload-updater/workload-updater.go +index a7d0f76e24..e9205679de 100644 +--- a/pkg/virt-controller/watch/workload-updater/workload-updater.go ++++ b/pkg/virt-controller/watch/workload-updater/workload-updater.go +@@ -214,7 +214,7 @@ func (c *WorkloadUpdateController) updateVmi(_, obj interface{}) { + return + } + +- if !(isHotplugInProgress(vmi) || isVolumesUpdateInProgress(vmi)) || ++ if !(isHotplugInProgress(vmi) || isVolumesUpdateInProgress(vmi) || isNodePlacementInProgress(vmi)) || + migrationutils.IsMigrating(vmi) { + return + } +@@ -324,6 +324,11 @@ func isVolumesUpdateInProgress(vmi *virtv1.VirtualMachineInstance) bool { + virtv1.VirtualMachineInstanceVolumesChange, k8sv1.ConditionTrue) + } + ++func isNodePlacementInProgress(vmi *virtv1.VirtualMachineInstance) bool { ++ return controller.NewVirtualMachineInstanceConditionManager().HasConditionWithStatus(vmi, ++ virtv1.VirtualMachineInstanceNodePlacementNotMatched, k8sv1.ConditionTrue) ++} ++ + func (c *WorkloadUpdateController) doesRequireMigration(vmi *virtv1.VirtualMachineInstance) bool { + if vmi.IsFinal() || migrationutils.IsMigrating(vmi) { + return false +@@ -337,6 +342,9 @@ func (c *WorkloadUpdateController) doesRequireMigration(vmi *virtv1.VirtualMachi + if isVolumesUpdateInProgress(vmi) { + return true + } ++ if isNodePlacementInProgress(vmi) { ++ return true ++ } + + return false + } +@@ -352,6 +360,9 @@ func (c *WorkloadUpdateController) shouldAbortMigration(vmi *virtv1.VirtualMachi + if isVolumesUpdateInProgress(vmi) { + return false + } ++ if isNodePlacementInProgress(vmi) { ++ return false ++ } + if vmi.Status.MigrationState != nil && vmi.Status.MigrationState.TargetNodeDomainReadyTimestamp != nil { + return false + } +diff --git a/pkg/virt-handler/vm.go b/pkg/virt-handler/vm.go +index cdc1f815c3..24352cf6e9 100644 +--- a/pkg/virt-handler/vm.go ++++ b/pkg/virt-handler/vm.go +@@ -3468,6 +3468,7 @@ func (d *VirtualMachineController) finalizeMigration(vmi *v1.VirtualMachineInsta + 
d.recorder.Event(vmi, k8sv1.EventTypeWarning, err.Error(), "failed to update guest memory") } removeMigratedVolumes(vmi) ++ finalizeNodePlacement(vmi) + + options := &cmdv1.VirtualMachineOptions{} + options.InterfaceMigration = domainspec.BindingMigrationByInterfaceName(vmi.Spec.Domain.Devices.Interfaces, d.clusterConfig.GetNetworkBindings()) +@@ -3684,6 +3685,10 @@ func (d *VirtualMachineController) hotplugMemory(vmi *v1.VirtualMachineInstance, + return nil + } + ++func finalizeNodePlacement(vmi *v1.VirtualMachineInstance) { ++ controller.NewVirtualMachineInstanceConditionManager().RemoveCondition(vmi, v1.VirtualMachineInstanceNodePlacementNotMatched) ++} ++ + func removeMigratedVolumes(vmi *v1.VirtualMachineInstance) { + vmiConditions := controller.NewVirtualMachineInstanceConditionManager() + vmiConditions.RemoveCondition(vmi, v1.VirtualMachineInstanceVolumesChange) +diff --git a/staging/src/kubevirt.io/api/core/v1/types.go b/staging/src/kubevirt.io/api/core/v1/types.go +index 7aa814d8f1..841387d304 100644 +--- a/staging/src/kubevirt.io/api/core/v1/types.go ++++ b/staging/src/kubevirt.io/api/core/v1/types.go +@@ -568,6 +568,9 @@ const ( + + // Summarizes that all the DataVolumes attached to the VMI are Ready or not + VirtualMachineInstanceDataVolumesReady VirtualMachineInstanceConditionType = "DataVolumesReady" ++ ++ // Indicates that the VMI has affinity or nodeSelector changes ++ VirtualMachineInstanceNodePlacementNotMatched VirtualMachineInstanceConditionType = "NodePlacementNotMatched" + ) + + // These are valid reasons for VMI conditions. diff --git a/images/virt-artifact/patches/README.md b/images/virt-artifact/patches/README.md index 377dfb135..2e5769808 100644 --- a/images/virt-artifact/patches/README.md +++ b/images/virt-artifact/patches/README.md @@ -93,3 +93,16 @@ Currently covered metrics: - virt-handler - virt-controller - virt-api + +#### `025-stream-graceful-shutdown.patch` + +Graceful termination of websocket connections for serial console and VNC connections. + +#### `026-auto-migrate-if-nodeplacement-changed.patch` + +Start a migration if the nodeSelector or affinity has changed. +How does it work? +1. When the affinity or nodeSelector changes in the VM, the VM controller updates the VMI specification. +2. When the affinity or nodeSelector changes in the VMI, the VMI controller sets the `NodePlacementNotMatched` condition to True on the VMI. +3. The workload-updater controller monitors the VMI and starts a migration when the `NodePlacementNotMatched` condition is set to True on the VMI. +4.
When the migration is completed, virt-handler will remove the condition `NodePlacementNotMatched` from the vmi diff --git a/images/virt-controller/debug/dlv.Dockerfile b/images/virt-controller/debug/dlv.Dockerfile new file mode 100644 index 000000000..fc947a001 --- /dev/null +++ b/images/virt-controller/debug/dlv.Dockerfile @@ -0,0 +1,31 @@ +FROM golang:1.22.7 AS builder + +ENV VERSION="1.3.1" +ENV GOVERSION="1.22.7" + +RUN go install github.com/go-delve/delve/cmd/dlv@latest + +RUN git clone --depth 1 --branch v$VERSION https://github.com/kubevirt/kubevirt.git /kubevirt +COPY ./images/virt-artifact/patches /patches +WORKDIR /kubevirt +RUN for p in /patches/*.patch ; do git apply --ignore-space-change --ignore-whitespace ${p} && echo OK || (echo FAIL ; exit 1) ; done + +RUN go mod edit -go=$GOVERSION && \ + go mod download + +RUN go mod vendor + +ENV GO111MODULE=on +ENV GOOS=linux +ENV CGO_ENABLED=0 +ENV GOARCH=amd64 + +RUN go build -o /kubevirt-binaries/virt-controller ./cmd/virt-controller/ + +FROM busybox + +WORKDIR /app +COPY --from=builder /kubevirt-binaries/virt-controller /app/virt-controller +COPY --from=builder /go/bin/dlv /app/dlv +USER 65532:65532 +ENTRYPOINT ["./dlv", "--listen=:2345", "--headless=true", "--continue", "--log=true", "--log-output=debugger,debuglineerr,gdbwire,lldbout,rpc", "--accept-multiclient", "--api-version=2", "exec", "/app/virt-controller", "--"] diff --git a/images/virtualization-artifact/cmd/virtualization-controller/main.go b/images/virtualization-artifact/cmd/virtualization-controller/main.go index 5606b2398..1415a6b37 100644 --- a/images/virtualization-artifact/cmd/virtualization-controller/main.go +++ b/images/virtualization-artifact/cmd/virtualization-controller/main.go @@ -52,6 +52,7 @@ import ( "github.com/deckhouse/virtualization-controller/pkg/controller/vmrestore" "github.com/deckhouse/virtualization-controller/pkg/controller/vmsnapshot" "github.com/deckhouse/virtualization-controller/pkg/logger" + "github.com/deckhouse/virtualization-controller/pkg/version" "github.com/deckhouse/virtualization/api/client/kubeclient" virtv2alpha1 "github.com/deckhouse/virtualization/api/core/v1alpha2" ) @@ -316,6 +317,7 @@ func main() { func printVersion(log *log.Logger) { log.Info(fmt.Sprintf("Go Version: %s", runtime.Version())) log.Info(fmt.Sprintf("Go OS/Arch: %s/%s", runtime.GOOS, runtime.GOARCH)) + log.Info(fmt.Sprintf("Edition: %s", version.GetEdition())) } func getEnv(env, defaultEnv string) string { diff --git a/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm.go b/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm.go index 3795ecf07..1ed9f7fd6 100644 --- a/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm.go +++ b/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm.go @@ -96,7 +96,10 @@ func (b *KVVM) SetKVVMIAnnotation(annoKey, annoValue string) { } func (b *KVVM) SetCPUModel(class *virtv2.VirtualMachineClass) error { - var cpu virtv1.CPU + if b.Resource.Spec.Template.Spec.Domain.CPU == nil { + b.Resource.Spec.Template.Spec.Domain.CPU = &virtv1.CPU{} + } + cpu := b.Resource.Spec.Template.Spec.Domain.CPU switch class.Spec.CPU.Type { case virtv2.CPUTypeHost: @@ -106,23 +109,21 @@ func (b *KVVM) SetCPUModel(class *virtv2.VirtualMachineClass) error { case virtv2.CPUTypeModel: cpu.Model = class.Spec.CPU.Model case virtv2.CPUTypeFeatures, virtv2.CPUTypeDiscovery: - cpu.Features = make([]virtv1.CPUFeature, len(class.Status.CpuFeatures.Enabled)) + features := make([]virtv1.CPUFeature, 
len(class.Status.CpuFeatures.Enabled)) for i, feature := range class.Status.CpuFeatures.Enabled { policy := "require" if feature == "invtsc" { policy = "optional" } - cpu.Features[i] = virtv1.CPUFeature{ + features[i] = virtv1.CPUFeature{ Name: feature, Policy: policy, } } + cpu.Features = features default: return fmt.Errorf("unexpected cpu type: %q", class.Spec.CPU.Type) } - - b.Resource.Spec.Template.Spec.Domain.CPU = &cpu - return nil } @@ -206,24 +207,43 @@ func (b *KVVM) SetTopologySpreadConstraint(topology []corev1.TopologySpreadConst b.Resource.Spec.Template.Spec.TopologySpreadConstraints = topology } -func (b *KVVM) SetResourceRequirements(cores int, coreFraction string, memorySize resource.Quantity) error { +func (b *KVVM) SetCpu(cores int, coreFraction string) error { + domainSpec := &b.Resource.Spec.Template.Spec.Domain + if domainSpec.CPU == nil { + domainSpec.CPU = &virtv1.CPU{} + } cpuRequest, err := GetCPURequest(cores, coreFraction) if err != nil { return err } - b.Resource.Spec.Template.Spec.Domain.Resources = virtv1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceCPU: *cpuRequest, - corev1.ResourceMemory: memorySize, - }, - Limits: corev1.ResourceList{ - corev1.ResourceCPU: *GetCPULimit(cores), - corev1.ResourceMemory: memorySize, - }, + cpuLimit := GetCPULimit(cores) + if domainSpec.Resources.Requests == nil { + domainSpec.Resources.Requests = make(map[corev1.ResourceName]resource.Quantity) } + if domainSpec.Resources.Limits == nil { + domainSpec.Resources.Limits = make(map[corev1.ResourceName]resource.Quantity) + } + domainSpec.Resources.Requests[corev1.ResourceCPU] = *cpuRequest + domainSpec.Resources.Limits[corev1.ResourceCPU] = *cpuLimit + // https://bugzilla.redhat.com/show_bug.cgi?id=1653453 + domainSpec.CPU.Cores = uint32(1) + domainSpec.CPU.Sockets = uint32(cores) + domainSpec.CPU.MaxSockets = uint32(cores) return nil } +func (b *KVVM) SetMemory(memorySize resource.Quantity) { + res := &b.Resource.Spec.Template.Spec.Domain.Resources + if res.Requests == nil { + res.Requests = make(map[corev1.ResourceName]resource.Quantity) + } + if res.Limits == nil { + res.Limits = make(map[corev1.ResourceName]resource.Quantity) + } + res.Requests[corev1.ResourceMemory] = memorySize + res.Limits[corev1.ResourceMemory] = memorySize +} + func GetCPURequest(cores int, coreFraction string) (*resource.Quantity, error) { if coreFraction == "" { return GetCPULimit(cores), nil diff --git a/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm_utils.go b/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm_utils.go index 9b963dc8c..ee8798b32 100644 --- a/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm_utils.go +++ b/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm_utils.go @@ -92,7 +92,8 @@ func ApplyVirtualMachineSpec( kvvm.SetPriorityClassName(vm.Spec.PriorityClassName) kvvm.SetTerminationGracePeriod(vm.Spec.TerminationGracePeriodSeconds) kvvm.SetTopologySpreadConstraint(vm.Spec.TopologySpreadConstraints) - if err := kvvm.SetResourceRequirements(vm.Spec.CPU.Cores, vm.Spec.CPU.CoreFraction, vm.Spec.Memory.Size); err != nil { + kvvm.SetMemory(vm.Spec.Memory.Size) + if err := kvvm.SetCpu(vm.Spec.CPU.Cores, vm.Spec.CPU.CoreFraction); err != nil { return err } diff --git a/images/virtualization-artifact/pkg/controller/vmchange/comparator_pod_placement.go b/images/virtualization-artifact/pkg/controller/vmchange/comparator_pod_placement.go index 77f01e212..8d8de5fae 100644 --- 
a/images/virtualization-artifact/pkg/controller/vmchange/comparator_pod_placement.go +++ b/images/virtualization-artifact/pkg/controller/vmchange/comparator_pod_placement.go @@ -44,7 +44,7 @@ func compareAffinity(current, desired *v1alpha2.VirtualMachineSpec) []FieldChang currentValue, desiredValue, reflect.DeepEqual(current.Affinity, desired.Affinity), - ActionRestart, + placementAction, ) } @@ -57,7 +57,7 @@ func compareNodeSelector(current, desired *v1alpha2.VirtualMachineSpec) []FieldC currentValue, desiredValue, reflect.DeepEqual(current.NodeSelector, desired.NodeSelector), - ActionRestart, + placementAction, ) } diff --git a/images/virtualization-artifact/pkg/controller/vmchange/pod_placement_ce.go b/images/virtualization-artifact/pkg/controller/vmchange/pod_placement_ce.go new file mode 100644 index 000000000..442d68276 --- /dev/null +++ b/images/virtualization-artifact/pkg/controller/vmchange/pod_placement_ce.go @@ -0,0 +1,22 @@ +//go:build !EE +// +build !EE + +/* +Copyright 2024 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vmchange + +const placementAction = ActionRestart diff --git a/images/virtualization-artifact/pkg/controller/vmchange/pod_placement_ee.go b/images/virtualization-artifact/pkg/controller/vmchange/pod_placement_ee.go new file mode 100644 index 000000000..1cf2fc0e6 --- /dev/null +++ b/images/virtualization-artifact/pkg/controller/vmchange/pod_placement_ee.go @@ -0,0 +1,11 @@ +//go:build EE +// +build EE + +/* +Copyright 2024 Flant JSC +Licensed under the Deckhouse Platform Enterprise Edition (EE) license. See https://github.com/deckhouse/deckhouse/blob/main/ee/LICENSE +*/ + +package vmchange + +const placementAction = ActionApplyImmediate diff --git a/images/virtualization-artifact/pkg/version/edition.go b/images/virtualization-artifact/pkg/version/edition.go new file mode 100644 index 000000000..cc15b792f --- /dev/null +++ b/images/virtualization-artifact/pkg/version/edition.go @@ -0,0 +1,22 @@ +//go:build !EE +// +build !EE + +/* +Copyright 2024 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package version + +const edition = "CE" diff --git a/images/virtualization-artifact/pkg/version/edition_ee.go b/images/virtualization-artifact/pkg/version/edition_ee.go new file mode 100644 index 000000000..6127ec557 --- /dev/null +++ b/images/virtualization-artifact/pkg/version/edition_ee.go @@ -0,0 +1,11 @@ +//go:build EE +// +build EE + +/* +Copyright 2024 Flant JSC +Licensed under the Deckhouse Platform Enterprise Edition (EE) license. 
See https://github.com/deckhouse/deckhouse/blob/main/ee/LICENSE +*/ + +package version + +const edition = "EE" diff --git a/images/virtualization-artifact/pkg/version/version.go b/images/virtualization-artifact/pkg/version/version.go new file mode 100644 index 000000000..97b16e4cf --- /dev/null +++ b/images/virtualization-artifact/pkg/version/version.go @@ -0,0 +1,21 @@ +/* +Copyright 2024 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package version + +func GetEdition() string { + return edition +} diff --git a/images/virtualization-artifact/werf.inc.yaml b/images/virtualization-artifact/werf.inc.yaml index e3b949d41..1f23c8f16 100644 --- a/images/virtualization-artifact/werf.inc.yaml +++ b/images/virtualization-artifact/werf.inc.yaml @@ -32,5 +32,5 @@ shell: - export GOOS=linux - export CGO_ENABLED=0 - export GOARCH=amd64 - - go build -v -a -o virtualization-controller ./cmd/virtualization-controller + - go build -tags {{ .MODULE_EDITION }} -v -a -o virtualization-controller ./cmd/virtualization-controller - go build -v -a -o virtualization-api ./cmd/virtualization-api diff --git a/templates/kubevirt/kubevirt.yaml b/templates/kubevirt/kubevirt.yaml index 228446089..2ebb481f0 100644 --- a/templates/kubevirt/kubevirt.yaml +++ b/templates/kubevirt/kubevirt.yaml @@ -22,11 +22,18 @@ spec: tokenBucketRateLimiter: qps: 5000 burst: 6000 + migrations: + bandwidthPerMigration: 64Mi + completionTimeoutPerGiB: 800 + parallelMigrationsPerCluster: 5 + parallelOutboundMigrationsPerNode: 2 + progressTimeout: 150 smbios: manufacturer: Flant family: Deckhouse product: DeckhouseVirtualizationPlatform evictionStrategy: LiveMigrate + vmRolloutStrategy: LiveUpdate developerConfiguration: featureGates: - HotplugVolumes
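Taken together, the auto-migration flow described in the README patch above hinges on a single VMI condition: the VMI controller sets `NodePlacementNotMatched`, the workload-updater triggers a migration while it is True, and virt-handler removes it when the migration finishes. As an illustrative cross-check (not part of the patches), the workload-updater's `isNodePlacementInProgress` helper is equivalent to scanning the VMI conditions directly:

```go
package example

import (
	k8sv1 "k8s.io/api/core/v1"
	virtv1 "kubevirt.io/api/core/v1"
)

// nodePlacementNotMatched reports whether the NodePlacementNotMatched condition
// is currently True on the VMI, i.e. a node placement change is waiting for a migration.
func nodePlacementNotMatched(vmi *virtv1.VirtualMachineInstance) bool {
	for _, c := range vmi.Status.Conditions {
		if c.Type == virtv1.VirtualMachineInstanceNodePlacementNotMatched && c.Status == k8sv1.ConditionTrue {
			return true
		}
	}
	return false
}
```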