Skip to content

Commit

Permalink
Fix code handling unschedulable pods and only delete them if pods don't match desired state (#872)
Browse files Browse the repository at this point in the history
  • Loading branch information
SaaldjorMike authored Oct 31, 2024
1 parent abfa3ec commit 44e971e
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 27 deletions.
20 changes: 7 additions & 13 deletions controllers/humiocluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1829,8 +1829,13 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont
return reconcile.Result{}, r.logErrorAndReturn(err, "failed to get pod status")
}

// based on all pods we have, fetch compare list of all current pods with desired pods
desiredLifecycleState, desiredPod, err := r.getPodDesiredLifecycleState(ctx, hnp, listOfAllCurrentPodsForNodePool, attachments, podsStatus.foundEvictedPodsOrPodsWithOrpahanedPVCs() || podsStatus.haveUnschedulablePodsOrPodsWithBadStatusConditions())
podList := listOfAllCurrentPodsForNodePool
if podsStatus.haveUnschedulablePodsOrPodsWithBadStatusConditions() {
podList = podsStatus.podAreUnschedulableOrHaveBadStatusConditions
}

// compare the desired pod spec against all current pods, or, when some pods are unschedulable or have bad status conditions, against only those pods, which we have prioritized for deletion
desiredLifecycleState, desiredPod, err := r.getPodDesiredLifecycleState(ctx, hnp, podList, attachments, podsStatus.foundEvictedPodsOrPodsWithOrpahanedPVCs() || podsStatus.haveUnschedulablePodsOrPodsWithBadStatusConditions())
if err != nil {
return reconcile.Result{}, r.logErrorAndReturn(err, "got error when getting pod desired lifecycle")
}
Expand Down Expand Up @@ -1921,17 +1926,6 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont
return reconcile.Result{RequeueAfter: time.Second + 1}, nil
}

// delete unschedulable pods or pods with bad status conditions (crashing,exited)
if podsStatus.haveUnschedulablePodsOrPodsWithBadStatusConditions() {
r.Log.Info(fmt.Sprintf("found %d humio pods with errors", len(podsStatus.podAreUnschedulableOrHaveBadStatusConditions)))

for i, pod := range podsStatus.podAreUnschedulableOrHaveBadStatusConditions {
r.Log.Info(fmt.Sprintf("deleting pod with error[%d] %s", i, pod.Name))
err = r.Delete(ctx, &pod)
return reconcile.Result{Requeue: true}, err
}
}

podsForDeletion := desiredLifecycleState.podsToBeReplaced

// if zone awareness is enabled, we pin a zone until we're done replacing all pods in that zone,
Expand Down
26 changes: 12 additions & 14 deletions controllers/suite/clusters/humiocluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,24 +354,22 @@ var _ = Describe("HumioCluster Controller", func() {

ensurePodsGoPending(ctx, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster), 2, 1)

if !helpers.UseEnvtest() {
Eventually(func() int {
var pendingPodsCount int
updatedClusterPods, _ = kubernetes.ListPods(ctx, k8sClient, updatedHumioCluster.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels())
for _, pod := range updatedClusterPods {
if pod.Status.Phase == corev1.PodPending {
for _, condition := range pod.Status.Conditions {
if condition.Type == corev1.PodScheduled {
if condition.Status == corev1.ConditionFalse && condition.Reason == controllers.PodConditionReasonUnschedulable {
pendingPodsCount++
}
Eventually(func() int {
var pendingPodsCount int
updatedClusterPods, _ = kubernetes.ListPods(ctx, k8sClient, updatedHumioCluster.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels())
for _, pod := range updatedClusterPods {
if pod.Status.Phase == corev1.PodPending {
for _, condition := range pod.Status.Conditions {
if condition.Type == corev1.PodScheduled {
if condition.Status == corev1.ConditionFalse && condition.Reason == controllers.PodConditionReasonUnschedulable {
pendingPodsCount++
}
}
}
}
return pendingPodsCount
}, testTimeout, 250*time.Millisecond).Should(Equal(1))
}
}
return pendingPodsCount
}, testTimeout, 250*time.Millisecond).Should(Equal(1))

suite.UsingClusterBy(key.Name, "Updating the cluster resources successfully with working affinity")
Eventually(func() error {
Expand Down

0 comments on commit 44e971e

Please sign in to comment.