Skip to content

Commit

Permalink
Fix discrepency metrics <> rebootAsRequired
Browse files Browse the repository at this point in the history
Without this patch, one metric could say "reboot is required"
while the rebootAsRequired tick did not run (long period for
example).

This is a problem, as it leads to misexpectations: "Why
did the system not reboot, while the metrics indicate a reboot
was required".

This solves it by inlining the metrics management within the
rebootAsRequired goroutine.

Closes: kubereboot#725

Signed-off-by: Jean-Philippe Evrard <[email protected]>
  • Loading branch information
evrardjp committed Nov 10, 2024
1 parent 76765f5 commit d5e1b6e
Showing 1 changed file with 4 additions and 12 deletions.
16 changes: 4 additions & 12 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,6 @@ func main() {
lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay)

go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client)
go maintainRebootRequiredMetric(nodeID, rebootChecker)

http.Handle("/metrics", promhttp.Handler())
log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil))
Expand Down Expand Up @@ -468,17 +467,6 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error {
return nil
}

func maintainRebootRequiredMetric(nodeID string, checker checkers.Checker) {
for {
if checker.RebootRequired() {
rebootRequiredGauge.WithLabelValues(nodeID).Set(1)
} else {
rebootRequiredGauge.WithLabelValues(nodeID).Set(0)
}
time.Sleep(time.Minute)
}
}

func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error {
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
Expand Down Expand Up @@ -610,7 +598,9 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.
preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule)

// Remove taint immediately during startup to quickly allow scheduling again.
// Also update the metric at startup (after lock shenanigans)
if !checker.RebootRequired() {
rebootRequiredGauge.WithLabelValues(nodeID).Set(0)
preferNoScheduleTaint.Disable()
}

Expand All @@ -625,9 +615,11 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.

if !checker.RebootRequired() {
log.Infof("Reboot not required")
rebootRequiredGauge.WithLabelValues(nodeID).Set(0)
preferNoScheduleTaint.Disable()
continue
}
rebootRequiredGauge.WithLabelValues(nodeID).Set(1)

node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
Expand Down

0 comments on commit d5e1b6e

Please sign in to comment.