diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 3ad1cbee2..ace7c60ff 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -84,11 +84,12 @@ var ( nodeID string concurrency int - rebootDays []string - rebootStart string - rebootEnd string - timezone string - annotateNodes bool + rebootDays []string + rebootStart string + rebootEnd string + timezone string + minRebootPeriod time.Duration + annotateNodes bool // Metrics rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ @@ -105,6 +106,8 @@ const ( KuredRebootInProgressAnnotation string = "weave.works/kured-reboot-in-progress" // KuredMostRecentRebootNeededAnnotation is the canonical string value for the kured most-recent-reboot-needed annotation KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed" + // KuredLastSuccessfulRebootAnnotation is the canonical string value for the kured last-successful-reboot annotation + KuredLastSuccessfulRebootAnnotation string = "weave.works/kured-last-successful-reboot" // EnvPrefix The environment variable prefix of all environment variables bound to our command line flags. EnvPrefix = "KURED" @@ -135,7 +138,8 @@ func NewRootCommand() *cobra.Command { Short: "Kubernetes Reboot Daemon", PersistentPreRunE: bindViper, PreRun: flagCheck, - Run: root} + Run: root, + } rootCmd.PersistentFlags().StringVar(&nodeID, "node-id", "", "node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable") @@ -218,6 +222,8 @@ func NewRootCommand() *cobra.Command { "schedule reboot only before this time of day") rootCmd.PersistentFlags().StringVar(&timezone, "time-zone", "UTC", "use this timezone for schedule inputs") + rootCmd.PersistentFlags().DurationVar(&minRebootPeriod, "min-reboot-period", 0, + "the minimal duration between reboots of a node. 
Requires --annotate-nodes") rootCmd.PersistentFlags().BoolVar(&annotateNodes, "annotate-nodes", false, "if set, the annotations 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots") @@ -265,6 +271,9 @@ func flagCheck(cmd *cobra.Command, args []string) { if !reflect.DeepEqual(preRebootNodeLabelKeys, postRebootNodeLabelKeys) { log.Warnf("pre-reboot-node-labels keys and post-reboot-node-labels keys do not match. This may result in unexpected behaviour.") } + if !annotateNodes && minRebootPeriod != 0 { + log.Fatal("Cannot use --min-reboot-period without --annotate-nodes") + } } // stripQuotes removes any literal single or double quote chars that surround a string @@ -317,7 +326,6 @@ func flagToEnvVar(flag string) string { // buildHostCommand writes a new command to run in the host namespace // Rancher based need different pid func buildHostCommand(pid int, command []string) []string { - // From the container, we nsenter into the proper PID to run the hostCommand. 
// For this, kured daemonset need to be configured with hostPID:true and privileged:true cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--"} @@ -400,7 +408,8 @@ func (kb KubernetesBlockingChecker) isBlocked() bool { podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{ LabelSelector: labelSelector, FieldSelector: fieldSelector, - Limit: 10}) + Limit: 10, + }) if err != nil { log.Warnf("Reboot blocked: pod query error: %v", err) return true @@ -694,6 +703,11 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str continue } } + if minRebootPeriod != 0 { + if err := addNodeAnnotations(client, nodeID, map[string]string{KuredLastSuccessfulRebootAnnotation: time.Now().Format(time.RFC3339)}); err != nil { + continue + } + } } throttle(releaseDelay) release(lock, concurrency > 1) @@ -725,16 +739,22 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str continue } + node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) + if err != nil { + log.Fatalf("Error retrieving node object via k8s API: %v", err) + } + + if lastSuccessfulRebootWithinMinRebootPeriod(node) { + log.Infof("Last successful reboot within minimal reboot period") + continue + } + if !rebootRequired(sentinelCommand) { log.Infof("Reboot not required") preferNoScheduleTaint.Disable() continue } - node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{}) - if err != nil { - log.Fatalf("Error retrieving node object via k8s API: %v", err) - } nodeMeta.Unschedulable = node.Spec.Unschedulable var timeNowString string @@ -804,6 +824,21 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str } } +func lastSuccessfulRebootWithinMinRebootPeriod(node *v1.Node) bool { + if minRebootPeriod == 0 { + return false + } + if v, ok := node.GetAnnotations()[KuredLastSuccessfulRebootAnnotation]; ok { + t, err := 
time.Parse(time.RFC3339, v) + if err != nil { + log.Warnf("failed to parse time %q in annotation %q: %s", v, KuredLastSuccessfulRebootAnnotation, err.Error()) + return false + } + return time.Now().Before(t.Add(minRebootPeriod)) + } + return false +} + // buildSentinelCommand creates the shell command line which will need wrapping to escape // the container boundaries func buildSentinelCommand(rebootSentinelFile string, rebootSentinelCommand string) []string { diff --git a/kured-ds-signal.yaml b/kured-ds-signal.yaml index 54568b670..59c7beb3b 100644 --- a/kured-ds-signal.yaml +++ b/kured-ds-signal.yaml @@ -98,3 +98,4 @@ spec: # - --annotate-nodes=false # - --lock-release-delay=30m # - --log-format=text +# - --min-reboot-period=336h diff --git a/kured-ds.yaml b/kured-ds.yaml index 340ef4695..6a30b7d2c 100644 --- a/kured-ds.yaml +++ b/kured-ds.yaml @@ -100,3 +100,4 @@ spec: # - --metrics-host="" # - --metrics-port=8080 # - --concurrency=1 +# - --min-reboot-period=336h