
Commit

osd: use bash script if restart interval is > 0
If `FlappingRestartIntervalHours` for the OSD is > 0 in the CephCluster CR,
then start the OSD inside a bash script.

Signed-off-by: sp98 <[email protected]>
(cherry picked from commit ffd6929)
sp98 committed Oct 12, 2023
1 parent fe613c5 commit dd33950
Showing 9 changed files with 65 additions and 65 deletions.
2 changes: 1 addition & 1 deletion Documentation/CRDs/Cluster/ceph-cluster-crd.md
@@ -85,7 +85,7 @@ For more details on the mons and when to choose a number other than `3`, see the
* `onlyApplyOSDPlacement`: Whether the placement specific for OSDs is merged with the `all` placement. If `false`, the OSD placement will be merged with the `all` placement. If `true`, the OSD placement will be applied and the `all` placement will be ignored. The placement for OSDs is computed from several different places depending on the type of OSD:
* For non-PVCs: `placement.all` and `placement.osd`
* For PVCs: `placement.all` and inside the storageClassDeviceSets from the `placement` or `preparePlacement`
* `flappingRestartIntervalHours`: Defines the time for which an OSD pod will sleep before restarting if it stopped due to flapping. Flapping occurs when OSDs are marked `down` by Ceph more than 5 times in 600 seconds. The OSDs will stay down when flapping since they likely have a bad disk or other issue that needs investigation. The default is 24 hours. If the issue with the OSD is fixed manually, the OSD pod can be manually restarted.
* `flappingRestartIntervalHours`: Defines the time for which an OSD pod will sleep before restarting if it stopped due to flapping. Flapping occurs when OSDs are marked `down` by Ceph more than 5 times in 600 seconds. The OSDs will stay down when flapping since they likely have a bad disk or other issue that needs investigation. If the issue with the OSD is fixed manually, the OSD pod can be manually restarted. The sleep is disabled if this interval is set to 0.
* `disruptionManagement`: The section for configuring management of daemon disruptions
* `managePodBudgets`: if `true`, the operator will create and manage PodDisruptionBudgets for OSD, Mon, RGW, and MDS daemons. OSD PDBs are managed dynamically via the strategy outlined in the [design](https://github.com/rook/rook/blob/master/design/ceph/ceph-managed-disruptionbudgets.md). The operator will block eviction of OSDs by default and unblock them safely when drains are detected.
* `osdMaintenanceTimeout`: is a duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true`. The default value is `30` minutes.
4 changes: 2 additions & 2 deletions Documentation/CRDs/specification.md
@@ -11372,8 +11372,8 @@ int
This is needed for OSD flapping, where OSD daemons are marked down more than 5 times in 600 seconds by Ceph.
Preventing the OSD pods from restarting immediately in such scenarios prevents Rook from marking the OSD as <code>up</code> and thus
the peering of the PGs mapped to the OSD.
The interval defaults to 24 hours if no value is provided. The user needs to manually restart the OSD pod if they manage to fix
the underlying OSD flapping issue before the restart interval.
The user needs to manually restart the OSD pod if they manage to fix the underlying OSD flapping issue before the restart interval elapses.
The sleep will be disabled if this interval is set to 0.</p>
</td>
</tr>
</tbody>
2 changes: 1 addition & 1 deletion deploy/charts/rook-ceph/templates/resources.yaml
@@ -2664,7 +2664,7 @@ spec:
type: array
x-kubernetes-preserve-unknown-fields: true
flappingRestartIntervalHours:
description: FlappingRestartIntervalHours defines the time for which the OSD pods that failed with a zero exit code will sleep before restarting. This is needed for OSD flapping, where OSD daemons are marked down more than 5 times in 600 seconds by Ceph. Preventing the OSD pods from restarting immediately in such scenarios prevents Rook from marking the OSD as `up` and thus the peering of the PGs mapped to the OSD. The interval defaults to 24 hours if no value is provided. The user needs to manually restart the OSD pod if they manage to fix the underlying OSD flapping issue before the restart interval.
description: FlappingRestartIntervalHours defines the time for which the OSD pods that failed with a zero exit code will sleep before restarting. This is needed for OSD flapping, where OSD daemons are marked down more than 5 times in 600 seconds by Ceph. Preventing the OSD pods from restarting immediately in such scenarios prevents Rook from marking the OSD as `up` and thus the peering of the PGs mapped to the OSD. The user needs to manually restart the OSD pod if they manage to fix the underlying OSD flapping issue before the restart interval elapses. The sleep will be disabled if this interval is set to 0.
type: integer
nodes:
items:
2 changes: 1 addition & 1 deletion deploy/examples/crds.yaml
@@ -2662,7 +2662,7 @@ spec:
type: array
x-kubernetes-preserve-unknown-fields: true
flappingRestartIntervalHours:
description: FlappingRestartIntervalHours defines the time for which the OSD pods that failed with a zero exit code will sleep before restarting. This is needed for OSD flapping, where OSD daemons are marked down more than 5 times in 600 seconds by Ceph. Preventing the OSD pods from restarting immediately in such scenarios prevents Rook from marking the OSD as `up` and thus the peering of the PGs mapped to the OSD. The interval defaults to 24 hours if no value is provided. The user needs to manually restart the OSD pod if they manage to fix the underlying OSD flapping issue before the restart interval.
description: FlappingRestartIntervalHours defines the time for which the OSD pods that failed with a zero exit code will sleep before restarting. This is needed for OSD flapping, where OSD daemons are marked down more than 5 times in 600 seconds by Ceph. Preventing the OSD pods from restarting immediately in such scenarios prevents Rook from marking the OSD as `up` and thus the peering of the PGs mapped to the OSD. The user needs to manually restart the OSD pod if they manage to fix the underlying OSD flapping issue before the restart interval elapses. The sleep will be disabled if this interval is set to 0.
type: integer
nodes:
items:
4 changes: 2 additions & 2 deletions pkg/apis/ceph.rook.io/v1/types.go
@@ -2667,8 +2667,8 @@ type StorageScopeSpec struct {
// This is needed for OSD flapping, where OSD daemons are marked down more than 5 times in 600 seconds by Ceph.
// Preventing the OSD pods from restarting immediately in such scenarios prevents Rook from marking the OSD as `up` and thus
// the peering of the PGs mapped to the OSD.
// The interval defaults to 24 hours if no value is provided. The user needs to manually restart the OSD pod if they manage to fix
// the underlying OSD flapping issue before the restart interval.
// The user needs to manually restart the OSD pod if they manage to fix the underlying OSD flapping issue before the restart interval elapses.
// The sleep will be disabled if this interval is set to 0.
FlappingRestartIntervalHours int `json:"flappingRestartIntervalHours"`
}

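Editor's aside (not part of this commit): a minimal sketch of how this field would be set through the Go API, assuming the usual `cephv1.ClusterSpec`/`StorageScopeSpec` wiring from `pkg/apis/ceph.rook.io/v1` and omitting every other required cluster setting.

```go
package main

import (
	"fmt"

	cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
)

func main() {
	// Hypothetical example: only the field relevant to this commit is set;
	// a real CephCluster spec also needs mon, cephVersion, dataDirHostPath, etc.
	spec := cephv1.ClusterSpec{
		Storage: cephv1.StorageScopeSpec{
			// Sleep 24 hours before an OSD pod that exited cleanly (flapping) restarts.
			// Setting this to 0 disables the sleep and keeps ceph-osd as the direct entrypoint.
			FlappingRestartIntervalHours: 24,
		},
	}
	fmt.Println(spec.Storage.FlappingRestartIntervalHours) // 24
}
```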
25 changes: 11 additions & 14 deletions pkg/operator/ceph/cluster/osd/osd.go
@@ -21,7 +21,6 @@ import (
"bufio"
"context"
"fmt"
"regexp"
"sort"
"strconv"
"strings"
@@ -587,13 +586,7 @@ func (c *Cluster) getOSDInfo(d *appsv1.Deployment) (OSDInfo, error) {
}

locationFound := false
for _, a := range container.Command {
locationPrefix := "--crush-location="
if strings.Contains(a, locationPrefix) {
locationFound = true
osd.Location = getLocationWithRegex(a)
}
}
osd.Location, locationFound = getOSDLocationFromArgs(container.Args)

if !locationFound {
location, _, err := getLocationFromPod(c.clusterInfo.Context, c.context.Clientset, d, cephclient.GetCrushRootFromSpec(&c.spec))
@@ -871,11 +864,15 @@ func (c *Cluster) waitForHealthyPGs() (bool, error) {
return true, nil
}

func getLocationWithRegex(input string) string {
rx := regexp.MustCompile(`--crush-location="(.+?)"`)
match := rx.FindStringSubmatch(input)
if len(match) == 2 {
return strings.TrimSpace(match[1])
func getOSDLocationFromArgs(args []string) (string, bool) {
for _, a := range args {
locationPrefix := "--crush-location="
if strings.HasPrefix(a, locationPrefix) {
// Extract the same CRUSH location as originally determined by the OSD prepare pod
// by cutting off the prefix: --crush-location=
return a[len(locationPrefix):], true
}
}
return ""

return "", false
}
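Editor's aside (not from the commit): the regexp is dropped because the CRUSH location now arrives as its own element of the container's `Args`, written with `%s` rather than `%q` (see the spec.go hunk later in this diff), so there are no surrounding quotes to strip. A small self-contained sketch of the difference:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// The flag is now a single, unquoted element of the container Args.
	arg := `--crush-location=root=default host=minikube`

	// The old pattern only matched the quoted form used when the whole command
	// line was joined into one shell string, so it no longer finds anything.
	oldPattern := regexp.MustCompile(`--crush-location="(.+?)"`)
	fmt.Println(oldPattern.FindStringSubmatch(arg)) // []

	// A plain prefix cut, as getOSDLocationFromArgs does above, recovers the location.
	fmt.Println(strings.TrimPrefix(arg, "--crush-location=")) // root=default host=minikube
}
```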
19 changes: 10 additions & 9 deletions pkg/operator/ceph/cluster/osd/osd_test.go
@@ -814,13 +814,14 @@ func TestReplaceOSDForNewStore(t *testing.T) {
})
}

func TestGetLocationWithRegex(t *testing.T) {
location := getLocationWithRegex("")
assert.Equal(t, "", location)

location = getLocationWithRegex(`ceph-osd --crush-location="root=default host=node" --default-log-to-stderr=true`)
assert.Equal(t, "root=default host=node", location)

location = getLocationWithRegex(`ceph-osd --crush-location="" --default-log-to-stderr=true`)
assert.Equal(t, "", location)
func TestGetOSDLocationFromArgs(t *testing.T) {
args := []string{"--id", "2", "--crush-location=root=default host=minikube"}
osdLocation, locationFound := getOSDLocationFromArgs(args)
assert.Equal(t, osdLocation, "root=default host=minikube")
assert.Equal(t, locationFound, true)

args = []string{"--id", "2"}
osdLocation, locationFound = getOSDLocationFromArgs(args)
assert.Equal(t, osdLocation, "")
assert.Equal(t, locationFound, false)
}
67 changes: 34 additions & 33 deletions pkg/operator/ceph/cluster/osd/spec.go
@@ -22,7 +22,6 @@ import (
"path"
"path/filepath"
"strconv"
"strings"

"github.com/pkg/errors"
cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
@@ -62,35 +61,42 @@ const (
// DmcryptMetadataType is a portion of the device mapper name for the encrypted OSD on PVC block
DmcryptMetadataType = "db-dmcrypt"
// DmcryptWalType is a portion of the device mapper name for the encrypted OSD on PVC wal
DmcryptWalType = "wal-dmcrypt"
bluestoreBlockName = "block"
bluestoreMetadataName = "block.db"
bluestoreWalName = "block.wal"
tempEtcCephDir = "/etc/temp-ceph"
osdPortv1 = 6801
osdPortv2 = 6800
defaultOSDRestartInterval = 24
DmcryptWalType = "wal-dmcrypt"
bluestoreBlockName = "block"
bluestoreMetadataName = "block.db"
bluestoreWalName = "block.wal"
tempEtcCephDir = "/etc/temp-ceph"
osdPortv1 = 6801
osdPortv2 = 6800
)

const (
cephOSDStart = `
set -o nounset # fail if variables are unset
child_pid=""
sigterm_received=false
function sigterm() {
echo "SIGTERM received"
exit
sigterm_received=true
kill -TERM "$child_pid"
}
trap sigterm SIGTERM
%s %s & wait
RESTART_INTERVAL=%d
rc=$?
if [ $rc -eq 0 ]; then
"${@}" &
# un-fixable race condition: if receive sigterm here, it won't be sent to child process
child_pid="$!"
wait "$child_pid" # wait returns the same return code of child process when called with argument
wait "$child_pid" # first wait returns immediately upon SIGTERM, so wait again for child to actually stop; this is a noop if child exited normally
ceph_osd_rc=$?
if [ $ceph_osd_rc -eq 0 ] && ! $sigterm_received; then
touch /tmp/osd-sleep
echo "OSD daemon exited with code 0, possibly due to OSD flapping. The OSD pod will sleep for $RESTART_INTERVAL hours. Restart the pod manually once the flapping issue is fixed"
sleep "$RESTART_INTERVAL"h & wait
exit $rc
fi`
echo "OSD daemon exited with code 0, possibly due to OSD flapping. The OSD pod will sleep for $ROOK_OSD_RESTART_INTERVAL hours. Restart the pod manually once the flapping issue is fixed"
sleep "$ROOK_OSD_RESTART_INTERVAL"h &
child_pid="$!"
wait "$child_pid"
wait "$child_pid" # wait again for sleep to stop
fi
exit $ceph_osd_rc
`

activateOSDOnNodeCode = `
set -o errexit
@@ -371,6 +377,7 @@ func (c *Cluster) makeDeployment(osdProps osdProperties, osd OSDInfo, provisionC
envVars := c.getConfigEnvVars(osdProps, dataDir, false)
envVars = append(envVars, k8sutil.ClusterDaemonEnvVars(c.spec.CephVersion.Image)...)
envVars = append(envVars, []v1.EnvVar{
{Name: "ROOK_OSD_RESTART_INTERVAL", Value: strconv.Itoa(c.spec.Storage.FlappingRestartIntervalHours)},
{Name: "ROOK_OSD_UUID", Value: osd.UUID},
{Name: "ROOK_OSD_ID", Value: osdID},
{Name: "ROOK_CEPH_MON_HOST",
@@ -421,7 +428,7 @@ func (c *Cluster) makeDeployment(osdProps osdProperties, osd OSDInfo, provisionC
"--fsid", c.clusterInfo.FSID,
"--setuser", "ceph",
"--setgroup", "ceph",
fmt.Sprintf("--crush-location=%q", osd.Location),
fmt.Sprintf("--crush-location=%s", osd.Location),
}...)

// Ceph expects initial weight as float value in tera-bytes units
@@ -619,7 +626,8 @@ func (c *Cluster) makeDeployment(osdProps osdProperties, osd OSDInfo, provisionC
InitContainers: initContainers,
Containers: []v1.Container{
{
Command: osdStartScript(command, args, c.spec.Storage.FlappingRestartIntervalHours),
Command: getOSDCmd(command, c.spec.Storage.FlappingRestartIntervalHours),
Args: args,
Name: "osd",
Image: c.spec.CephVersion.Image,
ImagePullPolicy: controller.GetContainerImagePullPolicy(c.spec.CephVersion.ImagePullPolicy),
@@ -1417,16 +1425,9 @@ func (c *Cluster) getOSDServicePorts() []v1.ServicePort {
return ports
}

func osdStartScript(cmd, args []string, interval int) []string {
osdRestartInterval := defaultOSDRestartInterval
func getOSDCmd(cmd []string, interval int) []string {
if interval != 0 {
osdRestartInterval = interval
}

return []string{
"/bin/bash",
"-c",
"-x",
fmt.Sprintf(cephOSDStart, strings.Join(cmd, " "), strings.Join(args, " "), osdRestartInterval),
return append([]string{"bash", "-x", "-c", cephOSDStart, "--"}, cmd...)
}
return cmd
}
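Editor's illustration (not part of the commit): how the container entrypoint is assembled for the two cases. The sketch mirrors `getOSDCmd` above with the wrapper script body elided; in the real deployment the ceph-osd flags are carried in the container's `Args`, which Kubernetes appends after the `Command`.

```go
package main

import "fmt"

// Placeholder for the cephOSDStart script defined in spec.go; elided here.
const cephOSDStart = "<wrapper script>"

// Mirrors getOSDCmd from the diff above.
func getOSDCmd(cmd []string, interval int) []string {
	if interval != 0 {
		return append([]string{"bash", "-x", "-c", cephOSDStart, "--"}, cmd...)
	}
	return cmd
}

func main() {
	cmd := []string{"ceph-osd"}

	// interval > 0: bash runs the wrapper; "--" fills $0, while "ceph-osd" (plus the
	// pod's Args appended by Kubernetes) become "$@", which the script starts with `"${@}" &`.
	fmt.Println(getOSDCmd(cmd, 24)) // [bash -x -c <wrapper script> -- ceph-osd]

	// interval == 0: no sleep behaviour; ceph-osd stays the direct entrypoint,
	// matching the spec_test assertion below that cont.Command[0] == "ceph-osd".
	fmt.Println(getOSDCmd(cmd, 0)) // [ceph-osd]
}
```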
5 changes: 3 additions & 2 deletions pkg/operator/ceph/cluster/osd/spec_test.go
@@ -178,6 +178,7 @@ func testPodDevices(t *testing.T, dataDir, deviceName string, allDevices bool) {
cont := deployment.Spec.Template.Spec.Containers[0]
assert.Equal(t, spec.CephVersion.Image, cont.Image)
assert.Equal(t, 8, len(cont.VolumeMounts))
assert.Equal(t, "ceph-osd", cont.Command[0])
verifyEnvVar(t, cont.Env, "TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES", "134217728", true)

// Test OSD on PVC with LVM
@@ -433,15 +434,15 @@ func testPodDevices(t *testing.T, dataDir, deviceName string, allDevices bool) {
deployment, err = c.makeDeployment(osdProp, osd, dataPathMap)
assert.NoError(t, err)
for _, flag := range defaultTuneFastSettings {
assert.Contains(t, deployment.Spec.Template.Spec.Containers[0].Command[3], flag)
assert.Contains(t, deployment.Spec.Template.Spec.Containers[0].Args, flag)
}

// Test tune Slow settings when OSD on PVC
osdProp.tuneSlowDeviceClass = true
deployment, err = c.makeDeployment(osdProp, osd, dataPathMap)
assert.NoError(t, err)
for _, flag := range defaultTuneSlowSettings {
assert.Contains(t, deployment.Spec.Template.Spec.Containers[0].Command[3], flag)
assert.Contains(t, deployment.Spec.Template.Spec.Containers[0].Args, flag)
}

// Test shareProcessNamespace presence
