From ad1bae9e1bcd9646013ecec79c3b67f40b2a16d0 Mon Sep 17 00:00:00 2001 From: Blaine Gardner Date: Thu, 3 Oct 2024 08:37:39 -0600 Subject: [PATCH] mds: fix liveness probe timeout When the MDS liveness probe times out, it should not fail the probe. If the cluster has a network partition or other issue that causes the Ceph mon cluster to become unstable, `ceph ...` commands can hang and cause a timeout. In this case, the MDS should not be restarted so as to not cause cascading cluster disruption. Signed-off-by: Blaine Gardner --- pkg/operator/ceph/file/mds/livenessprobe.go | 5 ++--- pkg/operator/ceph/file/mds/livenessprobe.sh | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/operator/ceph/file/mds/livenessprobe.go b/pkg/operator/ceph/file/mds/livenessprobe.go index beec3882ac5d..6f318d6bab89 100644 --- a/pkg/operator/ceph/file/mds/livenessprobe.go +++ b/pkg/operator/ceph/file/mds/livenessprobe.go @@ -3,7 +3,6 @@ package mds import ( "bytes" _ "embed" - "fmt" "html/template" "github.com/pkg/errors" @@ -38,6 +37,7 @@ type mdsLivenessProbeConfig struct { MdsId string FilesystemName string Keyring string + CmdTimeout int32 } func renderProbe(mdsLivenessProbeConfigValue mdsLivenessProbeConfig) (string, error) { @@ -64,6 +64,7 @@ func generateMDSLivenessProbeExecDaemon(daemonID, filesystemName, keyring string MdsId: daemonID, FilesystemName: filesystemName, Keyring: keyring, + CmdTimeout: mdsCmdTimeout, } mdsLivenessProbeCmd, err := renderProbe(mdsLivenessProbeConfigValue) @@ -75,8 +76,6 @@ func generateMDSLivenessProbeExecDaemon(daemonID, filesystemName, keyring string ProbeHandler: v1.ProbeHandler{ Exec: &v1.ExecAction{ Command: []string{ - "timeout", - fmt.Sprintf("%d", mdsCmdTimeout), "sh", "-c", mdsLivenessProbeCmd, diff --git a/pkg/operator/ceph/file/mds/livenessprobe.sh b/pkg/operator/ceph/file/mds/livenessprobe.sh index 97ad139bdbeb..5cddfac15f32 100644 --- a/pkg/operator/ceph/file/mds/livenessprobe.sh +++ b/pkg/operator/ceph/file/mds/livenessprobe.sh @@ -5,8 +5,9 @@ MDS_ID="{{ .MdsId }}" FILESYSTEM_NAME="{{ .FilesystemName }}" KEYRING="{{ .Keyring }}" +CMD_TIMEOUT="{{ .CmdTimeout }}" -outp="$(ceph fs dump --mon-host="$ROOK_CEPH_MON_HOST" --mon-initial-members="$ROOK_CEPH_MON_INITIAL_MEMBERS" --keyring "$KEYRING" --format json)" +outp="$(ceph fs dump --mon-host="$ROOK_CEPH_MON_HOST" --mon-initial-members="$ROOK_CEPH_MON_INITIAL_MEMBERS" --keyring "$KEYRING" --connect-timeout="$CMD_TIMEOUT" --format json)" rc=$? if [ $rc -ne 0 ]; then echo "ceph MDS dump check failed with the following output:"