Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 2233674: core: operator skips reconcile of mons and osds in debug #511

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pkg/apis/ceph.rook.io/v1/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
	// SkipReconcileLabelKey is a label indicating that the pod should not be reconciled.
	// The operator matches this label on daemon Deployments (e.g. mons and OSDs) and
	// leaves any labeled resource untouched during reconcile, which is useful when a
	// daemon has been put into a debug/maintenance mode.
	SkipReconcileLabelKey = "ceph.rook.io/do-not-reconcile"
)

// LabelsSpec is the main spec label for all daemons
type LabelsSpec map[KeyType]Labels

Expand Down
9 changes: 9 additions & 0 deletions pkg/operator/ceph/cluster/mon/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ func (c *Cluster) checkHealth(ctx context.Context) error {
return errors.New("skipping mon health check since there are no monitors")
}

monsToSkipReconcile, err := c.getMonsToSkipReconcile()
if err != nil {
return errors.Wrap(err, "failed to check for mons to skip reconcile")
}
if monsToSkipReconcile.Len() > 0 {
logger.Warningf("skipping mon health check since mons are labeled with %s: %v", cephv1.SkipReconcileLabelKey, monsToSkipReconcile.List())
return nil
}

logger.Debugf("Checking health for mons in cluster %q", c.ClusterInfo.Namespace)

// For an external connection we use a special function to get the status
Expand Down
29 changes: 29 additions & 0 deletions pkg/operator/ceph/cluster/mon/mon.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import (
kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
)

Expand Down Expand Up @@ -214,6 +215,15 @@ func (c *Cluster) Start(clusterInfo *cephclient.ClusterInfo, rookVersion string,

logger.Infof("targeting the mon count %d", c.spec.Mon.Count)

monsToSkipReconcile, err := c.getMonsToSkipReconcile()
if err != nil {
return nil, errors.Wrap(err, "failed to check for mons to skip reconcile")
}
if monsToSkipReconcile.Len() > 0 {
logger.Warningf("skipping mon reconcile since mons are labeled with %s: %v", cephv1.SkipReconcileLabelKey, monsToSkipReconcile.List())
return c.ClusterInfo, nil
}

// create the mons for a new cluster or ensure mons are running in an existing cluster
return c.ClusterInfo, c.startMons(c.spec.Mon.Count)
}
Expand Down Expand Up @@ -1462,3 +1472,22 @@ func (c *Cluster) releaseOrchestrationLock() {
c.orchestrationMutex.Unlock()
logger.Debugf("Released lock for mon orchestration")
}

// getMonsToSkipReconcile returns the set of mon daemon IDs whose Deployments in
// this cluster's namespace carry the do-not-reconcile label. The operator uses
// the result to leave those mons alone during reconcile and health checks.
func (c *Cluster) getMonsToSkipReconcile() (sets.String, error) {
	// Select only mon deployments that also carry the skip-reconcile label.
	selector := fmt.Sprintf("%s=%s,%s", k8sutil.AppAttr, AppName, cephv1.SkipReconcileLabelKey)

	deployments, err := c.context.Clientset.AppsV1().Deployments(c.ClusterInfo.Namespace).List(c.ClusterInfo.Context, metav1.ListOptions{LabelSelector: selector})
	if err != nil {
		return nil, errors.Wrap(err, "failed to query mons to skip reconcile")
	}

	skip := sets.NewString()
	for i := range deployments.Items {
		// The mon daemon ID is stored under the mon-type label key.
		monID, ok := deployments.Items[i].Labels[config.MonType]
		if !ok {
			continue
		}
		logger.Infof("found mon %q pod to skip reconcile", monID)
		skip.Insert(monID)
	}

	return skip, nil
}
24 changes: 23 additions & 1 deletion pkg/operator/ceph/cluster/osd/osd.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,12 @@ func (c *Cluster) Start() error {
if err != nil {
return errors.Wrapf(err, "failed to get information about currently-running OSD Deployments in namespace %q", namespace)
}
osdsToSkipReconcile, err := c.getOSDsToSkipReconcile()
if err != nil {
logger.Warningf("failed to get osds to skip reconcile. %v", err)
}
logger.Debugf("%d of %d OSD Deployments need updated", updateQueue.Len(), deployments.Len())
updateConfig := c.newUpdateConfig(config, updateQueue, deployments)
updateConfig := c.newUpdateConfig(config, updateQueue, deployments, osdsToSkipReconcile)

// prepare for creating new OSDs
statusConfigMaps := sets.NewString()
Expand Down Expand Up @@ -256,6 +260,24 @@ func (c *Cluster) getExistingOSDDeploymentsOnPVCs() (sets.String, error) {
return result, nil
}

// getOSDsToSkipReconcile returns the set of OSD IDs whose Deployments in this
// cluster's namespace carry the do-not-reconcile label, so the update pass can
// skip them.
func (c *Cluster) getOSDsToSkipReconcile() (sets.String, error) {
	// Select only OSD deployments that also carry the skip-reconcile label.
	selector := fmt.Sprintf("%s=%s,%s", k8sutil.AppAttr, AppName, cephv1.SkipReconcileLabelKey)

	deployments, err := c.context.Clientset.AppsV1().Deployments(c.clusterInfo.Namespace).List(c.clusterInfo.Context, metav1.ListOptions{LabelSelector: selector})
	if err != nil {
		return nil, errors.Wrap(err, "failed to query OSDs to skip reconcile")
	}

	skip := sets.NewString()
	for i := range deployments.Items {
		// The OSD ID is stored under the osd-id label key.
		if osdID, found := deployments.Items[i].Labels[OsdIdLabelKey]; found {
			skip.Insert(osdID)
		}
	}

	return skip, nil
}

func deploymentOnNode(c *Cluster, osd OSDInfo, nodeName string, config *provisionConfig) (*appsv1.Deployment, error) {
osdLongName := fmt.Sprintf("OSD %d on node %q", osd.ID, nodeName)

Expand Down
20 changes: 14 additions & 6 deletions pkg/operator/ceph/cluster/osd/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
)

// THE LIBRARY PROVIDED BY THIS FILE IS NOT THREAD SAFE
Expand All @@ -42,24 +43,27 @@ var (
)

// updateConfig tracks the state of one OSD update pass: which OSDs still need
// their deployments updated, which already have deployments, and which are
// labeled to be skipped entirely during reconcile.
//
// NOTE: the diff rendering duplicated the old and new field lists into one
// struct body, which would not compile; this is the corrected (new) field set.
type updateConfig struct {
	cluster             *Cluster
	provisionConfig     *provisionConfig
	queue               *updateQueue   // these OSDs need updated
	numUpdatesNeeded    int            // the number of OSDs that needed updating
	deployments         *existenceList // these OSDs have existing deployments
	osdsToSkipReconcile sets.String    // these OSDs should not be updated during reconcile
}

// newUpdateConfig builds the updateConfig for one reconcile pass. The number of
// updates needed is captured from the queue's length at construction time, so
// it remains stable as items are later popped from the queue.
func (c *Cluster) newUpdateConfig(
	provisionConfig *provisionConfig,
	queue *updateQueue,
	deployments *existenceList,
	osdsToSkipReconcile sets.String,
) *updateConfig {
	return &updateConfig{
		cluster:             c,
		provisionConfig:     provisionConfig,
		queue:               queue,
		numUpdatesNeeded:    queue.Len(),
		deployments:         deployments,
		osdsToSkipReconcile: osdsToSkipReconcile,
	}
}

Expand Down Expand Up @@ -126,6 +130,11 @@ func (c *updateConfig) updateExistingOSDs(errs *provisionErrors) {
continue
}

if c.osdsToSkipReconcile.Has(strconv.Itoa(osdID)) {
logger.Warningf("Skipping update for OSD %d since labeled with %s", osdID, cephv1.SkipReconcileLabelKey)
continue
}

// backward compatibility for old deployments
// Checking DeviceClass with None too, because ceph-volume lvm list return crush device class as None
// Tracker https://tracker.ceph.com/issues/53425
Expand Down Expand Up @@ -154,7 +163,6 @@ func (c *updateConfig) updateExistingOSDs(errs *provisionErrors) {
} else {
if !c.cluster.ValidStorage.NodeExists(nodeOrPVCName) {
// node will not reconcile, so don't update the deployment
// allow the OSD health checker to remove the OSD
logger.Warningf(
"not updating OSD %d on node %q. node no longer exists in the storage spec. "+
"if the user wishes to remove OSDs from the node, they must do so manually. "+
Expand Down
23 changes: 22 additions & 1 deletion pkg/operator/ceph/cluster/osd/update_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/fake"
)
Expand Down Expand Up @@ -110,7 +111,7 @@ func Test_updateExistingOSDs(t *testing.T) {
}
c = New(ctx, clusterInfo, spec, "rook/rook:master")
config := c.newProvisionConfig()
updateConfig = c.newUpdateConfig(config, updateQueue, existingDeployments)
updateConfig = c.newUpdateConfig(config, updateQueue, existingDeployments, sets.NewString())

// prepare outputs
deploymentsUpdated = []string{}
Expand Down Expand Up @@ -480,6 +481,26 @@ func Test_updateExistingOSDs(t *testing.T) {

assert.Equal(t, 0, updateQueue.Len()) // should be done with updates
})

t.Run("skip osd reconcile", func(t *testing.T) {
clientset = fake.NewSimpleClientset()
updateQueue = newUpdateQueueWithIDs(0, 1)
existingDeployments = newExistenceListWithIDs(0)
forceUpgradeIfUnhealthy = true
updateInjectFailures = k8sutil.Failures{}
doSetup()
addDeploymentOnNode("node0", 0)

osdToBeQueried = 0
updateConfig.osdsToSkipReconcile.Insert("0")
updateConfig.updateExistingOSDs(errs)
assert.Zero(t, errs.len())
assert.Equal(t, 1, updateQueue.Len())
osdIDUpdated, ok := updateQueue.Pop()
assert.True(t, ok)
assert.Equal(t, 1, osdIDUpdated)
updateConfig.osdsToSkipReconcile.Delete("0")
})
}

func Test_getOSDUpdateInfo(t *testing.T) {
Expand Down
4 changes: 3 additions & 1 deletion pkg/operator/ceph/disruption/clusterdisruption/osd.go
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,9 @@ func (r *ReconcileClusterDisruption) getOSDFailureDomains(clusterInfo *cephclien
nodeDrainFailureDomains.Insert(failureDomainName)
}
} else {
logger.Infof("osd %q is down but no node drain is detected", deployment.Name)
if !strings.HasSuffix(deployment.Name, "-debug") {
logger.Infof("osd %q is down but no node drain is detected", deployment.Name)
}
}
}

Expand Down
Loading