Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom taints and toleration node operation #9920

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions ocs_ci/ocs/resources/pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -2639,10 +2639,12 @@ def check_toleration_on_subscriptions(toleration_key=constants.TOLERATION_KEY):
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.SUBSCRIPTION,
)
tolerations = sub_obj.get().get("spec").get("config").get("tolerations")
tolerations = (
sub_obj.get().get("spec", {}).get("config", {}).get("tolerations", [])
)

# Check if any toleration matches the provided key
toleration_found = any(tol["key"] == toleration_key for tol in tolerations)
toleration_found = any(tol.get("key") == toleration_key for tol in tolerations)

if not toleration_found:
logger.error(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
is_flexible_scaling_enabled,
check_ceph_health_after_add_capacity,
CephClusterExternal,
is_vsphere_ipi_cluster,
)
from ocs_ci.framework.testlib import (
tier4b,
Expand All @@ -17,7 +18,11 @@
skipif_hci_provider_and_client,
)
from ocs_ci.framework import config
from ocs_ci.ocs.exceptions import CommandFailed
from ocs_ci.ocs.exceptions import (
CommandFailed,
ResourceWrongStatusException,
TolerationNotFoundException,
)
from ocs_ci.ocs.resources.pod import (
get_all_pods,
wait_for_pods_to_be_running,
Expand All @@ -28,6 +33,8 @@
taint_nodes,
untaint_nodes,
get_worker_nodes,
wait_for_nodes_status,
get_nodes,
)
from ocs_ci.utility.retry import retry
from ocs_ci.ocs.resources import storage_cluster
Expand All @@ -37,6 +44,10 @@
)
from ocs_ci.helpers.sanity_helpers import Sanity
from ocs_ci.utility import version
from tests.functional.z_cluster.nodes.test_node_replacement_proactive import (
delete_and_create_osd_node,
select_osd_node_name,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -75,15 +86,48 @@ def finalizer():
taint_label="xyz=true:NoSchedule",
), "Failed to untaint"

resource_name = constants.DEFAULT_CLUSTERNAME
if config.DEPLOYMENT["external_mode"]:
resource_name = constants.DEFAULT_CLUSTERNAME_EXTERNAL_MODE

logger.info("Remove tolerations from storagecluster")
storagecluster_obj = ocp.OCP(
resource_name=resource_name,
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.STORAGECLUSTER,
)
params = '[{"op": "remove", "path": "/spec/placement"},]'
storagecluster_obj.patch(params=params, format_type="json")

logger.info("Remove tolerations to the subscription")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
logger.info("Remove tolerations to the subscription")
logger.info("Remove tolerations from the subscriptions")

sub_list = ocp.get_all_resource_names_of_a_kind(kind=constants.SUBSCRIPTION)

sub_obj = ocp.OCP(
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.SUBSCRIPTION,
)
for sub in sub_list:
subscription_data = sub_obj.get(resource_name=sub)
if "config" in subscription_data.get("spec", {}):
params = '[{"op": "remove", "path": "/spec/config"}]'
sub_obj.patch(resource_name=sub, params=params, format_type="json")
time.sleep(180)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we not supposed to remove the tolerations from the rook-ceph operator configmap and ocsinitializations.ocs.openshift.io too ??

assert wait_for_pods_to_be_running(
timeout=900, sleep=15
), "some of the pods didn't came up running"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
), "some of the pods didn't came up running"
), "Few pods failed to reach the desired running state"


request.addfinalizer(finalizer)

def test_non_ocs_taint_and_tolerations(self):
def test_non_ocs_taint_and_tolerations(self, nodes):
"""
Test runs the following steps
1. Taint ocs nodes with non-ocs taint
1. Taint odf nodes with non-ocs taint
2. Set tolerations on storagecluster, subscription, configmap and ocsinit
3. Check toleration on all ocs pods.
4. Add Capacity
3. check tolerations on all subscription yaml.
4. Check toleration on all odf pods.
5. Add Capacity.
6. Reboot one of the nodes and check toleration on all odf pods on that node.
7. Replace one of the nodes and check all odf pods on that node are running.

"""

Expand Down Expand Up @@ -157,7 +201,7 @@ def test_non_ocs_taint_and_tolerations(self):
sub_obj.patch(resource_name=sub, params=param, format_type="merge")
logger.info(f"Successfully added toleration to {sub}")

retry(CommandFailed, tries=5, delay=10,)(
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)(
check_toleration_on_subscriptions
)(toleration_key="xyz")

Expand Down Expand Up @@ -218,7 +262,7 @@ def test_non_ocs_taint_and_tolerations(self):
logger.info(
"Check non-ocs toleration on all newly created pods under openshift-storage NS"
)
retry(CommandFailed, tries=5, delay=10,)(
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)(
check_toleration_on_pods
)(toleration_key="xyz")
if config.DEPLOYMENT["external_mode"]:
Expand Down Expand Up @@ -255,3 +299,145 @@ def test_non_ocs_taint_and_tolerations(self):
resource_count=count * replica_count,
), "New OSDs failed to reach running state"
check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)

# Reboot one of the nodes
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are performing 4 admin operations in the same test. should we consider moving the newly added operations to a separate test function ? The repetitive code for setting the non ocs taints can be moved to a common function and called in the tests.

Please take input from Brown squad as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

node = get_nodes("worker", num_of_nodes=1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to get the ocs_nodes. There are chances that a non ocs node can get picked. Also, I would suggest selecting randomly instead of selecting the same node everytime

if is_vsphere_ipi_cluster():
nodes.restart_nodes(nodes=node, wait=False)
node_names = [n.name for n in node]
wait_for_nodes_status(node_names, constants.STATUS_READY, timeout=420)
else:
Comment on lines +305 to +309
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, the vSPhere node related API would be same for both IPI and UPI. If that is the case, you really don't need to handle vpshere ipi case here

nodes.restart_nodes_by_stop_and_start(nodes=node)

# Wait some time after rebooting master
vkathole marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are rebooting a worker node; please correct the comment statement accordingly

waiting_time = 320
logger.info(f"Waiting {waiting_time} seconds.")
time.sleep(waiting_time)
Comment on lines +314 to +315
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are waiting for the cluster connectivity below, do we still need to sleep?


# Validate all nodes and services are in READY state and up
vkathole marked this conversation as resolved.
Show resolved Hide resolved
retry(
(CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
tries=60,
delay=15,
)(ocp.wait_for_cluster_connectivity(tries=400))
retry(
(CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
tries=60,
delay=15,
)(wait_for_nodes_status(timeout=1800))

# Check cluster is health ok and check toleration on pods
assert wait_for_pods_to_be_running(timeout=900, sleep=15)
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)(
check_toleration_on_pods
)(toleration_key="xyz")

# Replace the node
osd_node_name = select_osd_node_name()
delete_and_create_osd_node(osd_node_name)

# Check cluster is health ok and check toleration on pods
logger.info("Verifying All resources are Running and matches expected result")
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)(
check_toleration_on_pods
)(toleration_key="xyz")
self.sanity_helpers.health_check(tries=120)

def test_negative_custom_taint(self, nodes):
"""
Test runs the following steps
1. Taint odf nodes with non-ocs taint
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
1. Taint odf nodes with non-ocs taint
1. Taint odf worker nodes with non-ocs taint

2. Set toleration in storagecluster yaml.
3. Set toleration in wrong subscription yaml.
4. Check that toleration is not applied on all subscriptions and pods.
5. Check that all pods are not in running state.

"""

logger.info("Taint all nodes with custom taint")
ocs_nodes = get_worker_nodes()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this fun returns all the worker nodes, should we taint only on odf pod running nodes?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can use get_ocs_nodes() which will return the ocs nodes alone

taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule")

resource_name = constants.DEFAULT_CLUSTERNAME
if config.DEPLOYMENT["external_mode"]:
resource_name = constants.DEFAULT_CLUSTERNAME_EXTERNAL_MODE

logger.info("Add tolerations to storagecluster")
vkathole marked this conversation as resolved.
Show resolved Hide resolved
storagecluster_obj = ocp.OCP(
resource_name=resource_name,
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.STORAGECLUSTER,
)

tolerations = (
'{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
'"operator": "Equal", "value": "true"}, '
'{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
'"operator": "Equal", "value": "true"}]}'
)
if config.ENV_DATA["mcg_only_deployment"]:
param = f'{{"spec": {{"placement":{{"noobaa-standalone":{tolerations}}}}}}}'
elif config.DEPLOYMENT["external_mode"]:
param = (
f'{{"spec": {{"placement": {{"all": {tolerations}, '
f'"noobaa-core": {tolerations}}}}}}}'
)
else:
param = (
f'"all": {tolerations}, "csi-plugin": {tolerations}, "csi-provisioner": {tolerations}, '
f'"mds": {tolerations}, "metrics-exporter": {tolerations}, "noobaa-core": {tolerations}, '
f'"rgw": {tolerations}, "toolbox": {tolerations}'
)
param = f'{{"spec": {{"placement": {{{param}}}}}}}'

storagecluster_obj.patch(params=param, format_type="merge")
logger.info(f"Successfully added toleration to {storagecluster_obj.kind}")

logger.info("Add tolerations to the subscription")
sub_list = ocp.get_all_resource_names_of_a_kind(kind=constants.SUBSCRIPTION)
param = (
'{"spec": {"config": {"tolerations": '
'[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
'"value": "true"}]}}}'
)
# Select one subscription other than odf subscription
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add a log info message on the step that the test is doing here

selected_sub = None
for sub in sub_list:
if sub != constants.ODF_SUBSCRIPTION:
selected_sub = sub
break
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a message logging the selected subscription other than odf

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what subscription can we expect here? other than odf

if selected_sub:
sub_obj = ocp.OCP(
resource_name=selected_sub,
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.SUBSCRIPTION,
)
sub_obj.patch(params=param, format_type="merge")
logger.info(f"Successfully added toleration to {selected_sub}")

logger.info("Check custom toleration on all subscriptions")
try:
check_toleration_on_subscriptions(toleration_key="xyz")
raise AssertionError("Toleration was found, but it should not exist.")
except TolerationNotFoundException:
pass
time.sleep(300)
pod_list = get_all_pods(
namespace=config.ENV_DATA["cluster_namespace"],
exclude_selector=True,
)
for pod in pod_list:
pod.delete(wait=False)

assert not wait_for_pods_to_be_running(
timeout=120, sleep=15
), "Pods are running when they should not be."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we expecting all pods to go in a bad state ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see we apply tolerations on the storagecluster and on a subscription other than ODF. Are we sure all pods will fail to run if the toleration is merely not applied properly on the subscription while it is set properly on the storagecluster? Please re-check the scenario: if the toleration is set properly on the storagecluster, a few pods should still come up and run.


logger.info(
"Check custom toleration on all newly created pods under openshift-storage"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Check custom toleration on all newly created pods under openshift-storage"
"Validate custom toleration not found on all newly created pods in openshift-storage"

)
try:
check_toleration_on_pods(toleration_key="xyz")
raise AssertionError("Toleration was found, but it should not exist.")
except TolerationNotFoundException:
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can add a log message here replacing pass and in line 423

Loading