Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom taints and toleration node operation #9920

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions ocs_ci/ocs/resources/pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -2639,10 +2639,12 @@ def check_toleration_on_subscriptions(toleration_key=constants.TOLERATION_KEY):
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.SUBSCRIPTION,
)
tolerations = sub_obj.get().get("spec").get("config").get("tolerations")
tolerations = (
sub_obj.get().get("spec", {}).get("config", {}).get("tolerations", [])
)

# Check if any toleration matches the provided key
toleration_found = any(tol["key"] == toleration_key for tol in tolerations)
toleration_found = any(tol.get("key") == toleration_key for tol in tolerations)

if not toleration_found:
logger.error(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
is_flexible_scaling_enabled,
check_ceph_health_after_add_capacity,
CephClusterExternal,
is_vsphere_ipi_cluster,
)
from ocs_ci.framework.testlib import (
tier4b,
Expand All @@ -17,7 +18,11 @@
skipif_hci_provider_and_client,
)
from ocs_ci.framework import config
from ocs_ci.ocs.exceptions import CommandFailed
from ocs_ci.ocs.exceptions import (
CommandFailed,
ResourceWrongStatusException,
TolerationNotFoundException,
)
from ocs_ci.ocs.resources.pod import (
get_all_pods,
wait_for_pods_to_be_running,
Expand All @@ -28,6 +33,8 @@
taint_nodes,
untaint_nodes,
get_worker_nodes,
wait_for_nodes_status,
get_nodes,
)
from ocs_ci.utility.retry import retry
from ocs_ci.ocs.resources import storage_cluster
Expand All @@ -37,6 +44,10 @@
)
from ocs_ci.helpers.sanity_helpers import Sanity
from ocs_ci.utility import version
from tests.functional.z_cluster.nodes.test_node_replacement_proactive import (
delete_and_create_osd_node,
select_osd_node_name,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -75,15 +86,48 @@ def finalizer():
taint_label="xyz=true:NoSchedule",
), "Failed to untaint"

resource_name = constants.DEFAULT_CLUSTERNAME
if config.DEPLOYMENT["external_mode"]:
resource_name = constants.DEFAULT_CLUSTERNAME_EXTERNAL_MODE

logger.info("Remove tolerations from storagecluster")
storagecluster_obj = ocp.OCP(
resource_name=resource_name,
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.STORAGECLUSTER,
)
params = '[{"op": "remove", "path": "/spec/placement"},]'
storagecluster_obj.patch(params=params, format_type="json")

logger.info("Remove tolerations to the subscription")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
logger.info("Remove tolerations to the subscription")
logger.info("Remove tolerations from the subscriptions")

sub_list = ocp.get_all_resource_names_of_a_kind(kind=constants.SUBSCRIPTION)

sub_obj = ocp.OCP(
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.SUBSCRIPTION,
)
for sub in sub_list:
subscription_data = sub_obj.get(resource_name=sub)
if "config" in subscription_data.get("spec", {}):
params = '[{"op": "remove", "path": "/spec/config"}]'
sub_obj.patch(resource_name=sub, params=params, format_type="json")
time.sleep(180)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we not supposed to remove the tolerations from the rook-ceph operator configmap and ocsinitializations.ocs.openshift.io too ??

assert wait_for_pods_to_be_running(
timeout=900, sleep=15
), "some of the pods didn't came up running"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
), "some of the pods didn't came up running"
), "Few pods failed to reach the desired running state"


request.addfinalizer(finalizer)

def test_non_ocs_taint_and_tolerations(self):
def test_non_ocs_taint_and_tolerations(self, nodes):
"""
Test runs the following steps
1. Taint ocs nodes with non-ocs taint
1. Taint odf nodes with non-ocs taint
2. Set tolerations on storagecluster, subscription, configmap and ocsinit
3. Check toleration on all ocs pods.
4. Add Capacity
3. check tolerations on all subscription yaml.
4. Check toleration on all odf pods.
5. Add Capacity.
6. Reboot one of the nodes and check toleration on all odf pods on that node.
7. Replace one of the nodes and check all odf pods on that node are running.

"""

Expand Down Expand Up @@ -157,7 +201,7 @@ def test_non_ocs_taint_and_tolerations(self):
sub_obj.patch(resource_name=sub, params=param, format_type="merge")
logger.info(f"Successfully added toleration to {sub}")

retry(CommandFailed, tries=5, delay=10,)(
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)(
check_toleration_on_subscriptions
)(toleration_key="xyz")

Expand Down Expand Up @@ -218,7 +262,7 @@ def test_non_ocs_taint_and_tolerations(self):
logger.info(
"Check non-ocs toleration on all newly created pods under openshift-storage NS"
)
retry(CommandFailed, tries=5, delay=10,)(
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)(
check_toleration_on_pods
)(toleration_key="xyz")
if config.DEPLOYMENT["external_mode"]:
Expand Down Expand Up @@ -255,3 +299,145 @@ def test_non_ocs_taint_and_tolerations(self):
resource_count=count * replica_count,
), "New OSDs failed to reach running state"
check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)

# Reboot one of the nodes
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are performing 4 admin operations in the same test. should we consider moving the newly added operations to a separate test function ? The repetitive code for setting the non ocs taints can be moved to a common function and called in the tests.

Please take input from Brown squad as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

node = get_nodes("worker", num_of_nodes=1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to get the ocs_nodes. There are chances that a non ocs node can get picked. Also, I would suggest selecting randomly instead of selecting the same node everytime

if is_vsphere_ipi_cluster():
nodes.restart_nodes(nodes=node, wait=False)
node_names = [n.name for n in node]
wait_for_nodes_status(node_names, constants.STATUS_READY, timeout=420)
else:
Comment on lines +305 to +309
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, the vSPhere node related API would be same for both IPI and UPI. If that is the case, you really don't need to handle vpshere ipi case here

nodes.restart_nodes_by_stop_and_start(nodes=node)

# Wait some time after rebooting master
vkathole marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are rebooting a worker node; please correct the comment statement accordingly

waiting_time = 320
logger.info(f"Waiting {waiting_time} seconds.")
time.sleep(waiting_time)
Comment on lines +314 to +315
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are waiting for the cluster connectivity below, do we still need to sleep?


# Validate all nodes and services are in READY state and up
vkathole marked this conversation as resolved.
Show resolved Hide resolved
retry(
(CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
tries=60,
delay=15,
)(ocp.wait_for_cluster_connectivity(tries=400))
retry(
(CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
tries=60,
delay=15,
)(wait_for_nodes_status(timeout=1800))

# Check cluster is health ok and check toleration on pods
assert wait_for_pods_to_be_running(timeout=900, sleep=15)
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)(
check_toleration_on_pods
)(toleration_key="xyz")

# Replace the node
osd_node_name = select_osd_node_name()
delete_and_create_osd_node(osd_node_name)

# Check cluster is health ok and check toleration on pods
logger.info("Verifying All resources are Running and matches expected result")
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)(
check_toleration_on_pods
)(toleration_key="xyz")
self.sanity_helpers.health_check(tries=120)

def test_negative_custom_taint(self, nodes):
"""
Test runs the following steps
1. Taint odf nodes with non-ocs taint
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
1. Taint odf nodes with non-ocs taint
1. Taint odf worker nodes with non-ocs taint

2. Set toleration in storagecluster yaml.
3. Set toleration in wrong subscription yaml.
4. Check that toleration is not applied on all subscriptions and pods.
5. Check that all pods are not in running state.

"""

logger.info("Taint all nodes with custom taint")
ocs_nodes = get_worker_nodes()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this fun returns all the worker nodes, should we taint only on odf pod running nodes?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can use get_ocs_nodes() which will return the ocs nodes alone

taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule")

resource_name = constants.DEFAULT_CLUSTERNAME
if config.DEPLOYMENT["external_mode"]:
resource_name = constants.DEFAULT_CLUSTERNAME_EXTERNAL_MODE

logger.info("Add tolerations to storagecluster")
vkathole marked this conversation as resolved.
Show resolved Hide resolved
storagecluster_obj = ocp.OCP(
resource_name=resource_name,
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.STORAGECLUSTER,
)

tolerations = (
'{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
'"operator": "Equal", "value": "true"}, '
'{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
'"operator": "Equal", "value": "true"}]}'
)
if config.ENV_DATA["mcg_only_deployment"]:
param = f'{{"spec": {{"placement":{{"noobaa-standalone":{tolerations}}}}}}}'
elif config.DEPLOYMENT["external_mode"]:
param = (
f'{{"spec": {{"placement": {{"all": {tolerations}, '
f'"noobaa-core": {tolerations}}}}}}}'
)
else:
param = (
f'"all": {tolerations}, "csi-plugin": {tolerations}, "csi-provisioner": {tolerations}, '
f'"mds": {tolerations}, "metrics-exporter": {tolerations}, "noobaa-core": {tolerations}, '
f'"rgw": {tolerations}, "toolbox": {tolerations}'
)
param = f'{{"spec": {{"placement": {{{param}}}}}}}'

storagecluster_obj.patch(params=param, format_type="merge")
logger.info(f"Successfully added toleration to {storagecluster_obj.kind}")

logger.info("Add tolerations to the subscription")
sub_list = ocp.get_all_resource_names_of_a_kind(kind=constants.SUBSCRIPTION)
param = (
'{"spec": {"config": {"tolerations": '
'[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
'"value": "true"}]}}}'
)
# Select one subscription other than odf subscription
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add a log info message on the step that the test is doing here

selected_sub = None
for sub in sub_list:
if sub != constants.ODF_SUBSCRIPTION:
selected_sub = sub
break
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a message logging the selected subscription other than odf

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what subscription can we expect here? other than odf

if selected_sub:
sub_obj = ocp.OCP(
resource_name=selected_sub,
namespace=config.ENV_DATA["cluster_namespace"],
kind=constants.SUBSCRIPTION,
)
sub_obj.patch(params=param, format_type="merge")
logger.info(f"Successfully added toleration to {selected_sub}")

logger.info("Check custom toleration on all subscriptions")
try:
check_toleration_on_subscriptions(toleration_key="xyz")
raise AssertionError("Toleration was found, but it should not exist.")
except TolerationNotFoundException:
pass
time.sleep(300)
pod_list = get_all_pods(
namespace=config.ENV_DATA["cluster_namespace"],
exclude_selector=True,
)
for pod in pod_list:
pod.delete(wait=False)

assert not wait_for_pods_to_be_running(
timeout=120, sleep=15
), "Pods are running when they should not be."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we expecting all pods to go in a bad state ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see we apply tolerations on the storagecluster and on a subscription other than ODF. Are we sure all pods will fail to run if the toleration is merely not applied properly on the subscription while it is set properly on the storagecluster? Please re-check the scenario: if the toleration is set properly on the storagecluster, a few pods should still come up and run.


logger.info(
"Check custom toleration on all newly created pods under openshift-storage"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Check custom toleration on all newly created pods under openshift-storage"
"Validate custom toleration not found on all newly created pods in openshift-storage"

)
try:
check_toleration_on_pods(toleration_key="xyz")
raise AssertionError("Toleration was found, but it should not exist.")
except TolerationNotFoundException:
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can add a log message here replacing pass and in line 423

Loading