-
Notifications
You must be signed in to change notification settings - Fork 166
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Custom taints and toleration node operation #9920
base: master
Are you sure you want to change the base?
Changes from 7 commits
ee87c8d
7564fce
a3d05eb
84f36dd
e5b9f8e
8a2d775
6727cea
644f217
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -7,6 +7,7 @@ | |||||
is_flexible_scaling_enabled, | ||||||
check_ceph_health_after_add_capacity, | ||||||
CephClusterExternal, | ||||||
is_vsphere_ipi_cluster, | ||||||
) | ||||||
from ocs_ci.framework.testlib import ( | ||||||
tier4b, | ||||||
|
@@ -17,7 +18,11 @@ | |||||
skipif_hci_provider_and_client, | ||||||
) | ||||||
from ocs_ci.framework import config | ||||||
from ocs_ci.ocs.exceptions import CommandFailed | ||||||
from ocs_ci.ocs.exceptions import ( | ||||||
CommandFailed, | ||||||
ResourceWrongStatusException, | ||||||
TolerationNotFoundException, | ||||||
) | ||||||
from ocs_ci.ocs.resources.pod import ( | ||||||
get_all_pods, | ||||||
wait_for_pods_to_be_running, | ||||||
|
@@ -28,6 +33,8 @@ | |||||
taint_nodes, | ||||||
untaint_nodes, | ||||||
get_worker_nodes, | ||||||
wait_for_nodes_status, | ||||||
get_nodes, | ||||||
) | ||||||
from ocs_ci.utility.retry import retry | ||||||
from ocs_ci.ocs.resources import storage_cluster | ||||||
|
@@ -37,6 +44,10 @@ | |||||
) | ||||||
from ocs_ci.helpers.sanity_helpers import Sanity | ||||||
from ocs_ci.utility import version | ||||||
from tests.functional.z_cluster.nodes.test_node_replacement_proactive import ( | ||||||
delete_and_create_osd_node, | ||||||
select_osd_node_name, | ||||||
) | ||||||
|
||||||
logger = logging.getLogger(__name__) | ||||||
|
||||||
|
@@ -75,15 +86,48 @@ def finalizer(): | |||||
taint_label="xyz=true:NoSchedule", | ||||||
), "Failed to untaint" | ||||||
|
||||||
resource_name = constants.DEFAULT_CLUSTERNAME | ||||||
if config.DEPLOYMENT["external_mode"]: | ||||||
resource_name = constants.DEFAULT_CLUSTERNAME_EXTERNAL_MODE | ||||||
|
||||||
logger.info("Remove tolerations from storagecluster") | ||||||
storagecluster_obj = ocp.OCP( | ||||||
resource_name=resource_name, | ||||||
namespace=config.ENV_DATA["cluster_namespace"], | ||||||
kind=constants.STORAGECLUSTER, | ||||||
) | ||||||
params = '[{"op": "remove", "path": "/spec/placement"},]' | ||||||
storagecluster_obj.patch(params=params, format_type="json") | ||||||
|
||||||
logger.info("Remove tolerations to the subscription") | ||||||
sub_list = ocp.get_all_resource_names_of_a_kind(kind=constants.SUBSCRIPTION) | ||||||
|
||||||
sub_obj = ocp.OCP( | ||||||
namespace=config.ENV_DATA["cluster_namespace"], | ||||||
kind=constants.SUBSCRIPTION, | ||||||
) | ||||||
for sub in sub_list: | ||||||
subscription_data = sub_obj.get(resource_name=sub) | ||||||
if "config" in subscription_data.get("spec", {}): | ||||||
params = '[{"op": "remove", "path": "/spec/config"}]' | ||||||
sub_obj.patch(resource_name=sub, params=params, format_type="json") | ||||||
time.sleep(180) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are we not supposed to remove the tolerations from the rook-ceph operator configmap and ocsinitializations.ocs.openshift.io too ?? |
||||||
assert wait_for_pods_to_be_running( | ||||||
timeout=900, sleep=15 | ||||||
), "some of the pods didn't came up running" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
||||||
request.addfinalizer(finalizer) | ||||||
|
||||||
def test_non_ocs_taint_and_tolerations(self): | ||||||
def test_non_ocs_taint_and_tolerations(self, nodes): | ||||||
""" | ||||||
Test runs the following steps | ||||||
1. Taint ocs nodes with non-ocs taint | ||||||
1. Taint odf nodes with non-ocs taint | ||||||
2. Set tolerations on storagecluster, subscription, configmap and ocsinit | ||||||
3. Check toleration on all ocs pods. | ||||||
4. Add Capacity | ||||||
3. check tolerations on all subscription yaml. | ||||||
4. Check toleration on all odf pods. | ||||||
5. Add Capacity. | ||||||
6. Reboot one of the nodes and check toleration on all odf pods on that node. | ||||||
7. Replace one of the nodes and check all odf pods on that node are running. | ||||||
|
||||||
""" | ||||||
|
||||||
|
@@ -157,7 +201,7 @@ def test_non_ocs_taint_and_tolerations(self): | |||||
sub_obj.patch(resource_name=sub, params=param, format_type="merge") | ||||||
logger.info(f"Successfully added toleration to {sub}") | ||||||
|
||||||
retry(CommandFailed, tries=5, delay=10,)( | ||||||
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)( | ||||||
check_toleration_on_subscriptions | ||||||
)(toleration_key="xyz") | ||||||
|
||||||
|
@@ -218,7 +262,7 @@ def test_non_ocs_taint_and_tolerations(self): | |||||
logger.info( | ||||||
"Check non-ocs toleration on all newly created pods under openshift-storage NS" | ||||||
) | ||||||
retry(CommandFailed, tries=5, delay=10,)( | ||||||
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)( | ||||||
check_toleration_on_pods | ||||||
)(toleration_key="xyz") | ||||||
if config.DEPLOYMENT["external_mode"]: | ||||||
|
@@ -255,3 +299,145 @@ def test_non_ocs_taint_and_tolerations(self): | |||||
resource_count=count * replica_count, | ||||||
), "New OSDs failed to reach running state" | ||||||
check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500) | ||||||
|
||||||
# Reboot one of the nodes | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are performing 4 admin operations in the same test. should we consider moving the newly added operations to a separate test function ? The repetitive code for setting the non ocs taints can be moved to a common function and called in the tests. Please take input from Brown squad as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||||||
node = get_nodes("worker", num_of_nodes=1) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to get the ocs_nodes. There are chances that a non ocs node can get picked. Also, I would suggest selecting randomly instead of selecting the same node everytime |
||||||
if is_vsphere_ipi_cluster(): | ||||||
nodes.restart_nodes(nodes=node, wait=False) | ||||||
node_names = [n.name for n in node] | ||||||
wait_for_nodes_status(node_names, constants.STATUS_READY, timeout=420) | ||||||
else: | ||||||
Comment on lines
+305
to
+309
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC, the vSPhere node related API would be same for both IPI and UPI. If that is the case, you really don't need to handle vpshere ipi case here |
||||||
nodes.restart_nodes_by_stop_and_start(nodes=node) | ||||||
|
||||||
# Wait some time after rebooting master | ||||||
vkathole marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you are rebooting worker, please correct it in the comment statment |
||||||
waiting_time = 320 | ||||||
logger.info(f"Waiting {waiting_time} seconds.") | ||||||
time.sleep(waiting_time) | ||||||
Comment on lines
+314
to
+315
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you are waiting for the cluster connectivity below, do we still need to sleep? |
||||||
|
||||||
# Validate all nodes and services are in READY state and up | ||||||
vkathole marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
retry( | ||||||
(CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException), | ||||||
tries=60, | ||||||
delay=15, | ||||||
)(ocp.wait_for_cluster_connectivity(tries=400)) | ||||||
retry( | ||||||
(CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException), | ||||||
tries=60, | ||||||
delay=15, | ||||||
)(wait_for_nodes_status(timeout=1800)) | ||||||
|
||||||
# Check cluster is health ok and check toleration on pods | ||||||
assert wait_for_pods_to_be_running(timeout=900, sleep=15) | ||||||
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)( | ||||||
check_toleration_on_pods | ||||||
)(toleration_key="xyz") | ||||||
|
||||||
# Replace the node | ||||||
osd_node_name = select_osd_node_name() | ||||||
delete_and_create_osd_node(osd_node_name) | ||||||
|
||||||
# Check cluster is health ok and check toleration on pods | ||||||
logger.info("Verifying All resources are Running and matches expected result") | ||||||
retry((CommandFailed, TolerationNotFoundException), tries=5, delay=10,)( | ||||||
check_toleration_on_pods | ||||||
)(toleration_key="xyz") | ||||||
self.sanity_helpers.health_check(tries=120) | ||||||
|
||||||
def test_negative_custom_taint(self, nodes): | ||||||
""" | ||||||
Test runs the following steps | ||||||
1. Taint odf nodes with non-ocs taint | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
2. Set toleration in storagecluster yaml. | ||||||
3. Set toleration in wrong subscription yaml. | ||||||
4. Check that toleration is not applied on all subscriptions and pods. | ||||||
5. Check that all pods are not in running state. | ||||||
|
||||||
""" | ||||||
|
||||||
logger.info("Taint all nodes with custom taint") | ||||||
ocs_nodes = get_worker_nodes() | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this fun returns all the worker nodes, should we taint only on odf pod running nodes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can use get_ocs_nodes() which will return the ocs nodes alone |
||||||
taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule") | ||||||
|
||||||
resource_name = constants.DEFAULT_CLUSTERNAME | ||||||
if config.DEPLOYMENT["external_mode"]: | ||||||
resource_name = constants.DEFAULT_CLUSTERNAME_EXTERNAL_MODE | ||||||
|
||||||
logger.info("Add tolerations to storagecluster") | ||||||
vkathole marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
storagecluster_obj = ocp.OCP( | ||||||
resource_name=resource_name, | ||||||
namespace=config.ENV_DATA["cluster_namespace"], | ||||||
kind=constants.STORAGECLUSTER, | ||||||
) | ||||||
|
||||||
tolerations = ( | ||||||
'{"tolerations": [{"effect": "NoSchedule", "key": "xyz",' | ||||||
'"operator": "Equal", "value": "true"}, ' | ||||||
'{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", ' | ||||||
'"operator": "Equal", "value": "true"}]}' | ||||||
) | ||||||
if config.ENV_DATA["mcg_only_deployment"]: | ||||||
param = f'{{"spec": {{"placement":{{"noobaa-standalone":{tolerations}}}}}}}' | ||||||
elif config.DEPLOYMENT["external_mode"]: | ||||||
param = ( | ||||||
f'{{"spec": {{"placement": {{"all": {tolerations}, ' | ||||||
f'"noobaa-core": {tolerations}}}}}}}' | ||||||
) | ||||||
else: | ||||||
param = ( | ||||||
f'"all": {tolerations}, "csi-plugin": {tolerations}, "csi-provisioner": {tolerations}, ' | ||||||
f'"mds": {tolerations}, "metrics-exporter": {tolerations}, "noobaa-core": {tolerations}, ' | ||||||
f'"rgw": {tolerations}, "toolbox": {tolerations}' | ||||||
) | ||||||
param = f'{{"spec": {{"placement": {{{param}}}}}}}' | ||||||
|
||||||
storagecluster_obj.patch(params=param, format_type="merge") | ||||||
logger.info(f"Successfully added toleration to {storagecluster_obj.kind}") | ||||||
|
||||||
logger.info("Add tolerations to the subscription") | ||||||
sub_list = ocp.get_all_resource_names_of_a_kind(kind=constants.SUBSCRIPTION) | ||||||
param = ( | ||||||
'{"spec": {"config": {"tolerations": ' | ||||||
'[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", ' | ||||||
'"value": "true"}]}}}' | ||||||
) | ||||||
# Select one subscription other than odf subscription | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please add a log info message on the step that the test is doing here |
||||||
selected_sub = None | ||||||
for sub in sub_list: | ||||||
if sub != constants.ODF_SUBSCRIPTION: | ||||||
selected_sub = sub | ||||||
break | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a message logging the selected subscription other than odf There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what subscription can we expect here? other than odf |
||||||
if selected_sub: | ||||||
sub_obj = ocp.OCP( | ||||||
resource_name=selected_sub, | ||||||
namespace=config.ENV_DATA["cluster_namespace"], | ||||||
kind=constants.SUBSCRIPTION, | ||||||
) | ||||||
sub_obj.patch(params=param, format_type="merge") | ||||||
logger.info(f"Successfully added toleration to {selected_sub}") | ||||||
|
||||||
logger.info("Check custom toleration on all subscriptions") | ||||||
try: | ||||||
check_toleration_on_subscriptions(toleration_key="xyz") | ||||||
raise AssertionError("Toleration was found, but it should not exist.") | ||||||
except TolerationNotFoundException: | ||||||
pass | ||||||
time.sleep(300) | ||||||
pod_list = get_all_pods( | ||||||
namespace=config.ENV_DATA["cluster_namespace"], | ||||||
exclude_selector=True, | ||||||
) | ||||||
for pod in pod_list: | ||||||
pod.delete(wait=False) | ||||||
|
||||||
assert not wait_for_pods_to_be_running( | ||||||
timeout=120, sleep=15 | ||||||
), "Pods are running when they should not be." | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are we expecting all pods to go in a bad state ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see we apply tolerations on storagecluster and subscription other than ODF, are we sure all pods will not be running if the toleration is just not applied properly on sub when we are setting it properly on storagecluster ? Please check the scenario again. if we are setting the toleration properly on storagecluster few pods should be up and running. |
||||||
|
||||||
logger.info( | ||||||
"Check custom toleration on all newly created pods under openshift-storage" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
) | ||||||
try: | ||||||
check_toleration_on_pods(toleration_key="xyz") | ||||||
raise AssertionError("Toleration was found, but it should not exist.") | ||||||
except TolerationNotFoundException: | ||||||
pass | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can add a log message here replacing pass and in line 423 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.