Stretch cluster + CNV automation (red-hat-storage#10077)
Signed-off-by: Mahesh Shetty <[email protected]>
mashetty330 authored Sep 6, 2024
1 parent 5b15240 commit 16b6700
Showing 8 changed files with 502 additions and 159 deletions.
134 changes: 133 additions & 1 deletion ocs_ci/deployment/cnv.py
@@ -13,7 +13,7 @@

from ocs_ci.framework import config
from ocs_ci.ocs.resources.ocs import OCS
from ocs_ci.ocs.ocp import OCP
from ocs_ci.ocs.ocp import OCP, switch_to_default_rook_cluster_project
from ocs_ci.ocs.resources.packagemanifest import PackageManifest
from ocs_ci.ocs.constants import (
CNV_NAMESPACE_YAML,
@@ -648,3 +648,135 @@ def disable_multicluster_engine(self):
logger.error(f"Failed to disable multicluster engine\n{cmd_res.stderr}")
return
logger.info(cmd_res.stdout.decode("utf-8").splitlines())

def check_if_any_vm_and_vmi(self, namespace=None):
"""
Checks if any VMs or VM instances (VMIs) exist

Args:
namespace (str): namespace to check; all namespaces are checked if not given

Returns:
bool: True if any VMs or VMIs exist, else False
"""

vm_obj = OCP(kind=constants.VIRTUAL_MACHINE, namespace=namespace)
vmi_obj = OCP(kind=constants.VIRTUAL_MACHINE_INSTANCE, namespace=namespace)

return vm_obj.get(
out_yaml_format=False, all_namespaces=not namespace, dont_raise=True
) or vmi_obj.get(
out_yaml_format=False, all_namespaces=not namespace, dont_raise=True
)
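For illustration, the presence check boils down to a raw OCP get that tolerates missing resources. A minimal standalone sketch of the same idea (assuming the framework's kubeconfig context is already set up):

from ocs_ci.ocs import constants
from ocs_ci.ocs.ocp import OCP

# `get` returns an empty/falsy result instead of raising when nothing
# matches, because dont_raise=True suppresses the lookup error
vms = OCP(kind=constants.VIRTUAL_MACHINE).get(
    out_yaml_format=False, all_namespaces=True, dont_raise=True
)
print("VMs present:", bool(vms))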

def remove_hyperconverged(self):
"""
Remove HyperConverged CR
"""
hyperconverged_obj = OCP(
kind=constants.HYPERCONVERGED,
resource_name=constants.KUBEVIRT_HYPERCONVERGED,
namespace=self.namespace,
)
hyperconverged_obj.delete(resource_name=constants.KUBEVIRT_HYPERCONVERGED)
logger.info(
f"Deleted {constants.HYPERCONVERGED} {constants.KUBEVIRT_HYPERCONVERGED}"
)

def remove_cnv_subscription(self):
"""
Remove CNV subscription
"""
cnv_sub = OCP(
kind=constants.SUBSCRIPTION,
resource_name=constants.KUBEVIRT_HYPERCONVERGED,
namespace=self.namespace,
)
cnv_sub.delete(resource_name=constants.KUBEVIRT_HYPERCONVERGED)
logger.info(f"Deleted subscription {constants.KUBEVIRT_HYPERCONVERGED}")

def remove_cnv_csv(self):
"""
Remove CNV ClusterServiceVersion
"""
cnv_csv = OCP(
kind=constants.CLUSTER_SERVICE_VERSION,
selector=constants.CNV_SELECTOR,
namespace=self.namespace,
)
cnv_csv.delete(resource_name=cnv_csv.get()["items"][0]["metadata"]["name"])
logger.info(f"Deleted ClusterServiceVersion {constants.CNV_OPERATORNAME}")

def remove_cnv_operator(self):
"""
Remove CNV operator
"""
cnv_operator = OCP(
kind=constants.OPERATOR_KIND, resource_name=constants.CNV_OPERATORNAME
)
cnv_operator.delete(resource_name=constants.CNV_OPERATORNAME)
logger.info(f"Deleted operator {constants.CNV_OPERATORNAME}")

def remove_crds(self):
"""
Remove openshift virtualization CRDs
"""
OCP().exec_oc_cmd(
command=f"delete crd -n {self.namespace} -l {constants.CNV_SELECTOR}"
)
logger.info("Deleted all the openshift virtualization CRDs")

def remove_namespace(self):
"""
Remove openshift virtualization namespace
"""
cnv_namespace = OCP()
switch_to_default_rook_cluster_project()
cnv_namespace.delete_project(constants.CNV_NAMESPACE)
logger.info(f"Deleted the namespace {constants.CNV_NAMESPACE}")

def uninstall_cnv(self, check_cnv_installed=True):
"""
Uninstall CNV deployment

Args:
check_cnv_installed (bool): If True, verify CNV is installed before cleaning up
"""
if check_cnv_installed:
if not self.cnv_hyperconverged_installed():
logger.info("CNV is not installed, skipping the cleanup...")
return

assert not self.check_if_any_vm_and_vmi(), (
"Vm or Vmi instances are found in the cluster,"
"Please make sure all VMs and VM instances are removed"
)
logger.info(
"No VM or VM instances are found in the cluster, proceeding with the uninstallation"
)

logger.info("Removing the virtualization hyperconverged")
self.remove_hyperconverged()

logger.info("Removing the virtualization subscription")
self.remove_cnv_subscription()

logger.info("Removing the virtualization CSV")
self.remove_cnv_csv()

logger.info("Removing the virtualization Operator")
self.remove_cnv_operator()

logger.info("Removing the namespace")
self.remove_namespace()

logger.info("Removing the openshift virtualization CRDs")
self.remove_crds()
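The teardown runs top-down: HyperConverged CR, Subscription, CSV, Operator, namespace, then CRDs. A minimal sketch of invoking it (assuming these methods live on CNVInstaller, as the conftest import further below suggests):

from ocs_ci.deployment.cnv import CNVInstaller

installer = CNVInstaller()
# With check_cnv_installed=True the call is a no-op when CNV is absent
installer.uninstall_cnv(check_cnv_installed=True)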
34 changes: 22 additions & 12 deletions ocs_ci/helpers/stretchcluster_helper.py
@@ -6,6 +6,7 @@
modify_statefulset_replica_count,
modify_job_parallelism_count,
)
from ocs_ci.ocs.exceptions import CommandFailed
from ocs_ci.ocs.resources.pod import (
wait_for_pods_deletion,
)
@@ -46,9 +47,6 @@ def check_errors_regex(desc_out, err_msgs):
pod_names = [pod.name for pod in pods_not_running]
logger.info(f"Pods not running: {pod_names}")
scaled_down = []
dep_name = constants.LOGWRITER_CEPHFS_NAME
sts_name = constants.LOGWRITER_RBD_NAME
job_name = constants.LOGREADER_CEPHFS_NAME

for pod in pods_not_running:
# get the labels from the pod data
@@ -68,21 +66,29 @@
continue

# get the pod describe output
desc_out = OCP().exec_oc_cmd(
command=f"describe pod {pod.name}", out_yaml_format=False
)
try:
desc_out = OCP().exec_oc_cmd(
command=f"describe pod {pod.name}", out_yaml_format=False
)
except CommandFailed as e:
if "NotFound" not in e.args[0]:
raise e
else:
continue

# if any of the above-mentioned error messages are present in the
# describe output, we scale down the respective deployment/job/statefulset
if check_errors_regex(desc_out, error_messages):
# Delete the ContainerStatusUnknown error pods
if pod.status() == constants.STATUS_CONTAINER_STATUS_UNKNOWN:
pod.delete()

if (
constants.LOGWRITER_CEPHFS_LABEL.split("=")[1] in labels
and constants.LOGWRITER_CEPHFS_LABEL not in scaled_down
):
modify_deployment_replica_count(
deployment_name=dep_name,
deployment_name=constants.LOGWRITER_CEPHFS_NAME,
replica_count=0,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
@@ -98,7 +104,7 @@
):

modify_statefulset_replica_count(
statefulset_name=sts_name,
statefulset_name=constants.LOGWRITER_RBD_NAME,
replica_count=0,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
@@ -115,7 +121,9 @@
):

modify_job_parallelism_count(
job_name, count=0, namespace=constants.STRETCH_CLUSTER_NAMESPACE
job_name=constants.LOGREADER_CEPHFS_NAME,
count=0,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
wait_for_pods_deletion(
constants.LOGREADER_CEPHFS_LABEL,
@@ -129,19 +137,21 @@
for label in scaled_down:
if label == constants.LOGWRITER_CEPHFS_LABEL:
modify_deployment_replica_count(
deployment_name=dep_name,
deployment_name=constants.LOGWRITER_CEPHFS_NAME,
replica_count=4,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
elif label == constants.LOGWRITER_RBD_LABEL:
modify_statefulset_replica_count(
statefulset_name=sts_name,
statefulset_name=constants.LOGWRITER_RBD_NAME,
replica_count=2,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
elif label == constants.LOGREADER_CEPHFS_LABEL:
modify_job_parallelism_count(
job_name, count=4, namespace=constants.STRETCH_CLUSTER_NAMESPACE
job_name=constants.LOGREADER_CEPHFS_NAME,
count=4,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)

# fetch workload pod details now and make sure all of them are running
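Condensed, the helper's recovery pattern is: scale to zero, wait for the pods to be deleted, then scale back to the steady-state count. A minimal sketch of that round trip for the CephFS logwriter deployment (import paths and the timeout value are assumptions; the names, kwargs, and replica counts match the calls above):

# Import paths for the replica helpers are assumed; they are imported
# near the top of this module in the actual change
from ocs_ci.helpers.helpers import modify_deployment_replica_count
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pod import wait_for_pods_deletion

# Scale the CephFS logwriter down to zero and wait for its pods to go away
modify_deployment_replica_count(
    deployment_name=constants.LOGWRITER_CEPHFS_NAME,
    replica_count=0,
    namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
wait_for_pods_deletion(
    constants.LOGWRITER_CEPHFS_LABEL,
    timeout=300,  # timeout value is illustrative
    namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
# Restore the steady-state replica count used elsewhere in this helper
modify_deployment_replica_count(
    deployment_name=constants.LOGWRITER_CEPHFS_NAME,
    replica_count=4,
    namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)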
1 change: 1 addition & 0 deletions ocs_ci/ocs/constants.py
@@ -228,6 +228,7 @@
DEFALUT_DEVICE_CLASS = "ssd"
VM = "vm"
HOSTED_CLUSTERS = "hostedclusters"
OPERATOR_KIND = "Operator"

# Provisioners
AWS_EFS_PROVISIONER = "openshift.org/aws-efs"
5 changes: 4 additions & 1 deletion ocs_ci/ocs/resources/pod.py
@@ -2433,7 +2433,10 @@ def get_not_running_pods(selector=None, namespace=config.ENV_DATA["cluster_namespace"]):
pods_not_running = list()
for pod in pod_objs:
status = pod.status()
if status != constants.STATUS_RUNNING:
if (
status != constants.STATUS_RUNNING
and status != constants.STATUS_TERMINATING
):
pods_not_running.append(pod)

return pods_not_running
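With this change, pods in Terminating state are no longer counted as "not running". The same filter in isolation, as a minimal sketch (the helper function is hypothetical; the STATUS_* constants are the ones used above):

from ocs_ci.ocs import constants

def filter_not_running(pod_objs):
    """Keep only pods that are neither Running nor Terminating."""
    skip = (constants.STATUS_RUNNING, constants.STATUS_TERMINATING)
    return [p for p in pod_objs if p.status() not in skip]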
33 changes: 26 additions & 7 deletions ocs_ci/ocs/resources/stretchcluster.py
@@ -1,6 +1,7 @@
import logging
import json
import re
import time

from datetime import timedelta

@@ -185,7 +186,7 @@ def get_ocs_nodes_in_zone(self, zone):
ocs_nodes_in_zone = nodes_in_zone.intersection(ocs_nodes)
return get_node_objs(list(ocs_nodes_in_zone))

@retry(CommandFailed, tries=10, delay=10)
@retry(CommandFailed, tries=6, delay=10)
def check_for_read_pause(self, label, start_time, end_time):
"""
This checks whether any read pause has occurred during the given
@@ -225,7 +226,7 @@ def check_for_read_pause(self, label, start_time, end_time):
paused += 1
return paused
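Both pause checks are wrapped with a retry decorator, and this change trims the retries from 10 to 6. A minimal sketch of the decorator's semantics (the import path is an assumption; the parameters mirror those used above):

from ocs_ci.ocs.exceptions import CommandFailed
from ocs_ci.utility.retry import retry  # import path assumed

@retry(CommandFailed, tries=6, delay=10)
def flaky_query():
    # Re-invoked up to 6 times, sleeping 10 seconds between attempts,
    # for as long as CommandFailed keeps being raised
    ...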

@retry(CommandFailed, tries=10, delay=10)
@retry(CommandFailed, tries=6, delay=10)
def check_for_write_pause(self, label, start_time, end_time):
"""
Checks for write pause between start time and end time
@@ -309,7 +310,7 @@ def get_logfile_map(self, label):
self.logfile_map[label][0] = list(set(self.logfile_map[label][0]))
logger.info(self.logfile_map[label][0])

@retry(UnexpectedBehaviour, tries=10, delay=5)
@retry(UnexpectedBehaviour, tries=6, delay=5)
def get_logwriter_reader_pods(
self,
label,
@@ -436,7 +437,7 @@ def check_for_data_loss(self, label):
return True

@retry(CommandFailed, tries=15, delay=5)
def check_ceph_accessibility(self, timeout, delay=5, grace=15):
def check_ceph_accessibility(self, timeout, delay=5, grace=120):
"""
Check Ceph accessibility for 'timeout' seconds
@@ -482,8 +483,26 @@ def get_out_of_quorum_nodes(self):
"""
# find out the mons in quorum
ceph_tools_pod = pod.get_ceph_tools_pod()
output = dict(ceph_tools_pod.exec_cmd_on_pod(command="ceph quorum_status"))
quorum_mons = output.get("quorum_names")

@retry(CommandFailed, tries=10, delay=10)
def _get_non_quorum_mons():
"""
Wait until all 3 mons are in quorum and return their names
"""
output = dict(ceph_tools_pod.exec_cmd_on_pod(command="ceph quorum_status"))
quorum_mons = output.get("quorum_names")

if len(quorum_mons) != 3:
raise CommandFailed("All 3 mons are not in quorum yet")
logger.info("waiting 10 seconds before re-checking")

time.sleep(10)
output = dict(ceph_tools_pod.exec_cmd_on_pod(command="ceph quorum_status"))
quorum_mons = output.get("quorum_names")
return quorum_mons

quorum_mons = _get_non_quorum_mons()
logger.info(f"Mon's in quorum are: {quorum_mons}")
mon_meta_data = list(
ceph_tools_pod.exec_cmd_on_pod(command="ceph mon metadata")
@@ -634,7 +653,7 @@ def rbd_failure_checks(self, start_time, end_time, **kwargs):
start_time,
end_time,
)
<= 2
== 0
), "Write operations paused for RBD workloads even for the ones in available zone"
logger.info("all write operations are successful for RBD workloads")

28 changes: 28 additions & 0 deletions tests/conftest.py
@@ -20,6 +20,7 @@
import pytest
from collections import namedtuple

from ocs_ci.deployment.cnv import CNVInstaller
from ocs_ci.deployment import factory as dep_factory
from ocs_ci.deployment.helpers.hypershift_base import HyperShiftBase
from ocs_ci.deployment.hosted_cluster import HostedClients
@@ -33,6 +34,7 @@
upgrade_marks,
ignore_resource_not_found_error_label,
)

from ocs_ci.helpers.proxy import update_container_with_proxy_env
from ocs_ci.ocs import constants, defaults, fio_artefacts, node, ocp, platform_nodes
from ocs_ci.ocs.acm.acm import login_to_acm, AcmAddClusters
@@ -8271,3 +8273,29 @@ def run_description():
if not os.path.isfile(description_path):
with open(description_path, "w") as file:
file.write(f"{run_name}\n")


@pytest.fixture(scope="session")
def setup_cnv(request):
"""
Session-scoped fixture to set up and clean up CNV
based on the needs of the tests
"""
cnv_obj = CNVInstaller()
installed = False
if not cnv_obj.post_install_verification():
cnv_obj.deploy_cnv(check_cnv_deployed=False, check_cnv_ready=False)
installed = True

def finalizer():
"""
Clean up CNV deployment
"""

# Uninstall CNV only if installed by this fixture
if installed:
cnv_obj.uninstall_cnv()

request.addfinalizer(finalizer)
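A hypothetical test consuming the fixture (the test name and body are illustrative, not part of this commit):

def test_vm_workload_on_stretch_cluster(setup_cnv):
    # By the time the test body runs, CNV is deployed; if this session
    # installed it, the finalizer uninstalls it at session end
    ...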