Skip to content

Commit

Permalink
Merge pull request #41 from themaxbelov/check-pods
Browse files Browse the repository at this point in the history
Wait for deployment to be ready before moving to the next step
  • Loading branch information
ToxicWar authored Jan 11, 2018
2 parents 93e6253 + a540246 commit 39e92f3
Show file tree
Hide file tree
Showing 5 changed files with 329 additions and 3 deletions.
11 changes: 10 additions & 1 deletion apsconnectcli/apsconnect.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from kubernetes.client.rest import ApiException

from apsconnectcli.action_logger import Logger
from apsconnectcli.cluster import read_cluster_certificate
from apsconnectcli.cluster import read_cluster_certificate, poll_deployment

if sys.version_info >= (3,):
import tempfile
Expand Down Expand Up @@ -825,6 +825,15 @@ def _create_deployment(name, image, api, healthcheck_path='/', replicas=2,

api.create_namespaced_deployment(namespace=namespace, body=template)

# Check deployment availability
sys.stdout.write("Waiting for deployment to become ready...")
sys.stdout.flush()
poll_result = poll_deployment(core_v1=core_api, ext_v1=api, namespace=namespace, name=name)
print()
if not poll_result.available:
print(poll_result.message)
sys.exit(1)


def _delete_deployment(name, api, namespace, core_api=None):
try:
Expand Down
95 changes: 95 additions & 0 deletions apsconnectcli/cluster.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
import base64
import os
import sys
from collections import namedtuple
from datetime import datetime, timedelta
from time import sleep

from apsconnectcli.action_logger import Logger

LOG_DIR = os.path.expanduser('~/.apsconnect')

MAX_RESTARTS = 5
NUM_RETRIES = 5

POLL_INTERVAL = 5
TIMEOUT = 5 * 60

if not os.path.exists(LOG_DIR):
os.makedirs(LOG_DIR)

Expand All @@ -14,6 +23,8 @@
sys.stdout = Logger(LOG_FILE, sys.stdout)
sys.stderr = Logger(LOG_FILE, sys.stderr)

AvailabilityCheckResult = namedtuple('AvailabilityCheckResult', 'available message')


def read_cluster_certificate(ca_cert):
try:
Expand All @@ -24,3 +35,87 @@ def read_cluster_certificate(ca_cert):
sys.exit(1)
else:
return ca_cert_data


def poll_deployment(core_v1, ext_v1, namespace, name):
max_time = datetime.now() + timedelta(seconds=TIMEOUT)

while True:
deployment = ext_v1.read_namespaced_deployment(namespace=namespace, name=name)
available = bool(deployment.status.available_replicas)
error_reasons = ['InvalidImageName', 'CrashLoopBackOff', 'ErrImagePull']
correct_reasons = ['ContainerCreating']

if available:
return AvailabilityCheckResult(True, "Deployment has just become available")

if not check_containers_exist(core_v1, namespace, name, NUM_RETRIES, POLL_INTERVAL):
return AvailabilityCheckResult(False, "No containers available in the deployment. "
"Are there sufficient resources?")

pod = core_v1.list_namespaced_pod(namespace=namespace,
label_selector='name={}'.format(name)).items[0]

state = pod.status.container_statuses[0].state
restart_count = pod.status.container_statuses[0].restart_count
ready_condition = [c for c in pod.status.conditions if c.type == 'Ready'][0]

if (ready_condition.status == 'False' and
ready_condition.reason == 'ContainersNotReady' and
restart_count > MAX_RESTARTS):
log = get_log(name, core_v1, namespace)
err_msg = "Readiness check failed. Verify that health check URL responds with " \
"status code 200. Connector's standard output:\n{}".format(log)
return AvailabilityCheckResult(False, err_msg)

if state.waiting:
if state.waiting.reason in error_reasons:
error_message = error_report(name,
core_v1,
namespace,
state.waiting.reason,
state.waiting.message)
return AvailabilityCheckResult(False, error_message)
elif state.waiting.reason not in correct_reasons:
err_msg = "Unexpected error: {} {}".format(state.waiting.reason,
state.waiting.message)
return AvailabilityCheckResult(False, err_msg)

sleep(POLL_INTERVAL)
sys.stdout.write('.')
sys.stdout.flush()
if datetime.now() > max_time:
timeout_str = "Timed out after waiting {} seconds for deployment " \
"to become available".format(TIMEOUT)
return AvailabilityCheckResult(False, timeout_str)


def check_containers_exist(core_v1, namespace, name, num_retries=5, poll_interval=5):
attempt_num = 0
while True:
pod = core_v1.list_namespaced_pod(namespace=namespace,
label_selector='name={}'.format(name)).items[0]
if pod.status.container_statuses:
return True

sleep(poll_interval)
attempt_num += 1
if attempt_num >= num_retries:
break

if not pod.status.container_statuses:
return False


def get_log(name, core_v1, namespace):
pods = core_v1.list_namespaced_pod(namespace=namespace,
label_selector='name={}'.format(name)).items
log = core_v1.read_namespaced_pod_log(namespace=namespace, name=pods[0].metadata.name)
return log


def error_report(name, api, namespace, reason, message):
if reason == 'CrashLoopBackOff':
log = get_log(name, api, namespace)
return "Container failed to start. Logs:\n{}".format(log)
return "Error code: {};\n\nMessage: {}".format(reason, message)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
setup(
name='apsconnectcli',
author='Ingram Micro',
version='1.7.20',
version='1.7.21',
keywords='aps apsconnect connector automation',
extras_require={
':python_version<="2.7"': ['backports.tempfile==1.0']},
Expand Down
22 changes: 22 additions & 0 deletions tests/test_apsconnect_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
_to_bytes,
)

from apsconnectcli.cluster import AvailabilityCheckResult

from tests.fakes import FakeData, FakeK8sApi
from tests import utils

Expand Down Expand Up @@ -511,6 +513,26 @@ def test_creation_over_pods_and_bad_replicas(self):
assert_core_v1_fn=self._assert_core_v1_called_for_pods)


class CreateDeploymentPollingTest(TestCase):
def test_poll_ok(self):
api = MagicMock()
with patch('apsconnectcli.apsconnect.poll_deployment') as poll_deployment_mock, \
patch('apsconnectcli.apsconnect.sys') as sys_mock:
poll_deployment_mock.return_value = AvailabilityCheckResult(True, 'OK')
_create_deployment('name', 'image', api)
self.assertTrue(poll_deployment_mock.called)
sys_mock.exit.assert_not_called()

def test_poll_error(self):
api = MagicMock()
with patch('apsconnectcli.apsconnect.poll_deployment') as poll_deployment_mock, \
patch('apsconnectcli.apsconnect.sys') as sys_mock:
poll_deployment_mock.return_value = AvailabilityCheckResult(False, 'Error')
_create_deployment('name', 'image', api)
self.assertTrue(poll_deployment_mock.called)
sys_mock.exit.assert_called_with(1)


class CreateServiceTest(TestCase):
"""Tests for _create_service()"""

Expand Down
Loading

0 comments on commit 39e92f3

Please sign in to comment.