From 671de48aa83ebb050f4d2d825e20e00a9b7079b7 Mon Sep 17 00:00:00 2001
From: Egor Medvedev
Date: Fri, 6 Sep 2024 13:54:48 +0100
Subject: [PATCH] Wait replication-sync returns valid exit-code while CH connection error

---
 ch_tools/chadmin/cli/wait_group.py |  9 +++++++--
 tests/features/chadmin.feature     | 20 ++++++++++++++++++++
 tests/steps/common.py              | 21 +++++++++++++++++++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/ch_tools/chadmin/cli/wait_group.py b/ch_tools/chadmin/cli/wait_group.py
index 5691446d..cd378248 100644
--- a/ch_tools/chadmin/cli/wait_group.py
+++ b/ch_tools/chadmin/cli/wait_group.py
@@ -1,9 +1,9 @@
 import os
 import sys
 import time
 
+import requests
 from click import FloatRange, group, option, pass_context
-from requests.exceptions import ReadTimeout
 
 from ch_tools.chadmin.cli.chadmin_group import Chadmin
 from ch_tools.chadmin.internal.clickhouse_disks import S3_METADATA_STORE_PATH
@@ -112,7 +112,7 @@ def wait_replication_sync_command(
                 timeout=timeout,
                 settings={"receive_timeout": timeout},
             )
-        except ReadTimeout:
+        except requests.exceptions.ReadTimeout:
             logging.error("Timeout while running SYNC REPLICA on {}.", full_name)
             sys.exit(1)
         except ClickhouseError as e:
@@ -120,6 +120,11 @@ def wait_replication_sync_command(
                 logging.error("Timeout while running SYNC REPLICA on {}.", full_name)
                 sys.exit(1)
             raise
+        except requests.exceptions.ConnectionError:
+            logging.error(
+                "Connection error while running SYNC REPLICA on {}.", full_name
+            )
+            sys.exit(1)
 
     # Replication lag
     while time.time() < deadline:
diff --git a/tests/features/chadmin.feature b/tests/features/chadmin.feature
index be762541..d6a37b33 100644
--- a/tests/features/chadmin.feature
+++ b/tests/features/chadmin.feature
@@ -153,8 +153,28 @@ Feature: chadmin commands.
     """
     When we execute command on clickhouse01
     """
+    supervisorctl stop clickhouse-server
+    """
+    And we sleep for 5 seconds
+    When we execute command on clickhouse01
+    """
+    chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4
+    """
+    Then it fails with response contains
+    """
+    Connection error while running SYNC REPLICA on
+    """
+    Then last command completed with error exit code on clickhouse01
+    When we execute command on clickhouse01
+    """
+    supervisorctl start clickhouse-server
+    """
+    And we sleep for 5 seconds
+    When we execute command on clickhouse01
+    """
     chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4
     """
+    Then last command completed with success exit code on clickhouse01
 
   Scenario Outline: Check replica restore ( replicas, workers)
     Given populated clickhouse with replicated tables on clickhouse01 with db database and table_ prefix
diff --git a/tests/steps/common.py b/tests/steps/common.py
index 4413fe19..d64b212d 100644
--- a/tests/steps/common.py
+++ b/tests/steps/common.py
@@ -115,3 +115,24 @@ def working_http(context):
     host, port = docker.get_exposed_port(container, 8080)
     response = requests.get(f"http://{host}:{port}/")
     assert response.text == "OK", f'expected "OK", got "{response.text}"'
+
+
+@then("last command completed with {result} exit code on {node:w}")
+def step_last_command_completed_with_exit_code(context, result, node):
+    """
+    Check the exit code of the last command executed in the scenario.
+
+    `result` is "success" (exit code 0) or "error" (exit code 1); any other
+    word raises ValueError. `node` is only part of the step wording.
+    """
+    # A separate `echo $?` exec cannot report the status of a command that
+    # already finished in another exec, and it would overwrite the saved
+    # response, so the previously recorded exit code is checked instead.
+    # NOTE(review): assumes the "we execute command on ..." step stores the
+    # command exit code in context.exit_code - confirm.
+    expected_exit_codes = {"success": 0, "error": 1}
+    if result not in expected_exit_codes:
+        raise ValueError(
+            f'unexpected result "{result}", expected "success" or "error"'
+        )
+    assert_that(context.exit_code, equal_to(expected_exit_codes[result]))