Skip to content

Commit

Permalink
`wait replication-sync` returns a valid exit code on a ClickHouse connection error
Browse files Browse the repository at this point in the history
  • Loading branch information
MedvedewEM committed Sep 6, 2024
1 parent b3e694d commit 671de48
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 2 deletions.
9 changes: 7 additions & 2 deletions ch_tools/chadmin/cli/wait_group.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os
import sys
import requests
import time

from click import FloatRange, group, option, pass_context
from requests.exceptions import ReadTimeout

from ch_tools.chadmin.cli.chadmin_group import Chadmin
from ch_tools.chadmin.internal.clickhouse_disks import S3_METADATA_STORE_PATH
Expand Down Expand Up @@ -112,14 +112,19 @@ def wait_replication_sync_command(
timeout=timeout,
settings={"receive_timeout": timeout},
)
except ReadTimeout:
except requests.exceptions.ReadTimeout:
logging.error("Timeout while running SYNC REPLICA on {}.", full_name)
sys.exit(1)
except ClickhouseError as e:
if "TIMEOUT_EXCEEDED" in str(e):
logging.error("Timeout while running SYNC REPLICA on {}.", full_name)
sys.exit(1)
raise
except requests.exceptions.ConnectionError:
logging.error(
"Connection error while running SYNC REPLICA on {}.", full_name
)
sys.exit(1)

# Replication lag
while time.time() < deadline:
Expand Down
20 changes: 20 additions & 0 deletions tests/features/chadmin.feature
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,28 @@ Feature: chadmin commands.
"""
When we execute command on clickhouse01
"""
supervisorctl stop clickhouse-server
"""
And we sleep for 5 seconds
When we execute command on clickhouse01
"""
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4
"""
Then it fails with response contains
"""
Connection error while running SYNC REPLICA on
"""
Then last command completed with error exit code on clickhouse01
When we execute command on clickhouse01
"""
supervisorctl start clickhouse-server
"""
And we sleep for 5 seconds
When we execute command on clickhouse01
"""
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4
"""
Then last command completed with success exit code on clickhouse01

Scenario Outline: Check replica restore (<replicas_count> replicas, <workers> workers)
Given populated clickhouse with <replicas_count> replicated tables on clickhouse01 with db database and table_ prefix
Expand Down
10 changes: 10 additions & 0 deletions tests/steps/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,13 @@ def working_http(context):
host, port = docker.get_exposed_port(container, 8080)
response = requests.get(f"http://{host}:{port}/")
assert response.text == "OK", f'expected "OK", got "{response.text}"'


@then("last command completed with {result} exit code on {node:w}")
def step_last_command_completed_with_exit_code(context, result, node):
    """
    Assert on the exit code observed in the container of `node`.

    Runs `echo $?` in the container as root, stores the decoded output in
    `context.response` and the exit code in `context.exit_code`, then asserts
    that the exit code matches the expectation from the step text.

    `result` is the word captured from the step text: "success" means an
    expected exit code of 0, any other value means an expected exit code of 1.
    """
    container = docker.get_container(context, node)
    # Keep the exec result under its own name: rebinding `result` here would
    # discard the step argument and make the "success" comparison below
    # always false.
    # NOTE(review): the command is passed as an argument list, so `$?` is
    # presumably not expanded by a shell and the recorded exit code is that of
    # `echo` itself - confirm this reflects the previously executed command.
    exec_result = container.exec_run(["echo", "$?"], user="root")
    context.response = exec_result.output.decode().strip()
    context.exit_code = exec_result.exit_code
    expected_exit_code = 0 if result == "success" else 1
    assert_that(context.exit_code, equal_to(expected_exit_code))

0 comments on commit 671de48

Please sign in to comment.