Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of NOT_ENOUGH_SPACE errors in "ch-monitoring backup" command #220

Merged
merged 2 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 34 additions & 27 deletions ch_tools/monrun_checks/ch_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import os.path
from datetime import datetime, timedelta, timezone
from os.path import exists
from typing import Dict, List, Tuple
from typing import Dict, List, Sequence

from click import Context
from cloup import command, option, pass_context
Expand All @@ -21,6 +21,10 @@
LOAD_MONITOR_FLAG_PATH = "/tmp/load-monitor-userfault.flag"
RESTORE_CONTEXT_PATH = "/tmp/ch_backup_restore_state.json"
FAILED_PARTS_THRESHOLD = 10
USERFAULT_ERRORS = [
"Disk quota exceeded",
"NOT_ENOUGH_SPACE",
]


@command("backup")
Expand Down Expand Up @@ -92,10 +96,13 @@ def backup_command(
result = _merge_results(
_check_valid_backups_exist(backups),
_check_last_backup_not_failed(
backups, failed_backup_count_crit_threshold
backups,
failed_backup_count_crit_threshold,
),
_check_backup_age(
backups, backup_age_warn_threshold, backup_age_crit_threshold
backups,
backup_age_warn_threshold,
backup_age_crit_threshold,
),
_check_backup_count(backups, backup_count_warn_threshold),
_check_restored_data(),
Expand All @@ -109,7 +116,6 @@ def backup_command(
result,
min_uptime,
backups,
failed_backup_count_crit_threshold,
)

return result
Expand All @@ -126,27 +132,25 @@ def _check_valid_backups_exist(backups: List[Dict]) -> Result:
return Result(CRIT, "No valid backups found")


def _count_failed_backups(backups: List[Dict]) -> Tuple[int, int]:
counter, userfault_counter = 0, 0
def _count_failed_backups(backups: List[Dict]) -> int:
counter = 0
for i, backup in enumerate(backups):
state = backup["state"]

if state == "created":
break

if (state == "failed") or (state == "creating" and i > 0):
if "exception" in backup and _is_userfault_exception(backup["exception"]):
userfault_counter += 1
counter += 1

return counter, userfault_counter
return counter


def _check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> Result:
"""
Check that the last backup is not failed. Its status must be 'created' or 'creating'.
"""
counter, _ = _count_failed_backups(backups)
counter = _count_failed_backups(backups)

if counter == 0:
return Result(OK)
Expand All @@ -161,7 +165,9 @@ def _check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> R


def _check_backup_age(
backups: List[Dict], warn_threshold: timedelta, crit_threshold: timedelta
backups: List[Dict],
warn_threshold: timedelta,
crit_threshold: timedelta,
) -> Result:
"""
Check that the last backup is not too old.
Expand Down Expand Up @@ -194,7 +200,8 @@ def _check_backup_age(


def _check_backup_count(
backups: List[Dict], backup_count_warn_threshold: int
backups: List[Dict],
backup_count_warn_threshold: int,
) -> Result:
"""
Check that the number of backups is not too large.
Expand Down Expand Up @@ -239,7 +246,6 @@ def _suppress_if_required(
result: Result,
min_uptime: timedelta,
backups: List[Dict],
failed_backup_count_crit_threshold: int,
) -> None:
if result.code == OK:
return
Expand All @@ -252,13 +258,9 @@ def _suppress_if_required(
result.code = WARNING
result.message += " (suppressed by low ClickHouse uptime)"

counter, userfault_counter = _count_failed_backups(backups)
if (
userfault_counter > 0
and counter - userfault_counter < failed_backup_count_crit_threshold
):
if _is_backup_failed_by_userfault_error(backups):
result.code = WARNING
result.message += " (suppressed due to user fault backup exceptions)"
result.message += " (suppressed due to user fault errors)"


def _get_uptime(ctx: Context) -> timedelta:
Expand Down Expand Up @@ -288,13 +290,18 @@ def _merge_results(*results: Result) -> Result:
return merged_result


def _is_userfault_exception(exception):
"""
Check if exception was caused by user.
Current list:
* Disk quota exceeded
"""
if not exception:
def _is_backup_failed_by_userfault_error(backups: Sequence[Dict]) -> bool:
failed_backup = None
for i, backup in enumerate(backups):
state = backup["state"]
if state == "creating" and i == 0:
continue
if state == "failed":
failed_backup = backup
break

if not failed_backup:
return False

return "Disk quota exceeded" in exception
exception_msg = failed_backup.get("exception") or ""
return any(value in exception_msg for value in USERFAULT_ERRORS)
Original file line number Diff line number Diff line change
@@ -1,63 +1,56 @@
from typing import Dict, Sequence

from hamcrest import assert_that, equal_to
from pytest import mark

from ch_tools.monrun_checks.ch_backup import _count_failed_backups
from ch_tools.monrun_checks.ch_backup import _is_backup_failed_by_userfault_error


@mark.parametrize(
["backups", "userfault_expected"],
["backups", "result"],
[
(({"state": "created"},), 0),
(
(
{"state": "failed", "exception": None},
{"state": "created"},
),
0,
),
(({"state": "created"},), False),
(
(
{"state": "failed"},
{"state": "created"},
),
0,
False,
),
(
(
{"state": "failed"},
{"state": "failed"},
{"state": "created"},
),
0,
False,
),
(
(
{"state": "failed", "exception": "Disk quota exceeded"},
{"state": "failed"},
{"state": "failed"},
{"state": "failed", "exception": None},
{"state": "created"},
),
1,
False,
),
(
(
{"state": "failed", "exception": "God's will"},
{
"state": "failed",
"exception": "ClickhouseError: Code: 243. DB::Exception: Cannot reserve 1.00 MiB, not enough space. (NOT_ENOUGH_SPACE) (version 23.8.14.6 (official build))",
},
{"state": "failed"},
{"state": "failed"},
{"state": "created"},
),
0,
True,
),
(
(
{"state": "failed", "exception": None},
{"state": "failed", "exception": "God's will"},
{"state": "failed"},
{"state": "failed"},
{"state": "created"},
),
0,
False,
),
(
(
Expand All @@ -66,12 +59,11 @@
{"state": "failed", "exception": "Disk quota exceeded"},
{"state": "created"},
),
3,
True,
),
],
)
def test_last_backup_not_failed(
backups: Sequence[Dict], userfault_expected: Sequence[int]
def test_is_backup_failed_by_userfault_error(
backups: Sequence[Dict], result: bool
) -> None:
_, userfault = _count_failed_backups(list(backups))
assert_that(userfault, equal_to(userfault_expected))
assert _is_backup_failed_by_userfault_error(backups) == result
Loading