Skip to content

Commit

Permalink
Fix handling of NOT_ENOUGH_SPACE errors in "ch-monitoring backup" com…
Browse files Browse the repository at this point in the history
…mand (#220)

* Fix handling of NOT_ENOUGH_SPACE errors in "ch-monitoring backup" command

* Clean up implementation and update tests
  • Loading branch information
Alex-Burmak authored Aug 15, 2024
1 parent d514716 commit 2854573
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 53 deletions.
61 changes: 34 additions & 27 deletions ch_tools/monrun_checks/ch_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import os.path
from datetime import datetime, timedelta, timezone
from os.path import exists
from typing import Dict, List, Tuple
from typing import Dict, List, Sequence

from click import Context
from cloup import command, option, pass_context
Expand All @@ -21,6 +21,10 @@
LOAD_MONITOR_FLAG_PATH = "/tmp/load-monitor-userfault.flag"
RESTORE_CONTEXT_PATH = "/tmp/ch_backup_restore_state.json"
FAILED_PARTS_THRESHOLD = 10
USERFAULT_ERRORS = [
"Disk quota exceeded",
"NOT_ENOUGH_SPACE",
]


@command("backup")
Expand Down Expand Up @@ -92,10 +96,13 @@ def backup_command(
result = _merge_results(
_check_valid_backups_exist(backups),
_check_last_backup_not_failed(
backups, failed_backup_count_crit_threshold
backups,
failed_backup_count_crit_threshold,
),
_check_backup_age(
backups, backup_age_warn_threshold, backup_age_crit_threshold
backups,
backup_age_warn_threshold,
backup_age_crit_threshold,
),
_check_backup_count(backups, backup_count_warn_threshold),
_check_restored_data(),
Expand All @@ -109,7 +116,6 @@ def backup_command(
result,
min_uptime,
backups,
failed_backup_count_crit_threshold,
)

return result
Expand All @@ -126,27 +132,25 @@ def _check_valid_backups_exist(backups: List[Dict]) -> Result:
return Result(CRIT, "No valid backups found")


def _count_failed_backups(backups: List[Dict]) -> Tuple[int, int]:
counter, userfault_counter = 0, 0
def _count_failed_backups(backups: List[Dict]) -> int:
counter = 0
for i, backup in enumerate(backups):
state = backup["state"]

if state == "created":
break

if (state == "failed") or (state == "creating" and i > 0):
if "exception" in backup and _is_userfault_exception(backup["exception"]):
userfault_counter += 1
counter += 1

return counter, userfault_counter
return counter


def _check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> Result:
"""
Check that the last backup is not failed. Its status must be 'created' or 'creating'.
"""
counter, _ = _count_failed_backups(backups)
counter = _count_failed_backups(backups)

if counter == 0:
return Result(OK)
Expand All @@ -161,7 +165,9 @@ def _check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> R


def _check_backup_age(
backups: List[Dict], warn_threshold: timedelta, crit_threshold: timedelta
backups: List[Dict],
warn_threshold: timedelta,
crit_threshold: timedelta,
) -> Result:
"""
Check that the last backup is not too old.
Expand Down Expand Up @@ -194,7 +200,8 @@ def _check_backup_age(


def _check_backup_count(
backups: List[Dict], backup_count_warn_threshold: int
backups: List[Dict],
backup_count_warn_threshold: int,
) -> Result:
"""
Check that the number of backups is not too large.
Expand Down Expand Up @@ -239,7 +246,6 @@ def _suppress_if_required(
result: Result,
min_uptime: timedelta,
backups: List[Dict],
failed_backup_count_crit_threshold: int,
) -> None:
if result.code == OK:
return
Expand All @@ -252,13 +258,9 @@ def _suppress_if_required(
result.code = WARNING
result.message += " (suppressed by low ClickHouse uptime)"

counter, userfault_counter = _count_failed_backups(backups)
if (
userfault_counter > 0
and counter - userfault_counter < failed_backup_count_crit_threshold
):
if _is_backup_failed_by_userfault_error(backups):
result.code = WARNING
result.message += " (suppressed due to user fault backup exceptions)"
result.message += " (suppressed due to user fault errors)"


def _get_uptime(ctx: Context) -> timedelta:
Expand Down Expand Up @@ -288,13 +290,18 @@ def _merge_results(*results: Result) -> Result:
return merged_result


def _is_userfault_exception(exception):
"""
Check if exception was caused by user.
Current list:
* Disk quota exceeded
"""
if not exception:
def _is_backup_failed_by_userfault_error(backups: Sequence[Dict]) -> bool:
failed_backup = None
for i, backup in enumerate(backups):
state = backup["state"]
if state == "creating" and i == 0:
continue
if state == "failed":
failed_backup = backup
break

if not failed_backup:
return False

return "Disk quota exceeded" in exception
exception_msg = failed_backup.get("exception") or ""
return any(value in exception_msg for value in USERFAULT_ERRORS)
Original file line number Diff line number Diff line change
@@ -1,63 +1,56 @@
from typing import Dict, Sequence

from hamcrest import assert_that, equal_to
from pytest import mark

from ch_tools.monrun_checks.ch_backup import _count_failed_backups
from ch_tools.monrun_checks.ch_backup import _is_backup_failed_by_userfault_error


@mark.parametrize(
["backups", "userfault_expected"],
["backups", "result"],
[
(({"state": "created"},), 0),
(
(
{"state": "failed", "exception": None},
{"state": "created"},
),
0,
),
(({"state": "created"},), False),
(
(
{"state": "failed"},
{"state": "created"},
),
0,
False,
),
(
(
{"state": "failed"},
{"state": "failed"},
{"state": "created"},
),
0,
False,
),
(
(
{"state": "failed", "exception": "Disk quota exceeded"},
{"state": "failed"},
{"state": "failed"},
{"state": "failed", "exception": None},
{"state": "created"},
),
1,
False,
),
(
(
{"state": "failed", "exception": "God's will"},
{
"state": "failed",
"exception": "ClickhouseError: Code: 243. DB::Exception: Cannot reserve 1.00 MiB, not enough space. (NOT_ENOUGH_SPACE) (version 23.8.14.6 (official build))",
},
{"state": "failed"},
{"state": "failed"},
{"state": "created"},
),
0,
True,
),
(
(
{"state": "failed", "exception": None},
{"state": "failed", "exception": "God's will"},
{"state": "failed"},
{"state": "failed"},
{"state": "created"},
),
0,
False,
),
(
(
Expand All @@ -66,12 +59,11 @@
{"state": "failed", "exception": "Disk quota exceeded"},
{"state": "created"},
),
3,
True,
),
],
)
def test_last_backup_not_failed(
backups: Sequence[Dict], userfault_expected: Sequence[int]
def test_is_backup_failed_by_userfault_error(
backups: Sequence[Dict], result: bool
) -> None:
_, userfault = _count_failed_backups(list(backups))
assert_that(userfault, equal_to(userfault_expected))
assert _is_backup_failed_by_userfault_error(backups) == result

0 comments on commit 2854573

Please sign in to comment.