diff --git a/ch_tools/monrun_checks/ch_backup.py b/ch_tools/monrun_checks/ch_backup.py index 4fa8ae83..880f6d47 100644 --- a/ch_tools/monrun_checks/ch_backup.py +++ b/ch_tools/monrun_checks/ch_backup.py @@ -6,7 +6,7 @@ import os.path from datetime import datetime, timedelta, timezone from os.path import exists -from typing import Dict, List, Tuple +from typing import Dict, List, Sequence from click import Context from cloup import command, option, pass_context @@ -21,6 +21,10 @@ LOAD_MONITOR_FLAG_PATH = "/tmp/load-monitor-userfault.flag" RESTORE_CONTEXT_PATH = "/tmp/ch_backup_restore_state.json" FAILED_PARTS_THRESHOLD = 10 +USERFAULT_ERRORS = [ + "Disk quota exceeded", + "NOT_ENOUGH_SPACE", +] @command("backup") @@ -92,10 +96,13 @@ def backup_command( result = _merge_results( _check_valid_backups_exist(backups), _check_last_backup_not_failed( - backups, failed_backup_count_crit_threshold + backups, + failed_backup_count_crit_threshold, ), _check_backup_age( - backups, backup_age_warn_threshold, backup_age_crit_threshold + backups, + backup_age_warn_threshold, + backup_age_crit_threshold, ), _check_backup_count(backups, backup_count_warn_threshold), _check_restored_data(), @@ -109,7 +116,6 @@ def backup_command( result, min_uptime, backups, - failed_backup_count_crit_threshold, ) return result @@ -126,8 +132,8 @@ def _check_valid_backups_exist(backups: List[Dict]) -> Result: return Result(CRIT, "No valid backups found") -def _count_failed_backups(backups: List[Dict]) -> Tuple[int, int]: - counter, userfault_counter = 0, 0 +def _count_failed_backups(backups: List[Dict]) -> int: + counter = 0 for i, backup in enumerate(backups): state = backup["state"] @@ -135,18 +141,16 @@ def _count_failed_backups(backups: List[Dict]) -> Tuple[int, int]: break if (state == "failed") or (state == "creating" and i > 0): - if "exception" in backup and _is_userfault_exception(backup["exception"]): - userfault_counter += 1 counter += 1 - return counter, userfault_counter + return counter def _check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> Result: """ Check that the last backup is not failed. Its status must be 'created' or 'creating'. """ - counter, _ = _count_failed_backups(backups) + counter = _count_failed_backups(backups) if counter == 0: return Result(OK) @@ -161,7 +165,9 @@ def _check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> R def _check_backup_age( - backups: List[Dict], warn_threshold: timedelta, crit_threshold: timedelta + backups: List[Dict], + warn_threshold: timedelta, + crit_threshold: timedelta, ) -> Result: """ Check that the last backup is not too old. @@ -194,7 +200,8 @@ def _check_backup_age( def _check_backup_count( - backups: List[Dict], backup_count_warn_threshold: int + backups: List[Dict], + backup_count_warn_threshold: int, ) -> Result: """ Check that the number of backups is not too large. @@ -239,7 +246,6 @@ def _suppress_if_required( result: Result, min_uptime: timedelta, backups: List[Dict], - failed_backup_count_crit_threshold: int, ) -> None: if result.code == OK: return @@ -252,13 +258,9 @@ def _suppress_if_required( result.code = WARNING result.message += " (suppressed by low ClickHouse uptime)" - counter, userfault_counter = _count_failed_backups(backups) - if ( - userfault_counter > 0 - and counter - userfault_counter < failed_backup_count_crit_threshold - ): + if _is_backup_failed_by_userfault_error(backups): result.code = WARNING - result.message += " (suppressed due to user fault backup exceptions)" + result.message += " (suppressed due to user fault errors)" def _get_uptime(ctx: Context) -> timedelta: @@ -288,13 +290,18 @@ def _merge_results(*results: Result) -> Result: return merged_result -def _is_userfault_exception(exception): - """ - Check if exception was caused by user. - Current list: - * Disk quota exceeded - """ - if not exception: +def _is_backup_failed_by_userfault_error(backups: Sequence[Dict]) -> bool: + failed_backup = None + for i, backup in enumerate(backups): + state = backup["state"] + if state == "creating" and i == 0: + continue + if state == "failed": + failed_backup = backup + break + + if not failed_backup: return False - return "Disk quota exceeded" in exception + exception_msg = failed_backup.get("exception") or "" + return any(value in exception_msg for value in USERFAULT_ERRORS) diff --git a/tests/unit/monrun/test_last_backup.py b/tests/unit/monrun/test_backup.py similarity index 60% rename from tests/unit/monrun/test_last_backup.py rename to tests/unit/monrun/test_backup.py index 3e45f56a..31203f70 100644 --- a/tests/unit/monrun/test_last_backup.py +++ b/tests/unit/monrun/test_backup.py @@ -1,28 +1,20 @@ from typing import Dict, Sequence -from hamcrest import assert_that, equal_to from pytest import mark -from ch_tools.monrun_checks.ch_backup import _count_failed_backups +from ch_tools.monrun_checks.ch_backup import _is_backup_failed_by_userfault_error @mark.parametrize( - ["backups", "userfault_expected"], + ["backups", "result"], [ - (({"state": "created"},), 0), - ( - ( - {"state": "failed", "exception": None}, - {"state": "created"}, - ), - 0, - ), + (({"state": "created"},), False), ( ( {"state": "failed"}, {"state": "created"}, ), - 0, + False, ), ( ( @@ -30,34 +22,35 @@ {"state": "failed"}, {"state": "created"}, ), - 0, + False, ), ( ( - {"state": "failed", "exception": "Disk quota exceeded"}, - {"state": "failed"}, - {"state": "failed"}, + {"state": "failed", "exception": None}, {"state": "created"}, ), - 1, + False, ), ( ( - {"state": "failed", "exception": "God's will"}, + { + "state": "failed", + "exception": "ClickhouseError: Code: 243. DB::Exception: Cannot reserve 1.00 MiB, not enough space. (NOT_ENOUGH_SPACE) (version 23.8.14.6 (official build))", + }, {"state": "failed"}, {"state": "failed"}, {"state": "created"}, ), - 0, + True, ), ( ( - {"state": "failed", "exception": None}, + {"state": "failed", "exception": "God's will"}, {"state": "failed"}, {"state": "failed"}, {"state": "created"}, ), - 0, + False, ), ( ( @@ -66,12 +59,11 @@ {"state": "failed", "exception": "Disk quota exceeded"}, {"state": "created"}, ), - 3, + True, ), ], ) -def test_last_backup_not_failed( - backups: Sequence[Dict], userfault_expected: Sequence[int] +def test_is_backup_failed_by_userfault_error( + backups: Sequence[Dict], result: bool ) -> None: - _, userfault = _count_failed_backups(list(backups)) - assert_that(userfault, equal_to(userfault_expected)) + assert _is_backup_failed_by_userfault_error(backups) == result