From 90f0833f92f041b274c8d46010b80a3ba13bad59 Mon Sep 17 00:00:00 2001 From: Nikolay Ustinov Date: Thu, 11 Nov 2021 21:46:41 +0700 Subject: [PATCH 1/2] Prometheus Metrics View impl --- .gitignore | 2 + README.rst | 82 ++++++++++++++++++++ health_check/backends.py | 34 +++++++- health_check/conf.py | 2 + health_check/contrib/celery/apps.py | 4 +- health_check/contrib/celery/backends.py | 2 +- health_check/contrib/celery_ping/backends.py | 14 +++- health_check/exceptions.py | 4 + health_check/views.py | 22 +++++- setup.cfg | 2 + tests/test_celery_ping.py | 24 ++++++ tests/test_db.py | 27 +++++++ tests/test_migrations.py | 20 +++++ tests/test_views.py | 12 +++ 14 files changed, 243 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index f3d03de8..a6246fe3 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,5 @@ ENV/ # pytest .pytest_cache/ + +.idea diff --git a/README.rst b/README.rst index 0625ecda..7b7e1c73 100644 --- a/README.rst +++ b/README.rst @@ -291,6 +291,87 @@ This should yield the following output: Similar to the http version, a critical error will cause the command to quit with the exit code `1`. +Prometheus Support +------------------ + +You can get metrics and healthcheck status in Prometheus format. +It relies on the official Prometheus client package, prometheus-client_. + +To enable this format, set the following option: + +.. code:: python + + HEALTH_CHECK = { + 'USE_PROMETHEUS': True, + } + +To get metrics in Prometheus format: + +.. 
code:: + + $ curl -v -X GET "http://example.com/ht/" -H "Accept: text/plain" + + < HTTP/1.1 200 OK + < Date: Tue, 16 Nov 2021 10:44:40 GMT + < Server: WSGIServer/0.2 CPython/3.8.2 + < Content-Type: text/plain; version=0.0.4; charset=utf-8 + < Expires: Tue, 16 Nov 2021 10:44:40 GMT + < Cache-Control: max-age=0, no-cache, no-store, must-revalidate, private + < X-Frame-Options: DENY + < Content-Length: 2050 + < X-Content-Type-Options: nosniff + < Referrer-Policy: same-origin + + # HELP python_gc_objects_collected_total Objects collected during gc + # TYPE python_gc_objects_collected_total counter + python_gc_objects_collected_total{generation="0"} 51643.0 + python_gc_objects_collected_total{generation="1"} 4985.0 + python_gc_objects_collected_total{generation="2"} 209.0 + # HELP python_gc_objects_uncollectable_total Uncollectable object found during GC + # TYPE python_gc_objects_uncollectable_total counter + python_gc_objects_uncollectable_total{generation="0"} 0.0 + python_gc_objects_uncollectable_total{generation="1"} 0.0 + python_gc_objects_uncollectable_total{generation="2"} 0.0 + # HELP python_gc_collections_total Number of times this generation was collected + # TYPE python_gc_collections_total counter + python_gc_collections_total{generation="0"} 441.0 + python_gc_collections_total{generation="1"} 40.0 + python_gc_collections_total{generation="2"} 3.0 + # HELP python_info Python platform information + # TYPE python_info gauge + python_info{implementation="CPython",major="3",minor="8",patchlevel="2",version="3.8.2"} 1.0 + # HELP app_disk_usage_status Check status of DiskUsage + # TYPE app_disk_usage_status gauge + app_disk_usage_status 1.0 + # HELP app_memory_usage_status Check status of MemoryUsage + # TYPE app_memory_usage_status gauge + app_memory_usage_status 1.0 + # HELP app_rabbit_m_q_health_check_status Check status of RabbitMQHealthCheck + # TYPE app_rabbit_m_q_health_check_status gauge + app_rabbit_m_q_health_check_status 1.0 + # HELP 
app_database_backend_status Check status of DatabaseBackend + # TYPE app_database_backend_status gauge + app_database_backend_status 1.0 + + +Second way: + +.. code:: + + $ curl -v -X GET http://www.example.com/ht/?format=prometheus + + +Metrics with `python_` prefix are default metrics from prometheus-client_. Metrics with `app_` prefix are custom metrics from healthchecks. + +If you need to change the base path to metrics, then you can do this in urls.py in your Django application: + +.. code:: python + + urlpatterns = [ + # ... + url(r'^metrics/', include('health_check.urls')), + ] + Other resources --------------- @@ -309,3 +390,4 @@ Other resources .. _Pingdom: https://www.pingdom.com/ .. _django-watchman: https://github.com/mwarkentin/django-watchman .. _weblog: https://www.vincit.fi/en/blog/deploying-django-to-elastic-beanstalk-with-https-redirects-and-functional-health-checks/ +.. _prometheus-client: https://github.com/prometheus/client_python diff --git a/health_check/backends.py b/health_check/backends.py index 55186c3c..a7ae6d90 100644 --- a/health_check/backends.py +++ b/health_check/backends.py @@ -1,9 +1,14 @@ import logging +import re from timeit import default_timer as timer from django.utils.translation import gettext_lazy as _ # noqa: N812 +from prometheus_client import REGISTRY, Gauge, Metric -from health_check.exceptions import HealthCheckException +from health_check.conf import HEALTH_CHECK +from health_check.exceptions import ( + BadPrometheusMetricType, HealthCheckException +) logger = logging.getLogger('health-check') @@ -19,6 +24,31 @@ class BaseHealthCheckBackend: def __init__(self): self.errors = [] + self.use_prometheus = HEALTH_CHECK['USE_PROMETHEUS'] + + @property + def class_name_to_snake_case(self): + return re.sub(r'(?<!^)(?=[A-Z])', '_', self.__class__.__name__).lower() + + @property + def prometheus_status_metric_name(self) ->
str: + return f"{self.class_name_to_snake_case}_status" + + @property + def prometheus_status_metric(self) -> Gauge: + return self.get_prometheus_metric(Gauge, self.prometheus_status_metric_name) + + def get_prometheus_metric(self, metric_type: type, name, description: str = None): + if issubclass(metric_type, Metric): + raise BadPrometheusMetricType(f"Metric type '{metric_type}' isn't subclass of prometheus_client.Metric") + + name = f"{HEALTH_CHECK['PROMETHEUS_METRIC_NAMESPACE']}_{name}" + description = description or f"Check status of {self.identifier()}" + + if name in REGISTRY._get_names(REGISTRY): + return REGISTRY._names_to_collectors[name] + + return metric_type(name, description) def check_status(self): raise NotImplementedError @@ -35,6 +65,8 @@ def run_check(self): raise finally: self.time_taken = timer() - start + if self.use_prometheus: + self.prometheus_status_metric.set(0 if len(self.errors) else 1) def add_error(self, error, cause=None): if isinstance(error, HealthCheckException): diff --git a/health_check/conf.py b/health_check/conf.py index 99465f81..85eddf4c 100644 --- a/health_check/conf.py +++ b/health_check/conf.py @@ -4,3 +4,5 @@ HEALTH_CHECK.setdefault('DISK_USAGE_MAX', 90) HEALTH_CHECK.setdefault('MEMORY_MIN', 100) HEALTH_CHECK.setdefault('WARNINGS_AS_ERRORS', True) +HEALTH_CHECK.setdefault('USE_PROMETHEUS', False) +HEALTH_CHECK.setdefault('PROMETHEUS_METRIC_NAMESPACE', 'app') diff --git a/health_check/contrib/celery/apps.py b/health_check/contrib/celery/apps.py index 237581ba..e465c47c 100644 --- a/health_check/contrib/celery/apps.py +++ b/health_check/contrib/celery/apps.py @@ -1,8 +1,8 @@ +import warnings + from celery import current_app from django.apps import AppConfig from django.conf import settings -import warnings - from health_check.plugins import plugin_dir diff --git a/health_check/contrib/celery/backends.py b/health_check/contrib/celery/backends.py index 9b4c3bed..272adca5 100644 --- a/health_check/contrib/celery/backends.py 
+++ b/health_check/contrib/celery/backends.py @@ -1,3 +1,4 @@ +from celery.exceptions import TaskRevokedError, TimeoutError from django.conf import settings from health_check.backends import BaseHealthCheckBackend @@ -6,7 +7,6 @@ ) from .tasks import add -from celery.exceptions import TaskRevokedError, TimeoutError class CeleryHealthCheck(BaseHealthCheckBackend): diff --git a/health_check/contrib/celery_ping/backends.py b/health_check/contrib/celery_ping/backends.py index 4c2a2d34..a027199e 100644 --- a/health_check/contrib/celery_ping/backends.py +++ b/health_check/contrib/celery_ping/backends.py @@ -1,5 +1,6 @@ from celery.app import default_app as app from django.conf import settings +from prometheus_client import Gauge from health_check.backends import BaseHealthCheckBackend from health_check.exceptions import ServiceUnavailable @@ -8,6 +9,10 @@ class CeleryPingHealthCheck(BaseHealthCheckBackend): CORRECT_PING_RESPONSE = {"ok": "pong"} + @property + def prometheus_active_queues_amount(self) -> Gauge: + return self.get_prometheus_metric(Gauge, "celery_active_queues", "Amount of active queues") + def check_status(self): timeout = getattr(settings, "HEALTHCHECK_CELERY_PING_TIMEOUT", 1) @@ -47,13 +52,15 @@ def _check_ping_result(self, ping_result): active_workers.append(worker) if not self.errors: - self._check_active_queues(active_workers) + amount_queues = self._check_active_queues(active_workers) + if self.use_prometheus: + self.prometheus_active_queues_amount.set(amount_queues) - def _check_active_queues(self, active_workers): + def _check_active_queues(self, active_workers) -> int: defined_queues = app.conf.CELERY_QUEUES if not defined_queues: - return + return 0 defined_queues = set([queue.name for queue in defined_queues]) active_queues = set() @@ -65,3 +72,4 @@ def _check_active_queues(self, active_workers): self.add_error( ServiceUnavailable(f"No worker for Celery task queue {queue}"), ) + return len(active_queues) diff --git a/health_check/exceptions.py 
b/health_check/exceptions.py index d17ef686..4e4874e0 100644 --- a/health_check/exceptions.py +++ b/health_check/exceptions.py @@ -28,3 +28,7 @@ class ServiceUnavailable(HealthCheckException): class ServiceReturnedUnexpectedResult(HealthCheckException): message_type = _("unexpected result") + + +class BadPrometheusMetricType(Exception): + pass diff --git a/health_check/views.py b/health_check/views.py index a1552fee..48811068 100644 --- a/health_check/views.py +++ b/health_check/views.py @@ -1,8 +1,11 @@ +import os import re +import prometheus_client from django.http import HttpResponse, JsonResponse from django.views.decorators.cache import never_cache from django.views.generic import TemplateView +from prometheus_client import multiprocess from health_check.mixins import CheckMixin @@ -89,9 +92,14 @@ def get(self, request, *args, **kwargs): if format_override == 'json': return self.render_to_response_json(self.plugins, status_code) + if format_override == 'prometheus': + return self.render_to_response_prometheus(self.plugins, status_code) + accept_header = request.META.get('HTTP_ACCEPT', '*/*') for media in MediaType.parse_header(accept_header): - if media.mime_type in ('text/html', 'application/xhtml+xml', 'text/*', '*/*'): + if media.mime_type in ('text/plain',): + return self.render_to_response_prometheus(self.plugins, status_code) + elif media.mime_type in ('text/html', 'application/xhtml+xml', 'text/*', '*/*'): context = self.get_context_data(**kwargs) return self.render_to_response(context, status=status_code) elif media.mime_type in ('application/json', 'application/*'): @@ -110,3 +118,15 @@ def render_to_response_json(self, plugins, status): {str(p.identifier()): str(p.pretty_status()) for p in plugins}, status=status ) + + def render_to_response_prometheus(self, *_): + if "prometheus_multiproc_dir" in os.environ: + registry = prometheus_client.CollectorRegistry() + multiprocess.MultiProcessCollector(registry) + else: + registry = 
prometheus_client.REGISTRY + metrics_page = prometheus_client.generate_latest(registry) + return HttpResponse( + metrics_page, + content_type=prometheus_client.CONTENT_TYPE_LATEST, + ) diff --git a/setup.cfg b/setup.cfg index 9e986546..ec642581 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,12 +21,14 @@ classifier = keywords = django postgresql + prometheus [options] include_package_data = True packages = health_check install_requires = django>=2.2 + prometheus-client>=0.11 setup_requires = setuptools_scm sphinx diff --git a/tests/test_celery_ping.py b/tests/test_celery_ping.py index efb3a8ce..02ba6852 100644 --- a/tests/test_celery_ping.py +++ b/tests/test_celery_ping.py @@ -122,6 +122,30 @@ def test_check_status_add_error_when_ping_result_failed( assert len(health_check.errors) == 1 assert "workers unavailable" in health_check.errors[0].message.lower() + def test_prometheus_check_status_doesnt_add_errors_when_ping_successfull(self, health_check): + celery_worker = "celery@4cc150a7b49b" + + with patch( + self.CELERY_APP_CONTROL_PING, + return_value=[ + {celery_worker: CeleryPingHealthCheck.CORRECT_PING_RESPONSE}, + {f"{celery_worker}-2": CeleryPingHealthCheck.CORRECT_PING_RESPONSE}, + ], + ), patch( + self.CELERY_APP_CONTROL_INSPECT_ACTIVE_QUEUES, + return_value={ + celery_worker: [ + {"name": queue.name} for queue in settings.CELERY_QUEUES + ] + }, + ): + health_check.use_prometheus = True + health_check.check_status() + + assert not health_check.errors + assert health_check.prometheus_active_queues_amount._name == 'app_celery_active_queues' + assert health_check.prometheus_active_queues_amount._value.get() == 2.0 + class TestCeleryPingHealthCheckApps: def test_apps(self): diff --git a/tests/test_db.py b/tests/test_db.py index 20f45429..55416965 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -67,3 +67,30 @@ def test_raise_exception(self): db_backend = DatabaseBackend() with self.assertRaises(Exception): db_backend.run_check() + + 
@patch('health_check.db.backends.TestModel.objects.create', + lambda title=None: MockDBModel()) + def test_prometheus_check_status_works(self): + db_backend = DatabaseBackend() + db_backend.use_prometheus = True + + db_backend.run_check() + + self.assertFalse(db_backend.errors) + + self.assertEquals(db_backend.prometheus_status_metric_name, 'database_backend_status') + self.assertEquals(db_backend.prometheus_status_metric._value.get(), 1.0) + + @patch('health_check.db.backends.TestModel.objects.create', + lambda title=None: raise_(IntegrityError)) + def test_prometheus_raise_integrity_error(self): + db_backend = DatabaseBackend() + db_backend.use_prometheus = True + + db_backend.run_check() + + self.assertTrue(db_backend.errors) + self.assertIn('unexpected result: Integrity Error', db_backend.pretty_status()) + + self.assertEquals(db_backend.prometheus_status_metric_name, 'database_backend_status') + self.assertEquals(db_backend.prometheus_status_metric._value.get(), 0.0) diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 3e88008e..526ba54c 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -24,3 +24,23 @@ def test_check_status_raises_error_if_there_are_migrations(self): backend = MigrationsHealthCheck() backend.run_check() self.assertTrue(backend.errors) + + def test_prometheus_check_status_work(self): + with patch('health_check.contrib.migrations.backends.MigrationsHealthCheck.get_migration_plan', + return_value=[]): + backend = MigrationsHealthCheck() + backend.use_prometheus = True + backend.run_check() + self.assertFalse(backend.errors) + self.assertEquals(backend.prometheus_status_metric_name, 'migrations_health_check_status') + self.assertEquals(backend.prometheus_status_metric._value.get(), 1.0) + + def test_prometheus_check_status_raises_error_if_there_are_migrations(self): + with patch('health_check.contrib.migrations.backends.MigrationsHealthCheck.get_migration_plan', + return_value=[(MockMigration, False)]): + 
backend = MigrationsHealthCheck() + backend.use_prometheus = True + backend.run_check() + self.assertTrue(backend.errors) + self.assertEquals(backend.prometheus_status_metric_name, 'migrations_health_check_status') + self.assertEquals(backend.prometheus_status_metric._value.get(), 0.0) diff --git a/tests/test_views.py b/tests/test_views.py index 2ac2af03..469fd77e 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -1,5 +1,6 @@ import json +import prometheus_client import pytest from health_check.backends import BaseHealthCheckBackend @@ -164,6 +165,17 @@ def run_check(self): assert response['content-type'] == 'text/html; charset=utf-8' assert response.status_code == 200 + def test_success_accept_plain(self, client): + class SuccessBackend(BaseHealthCheckBackend): + def run_check(self): + pass + + plugin_dir.reset() + plugin_dir.register(SuccessBackend) + response = client.get(self.url, HTTP_ACCEPT='text/plain') + assert response['content-type'] == prometheus_client.CONTENT_TYPE_LATEST + assert response.status_code == 200 + def test_success_unsupported_accept(self, client): class SuccessBackend(BaseHealthCheckBackend): def run_check(self): From 7dc3ec35e32fb58dd661060564c2b86ec9777089 Mon Sep 17 00:00:00 2001 From: Nikolay Ustinov Date: Tue, 21 Dec 2021 21:21:54 +0700 Subject: [PATCH 2/2] CommonHealth Metric Impl. 
/ CLI command json output --- health_check/backends.py | 23 +++++++++++------ health_check/conf.py | 1 + .../management/commands/health_check.py | 25 +++++++++++++++++-- health_check/mixins.py | 15 +++++++++++ 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/health_check/backends.py b/health_check/backends.py index a7ae6d90..e35791be 100644 --- a/health_check/backends.py +++ b/health_check/backends.py @@ -53,9 +53,9 @@ def get_prometheus_metric(self, metric_type: type, name, description: str = None def check_status(self): raise NotImplementedError - def run_check(self): + def run_check(self, external_errors=None): start = timer() - self.errors = [] + self.errors = external_errors or [] try: self.check_status() except HealthCheckException as e: @@ -77,15 +77,18 @@ def add_error(self, error, cause=None): else: msg = _("unknown error") error = HealthCheckException(msg) - if isinstance(cause, BaseException): - logger.exception(str(error)) - else: - logger.error(str(error)) + + if HEALTH_CHECK['VERBOSE']: + if isinstance(cause, BaseException): + logger.exception(str(error)) + else: + logger.error(str(error)) + self.errors.append(error) def pretty_status(self): if self.errors: - return "\n".join(str(e) for e in self.errors) + return "; ".join(str(e) for e in self.errors) return _('working') @property @@ -94,3 +97,9 @@ def status(self): def identifier(self): return self.__class__.__name__ + + +class CommonHealth(BaseHealthCheckBackend): + + def check_status(self): + pass diff --git a/health_check/conf.py b/health_check/conf.py index 85eddf4c..6f7b30d2 100644 --- a/health_check/conf.py +++ b/health_check/conf.py @@ -6,3 +6,4 @@ HEALTH_CHECK.setdefault('WARNINGS_AS_ERRORS', True) HEALTH_CHECK.setdefault('USE_PROMETHEUS', False) HEALTH_CHECK.setdefault('PROMETHEUS_METRIC_NAMESPACE', 'app') +HEALTH_CHECK.setdefault('VERBOSE', True) diff --git a/health_check/management/commands/health_check.py b/health_check/management/commands/health_check.py index 
9964d0dc..199308c0 100644 --- a/health_check/management/commands/health_check.py +++ b/health_check/management/commands/health_check.py @@ -1,17 +1,34 @@ import sys +import json from django.core.management.base import BaseCommand +from health_check.conf import HEALTH_CHECK from health_check.mixins import CheckMixin class Command(CheckMixin, BaseCommand): help = "Run health checks and exit 0 if everything went well." + def add_arguments(self, parser): + parser.add_argument("--json-output", action="store_true", required=False) + parser.add_argument("--verbose", action="store_true", required=False) + def handle(self, *args, **options): + if not options["verbose"]: + HEALTH_CHECK["VERBOSE"] = False + # perform all checks errors = self.errors + if options["json_output"]: + + self.json_output() + else: + self.plain_output() + if errors: + sys.exit(1) + def plain_output(self): for plugin in self.plugins: style_func = self.style.SUCCESS if not plugin.errors else self.style.ERROR self.stdout.write( @@ -21,5 +38,9 @@ def handle(self, *args, **options): ) ) - if errors: - sys.exit(1) + def json_output(self): + metrics = { + p.identifier(): p.status + for p in self.plugins + } + self.stdout.write(json.dumps(metrics)) diff --git a/health_check/mixins.py b/health_check/mixins.py index 707a2f1b..6d76d853 100644 --- a/health_check/mixins.py +++ b/health_check/mixins.py @@ -1,6 +1,7 @@ import copy from concurrent.futures import ThreadPoolExecutor +from health_check.backends import CommonHealth from health_check.conf import HEALTH_CHECK from health_check.exceptions import ServiceWarning from health_check.plugins import plugin_dir @@ -9,6 +10,7 @@ class CheckMixin: _errors = None _plugins = None + _common_health = None @property def errors(self): @@ -25,6 +27,13 @@ def plugins(self): ), key=lambda plugin: plugin.identifier()) return self._plugins + @property + def common_health(self): + if not self._common_health: + self._common_health = CommonHealth() + 
self._plugins.append(self._common_health) + return self._common_health + def run_check(self): errors = [] @@ -36,8 +45,12 @@ def _run(plugin): from django.db import connections connections.close_all() + error_plugins = [] with ThreadPoolExecutor(max_workers=len(self.plugins) or 1) as executor: for plugin in executor.map(_run, self.plugins): + if plugin.errors: + error_plugins.append(plugin.identifier()) + if plugin.critical_service: if not HEALTH_CHECK['WARNINGS_AS_ERRORS']: errors.extend( @@ -47,4 +60,6 @@ def _run(plugin): else: errors.extend(plugin.errors) + self.common_health.run_check(external_errors=error_plugins) + return errors