Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prometheus Metrics View implementation #312

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,5 @@ ENV/

# pytest
.pytest_cache/

.idea
82 changes: 82 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,87 @@ This should yield the following output:
Similar to the http version, a critical error will cause the command to quit with the exit code `1`.


Prometheus Support
------------------

You can get metrics and healthcheck status in Prometheus format.
It relies on the official Prometheus client package for Python, prometheus-client_.

To enable this format, set ``USE_PROMETHEUS`` in the ``HEALTH_CHECK`` setting:

.. code:: python

HEALTH_CHECK = {
'USE_PROMETHEUS': True,
}

To get metrics in Prometheus format:

.. code::

$ curl -v -X GET "http://example.com/ht/" -H "Accept: text/plain"

< HTTP/1.1 200 OK
< Date: Tue, 16 Nov 2021 10:44:40 GMT
< Server: WSGIServer/0.2 CPython/3.8.2
< Content-Type: text/plain; version=0.0.4; charset=utf-8
< Expires: Tue, 16 Nov 2021 10:44:40 GMT
< Cache-Control: max-age=0, no-cache, no-store, must-revalidate, private
< X-Frame-Options: DENY
< Content-Length: 2050
< X-Content-Type-Options: nosniff
< Referrer-Policy: same-origin

# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 51643.0
python_gc_objects_collected_total{generation="1"} 4985.0
python_gc_objects_collected_total{generation="2"} 209.0
# HELP python_gc_objects_uncollectable_total Uncollectable object found during GC
# TYPE python_gc_objects_uncollectable_total counter
python_gc_objects_uncollectable_total{generation="0"} 0.0
python_gc_objects_uncollectable_total{generation="1"} 0.0
python_gc_objects_uncollectable_total{generation="2"} 0.0
# HELP python_gc_collections_total Number of times this generation was collected
# TYPE python_gc_collections_total counter
python_gc_collections_total{generation="0"} 441.0
python_gc_collections_total{generation="1"} 40.0
python_gc_collections_total{generation="2"} 3.0
# HELP python_info Python platform information
# TYPE python_info gauge
python_info{implementation="CPython",major="3",minor="8",patchlevel="2",version="3.8.2"} 1.0
# HELP app_disk_usage_status Check status of DiskUsage
# TYPE app_disk_usage_status gauge
app_disk_usage_status 1.0
# HELP app_memory_usage_status Check status of MemoryUsage
# TYPE app_memory_usage_status gauge
app_memory_usage_status 1.0
# HELP app_rabbit_m_q_health_check_status Check status of RabbitMQHealthCheck
# TYPE app_rabbit_m_q_health_check_status gauge
app_rabbit_m_q_health_check_status 1.0
# HELP app_database_backend_status Check status of DatabaseBackend
# TYPE app_database_backend_status gauge
app_database_backend_status 1.0


Alternatively, request the Prometheus format explicitly with the ``format`` query parameter:

.. code::

$ curl -v -X GET "http://www.example.com/ht/?format=prometheus"


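Metrics with the `python_` prefix are the default metrics provided by prometheus-client_. Metrics with the `app_` prefix are custom metrics produced by the health checks; the prefix can be changed with the ``PROMETHEUS_METRIC_NAMESPACE`` key of ``HEALTH_CHECK`` (default: ``app``).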

To serve the metrics under a different base path, change the URL pattern in the urls.py of your Django application:

.. code:: python

urlpatterns = [
# ...
url(r'^metrics/', include('health_check.urls')),
]

Other resources
---------------

Expand All @@ -309,3 +390,4 @@ Other resources
.. _Pingdom: https://www.pingdom.com/
.. _django-watchman: https://github.com/mwarkentin/django-watchman
.. _weblog: https://www.vincit.fi/en/blog/deploying-django-to-elastic-beanstalk-with-https-redirects-and-functional-health-checks/
.. _prometheus-client: https://github.com/prometheus/client_python
57 changes: 49 additions & 8 deletions health_check/backends.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import logging
import re
from timeit import default_timer as timer

from django.utils.translation import gettext_lazy as _ # noqa: N812
from prometheus_client import REGISTRY, Gauge, Metric

from health_check.exceptions import HealthCheckException
from health_check.conf import HEALTH_CHECK
from health_check.exceptions import (
BadPrometheusMetricType, HealthCheckException
)

logger = logging.getLogger('health-check')

Expand All @@ -19,13 +24,38 @@ class BaseHealthCheckBackend:

def __init__(self):
    """Start with an empty error list and remember the Prometheus switch.

    ``use_prometheus`` is read once, at construction time, from
    ``HEALTH_CHECK['USE_PROMETHEUS']``.
    """
    # The two assignments are independent of each other.
    self.use_prometheus = HEALTH_CHECK['USE_PROMETHEUS']
    self.errors = []

@property
def class_name_to_snake_case(self):
    """Return the class name in lower case with ``_`` before inner capitals.

    Every uppercase letter except the very first character is preceded by
    an underscore, so runs of capitals are split letter by letter
    (``RabbitMQHealthCheck`` -> ``rabbit_m_q_health_check``).
    """
    class_name = self.__class__.__name__
    # Zero-width match: any position followed by a capital, except the start.
    with_underscores = re.sub(r'(?<!^)(?=[A-Z])', '_', class_name)
    return with_underscores.lower()

@property
def prometheus_status_metric_name(self) -> str:
    """Return the un-prefixed name of this backend's status metric.

    The name is the snake-cased class name followed by ``_status``.
    """
    base_name = self.class_name_to_snake_case
    return f"{base_name}_status"

@property
def prometheus_status_metric(self) -> Gauge:
    """Return the gauge that carries this backend's status.

    The metric is obtained through ``get_prometheus_metric`` using the
    name from ``prometheus_status_metric_name``.
    """
    metric_name = self.prometheus_status_metric_name
    return self.get_prometheus_metric(Gauge, metric_name)

def get_prometheus_metric(self, metric_type: type, name, description: str = None):
    """Return the metric registered under *name*, creating it if necessary.

    Args:
        metric_type: Metric class to instantiate when no metric with the
            resulting name exists yet (e.g. ``Gauge``).
        name: Metric name without the namespace; it is prefixed with
            ``HEALTH_CHECK['PROMETHEUS_METRIC_NAMESPACE']`` and ``_``.
        description: Help text for a newly created metric. Defaults to
            ``"Check status of <identifier>"``.

    Returns:
        The collector already registered in the default ``REGISTRY`` under
        the prefixed name, or a new ``metric_type`` instance.

    Raises:
        BadPrometheusMetricType: If *metric_type* is not a prometheus_client
            metric class.
    """
    # ``Gauge``, ``Counter`` and the other instrumentable metrics derive from
    # ``MetricWrapperBase``; ``prometheus_client.Metric`` is the unrelated
    # sample-family class, so it cannot be used for this check.
    from prometheus_client.metrics import MetricWrapperBase

    is_metric_class = isinstance(metric_type, type) and issubclass(metric_type, MetricWrapperBase)
    if not is_metric_class:
        raise BadPrometheusMetricType(
            f"Metric type '{metric_type}' isn't subclass of prometheus_client.metrics.MetricWrapperBase"
        )

    name = f"{HEALTH_CHECK['PROMETHEUS_METRIC_NAMESPACE']}_{name}"
    description = description or f"Check status of {self.identifier()}"

    # Look the name up directly instead of asking the registry to collect
    # every metric just to test membership.
    existing = REGISTRY._names_to_collectors.get(name)
    if existing is not None:
        return existing

    return metric_type(name, description)

def check_status(self):
    """Perform the actual check; concrete backends must override this.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

def run_check(self):
def run_check(self, external_errors=None):
start = timer()
self.errors = []
self.errors = external_errors or []
try:
self.check_status()
except HealthCheckException as e:
Expand All @@ -35,6 +65,8 @@ def run_check(self):
raise
finally:
self.time_taken = timer() - start
if self.use_prometheus:
self.prometheus_status_metric.set(0 if len(self.errors) else 1)

def add_error(self, error, cause=None):
if isinstance(error, HealthCheckException):
Expand All @@ -45,15 +77,18 @@ def add_error(self, error, cause=None):
else:
msg = _("unknown error")
error = HealthCheckException(msg)
if isinstance(cause, BaseException):
logger.exception(str(error))
else:
logger.error(str(error))

if HEALTH_CHECK['VERBOSE']:
if isinstance(cause, BaseException):
logger.exception(str(error))
else:
logger.error(str(error))

self.errors.append(error)

def pretty_status(self):
if self.errors:
return "\n".join(str(e) for e in self.errors)
return "; ".join(str(e) for e in self.errors)
return _('working')

@property
Expand All @@ -62,3 +97,9 @@ def status(self):

def identifier(self):
    """Return the name under which this backend is reported: its class name."""
    backend_class = self.__class__
    return backend_class.__name__


class CommonHealth(BaseHealthCheckBackend):
    """Backend that performs no probe of its own.

    ``check_status`` never records an error, so any errors this backend
    reports have to be supplied from outside.
    """

    def check_status(self):
        """Do nothing; this backend has nothing to check by itself."""
        return None
3 changes: 3 additions & 0 deletions health_check/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
# Fill in defaults; setdefault leaves any value that is already present untouched.
HEALTH_CHECK.setdefault('DISK_USAGE_MAX', 90)
HEALTH_CHECK.setdefault('MEMORY_MIN', 100)
HEALTH_CHECK.setdefault('WARNINGS_AS_ERRORS', True)
# Off by default; backends read this flag to decide whether to update their Prometheus gauges.
HEALTH_CHECK.setdefault('USE_PROMETHEUS', False)
# Prefix that backends join with "_" in front of every metric name they build.
HEALTH_CHECK.setdefault('PROMETHEUS_METRIC_NAMESPACE', 'app')
# When falsy, add_error() records errors without logging them.
HEALTH_CHECK.setdefault('VERBOSE', True)
4 changes: 2 additions & 2 deletions health_check/contrib/celery/apps.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import warnings

from celery import current_app
from django.apps import AppConfig
from django.conf import settings
import warnings


from health_check.plugins import plugin_dir

Expand Down
2 changes: 1 addition & 1 deletion health_check/contrib/celery/backends.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from celery.exceptions import TaskRevokedError, TimeoutError
from django.conf import settings

from health_check.backends import BaseHealthCheckBackend
Expand All @@ -6,7 +7,6 @@
)

from .tasks import add
from celery.exceptions import TaskRevokedError, TimeoutError


class CeleryHealthCheck(BaseHealthCheckBackend):
Expand Down
14 changes: 11 additions & 3 deletions health_check/contrib/celery_ping/backends.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from celery.app import default_app as app
from django.conf import settings
from prometheus_client import Gauge

from health_check.backends import BaseHealthCheckBackend
from health_check.exceptions import ServiceUnavailable
Expand All @@ -8,6 +9,10 @@
class CeleryPingHealthCheck(BaseHealthCheckBackend):
CORRECT_PING_RESPONSE = {"ok": "pong"}

@property
def prometheus_active_queues_amount(self) -> Gauge:
    """Return the gauge used to publish the number of active Celery queues."""
    metric_name = "celery_active_queues"
    help_text = "Amount of active queues"
    return self.get_prometheus_metric(Gauge, metric_name, help_text)

def check_status(self):
timeout = getattr(settings, "HEALTHCHECK_CELERY_PING_TIMEOUT", 1)

Expand Down Expand Up @@ -47,13 +52,15 @@ def _check_ping_result(self, ping_result):
active_workers.append(worker)

if not self.errors:
self._check_active_queues(active_workers)
amount_queues = self._check_active_queues(active_workers)
if self.use_prometheus:
self.prometheus_active_queues_amount.set(amount_queues)

def _check_active_queues(self, active_workers):
def _check_active_queues(self, active_workers) -> int:
defined_queues = app.conf.CELERY_QUEUES

if not defined_queues:
return
return 0

defined_queues = set([queue.name for queue in defined_queues])
active_queues = set()
Expand All @@ -65,3 +72,4 @@ def _check_active_queues(self, active_workers):
self.add_error(
ServiceUnavailable(f"No worker for Celery task queue {queue}"),
)
return len(active_queues)
4 changes: 4 additions & 0 deletions health_check/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,7 @@ class ServiceUnavailable(HealthCheckException):

class ServiceReturnedUnexpectedResult(HealthCheckException):
message_type = _("unexpected result")


class BadPrometheusMetricType(Exception):
    """Raised when a requested Prometheus metric type is not acceptable."""
25 changes: 23 additions & 2 deletions health_check/management/commands/health_check.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,34 @@
import sys
import json

from django.core.management.base import BaseCommand

from health_check.conf import HEALTH_CHECK
from health_check.mixins import CheckMixin


class Command(CheckMixin, BaseCommand):
help = "Run health checks and exit 0 if everything went well."

def add_arguments(self, parser):
    """Register the command's optional boolean switches on *parser*.

    Both ``--json-output`` and ``--verbose`` are ``store_true`` flags and
    therefore default to ``False`` when omitted.
    """
    for flag in ("--json-output", "--verbose"):
        parser.add_argument(flag, action="store_true", required=False)

def handle(self, *args, **options):
    """Run all checks, print the report and exit with 1 if any check failed.

    Without ``--verbose`` the ``VERBOSE`` entry of ``HEALTH_CHECK`` is
    switched off before the checks run. The report is written as JSON when
    ``--json-output`` is given and as plain text otherwise.
    """
    verbose = options["verbose"]
    if not verbose:
        HEALTH_CHECK["VERBOSE"] = False

    # perform all checks
    errors = self.errors

    render = self.json_output if options["json_output"] else self.plain_output
    render()

    if errors:
        sys.exit(1)

def plain_output(self):
for plugin in self.plugins:
style_func = self.style.SUCCESS if not plugin.errors else self.style.ERROR
self.stdout.write(
Expand All @@ -21,5 +38,9 @@ def handle(self, *args, **options):
)
)

if errors:
sys.exit(1)
def json_output(self):
    """Write one JSON object mapping each plugin identifier to its status."""
    statuses = {}
    for plugin in self.plugins:
        key = plugin.identifier()
        statuses[key] = plugin.status
    payload = json.dumps(statuses)
    self.stdout.write(payload)
15 changes: 15 additions & 0 deletions health_check/mixins.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import copy
from concurrent.futures import ThreadPoolExecutor

from health_check.backends import CommonHealth
from health_check.conf import HEALTH_CHECK
from health_check.exceptions import ServiceWarning
from health_check.plugins import plugin_dir
Expand All @@ -9,6 +10,7 @@
class CheckMixin:
_errors = None
_plugins = None
_common_health = None

@property
def errors(self):
Expand All @@ -25,6 +27,13 @@ def plugins(self):
), key=lambda plugin: plugin.identifier())
return self._plugins

@property
def common_health(self):
    """Return the shared ``CommonHealth`` backend, creating it on first use.

    On creation the backend is also appended to the plugin list, so it is
    reported together with the regular plugins.
    """
    if self._common_health is None:
        aggregate = CommonHealth()
        # Use the ``plugins`` property, not ``_plugins``: the class-level
        # ``_plugins`` is None until the property has been read, and
        # appending to None would raise AttributeError.
        self.plugins.append(aggregate)
        # Store the instance only after the append succeeded, so a failure
        # cannot leave a backend that was never added to the list.
        self._common_health = aggregate
    return self._common_health

def run_check(self):
errors = []

Expand All @@ -36,8 +45,12 @@ def _run(plugin):
from django.db import connections
connections.close_all()

error_plugins = []
with ThreadPoolExecutor(max_workers=len(self.plugins) or 1) as executor:
for plugin in executor.map(_run, self.plugins):
if plugin.errors:
error_plugins.append(plugin.identifier())

if plugin.critical_service:
if not HEALTH_CHECK['WARNINGS_AS_ERRORS']:
errors.extend(
Expand All @@ -47,4 +60,6 @@ def _run(plugin):
else:
errors.extend(plugin.errors)

self.common_health.run_check(external_errors=error_plugins)

return errors
Loading