Merge pull request #57 from scalableminds/container-exporter
add container metrics exporter
robert-oleynik authored Aug 20, 2024
2 parents 774f706 + 15c1420 commit 65538fd
Showing 4 changed files with 193 additions and 3 deletions.
3 changes: 2 additions & 1 deletion metrics-pusher/Dockerfile
@@ -2,8 +2,9 @@ FROM debian:bookworm-slim

WORKDIR /app

RUN apt-get update && apt-get upgrade --yes python3-requests
RUN apt-get update && apt-get install --yes python3-requests python3-docker python3-prometheus-client

COPY ./metrics-pusher.py .
COPY ./monitor.py .

CMD [ "python3", "/app/metrics-pusher.py" ]
27 changes: 25 additions & 2 deletions metrics-pusher/README.md
@@ -5,17 +5,22 @@
Run using docker:

```sh
docker run -v /var/run/docker.sock:/var/run/docker.sock \
docker run \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /proc:/host/proc:ro \
-e INSTANCE_NAME="foo" \
-e PUSHGATEWAY_URL="https://<domain>/path/to/pushgateway" \
-e SCRAPE_INTERVAL=60 \
-e AUTH_USER="<user>" \
-e AUTH_PASSWORD="<password>" \
-e ENDPOINTS="http://<domain1>/metrics,http://<domain2>/metrics" \
-e HOST_PROC_PATH="/host/proc" \
scalableminds/metrics-pusher
```

This will scrape all specified endpoints.
> *Note:* If the monitor fails with a permission-denied error and the disk read/write metrics stay at 0, add `--cap-add CAP_SYS_PTRACE` so the exporter can read other processes' `/proc/<pid>/io`.
This will scrape all specified endpoints as well as all running containers via the internal [Container Exporter](#container-exporter).

## Configuration

@@ -29,4 +34,22 @@ Environment Variables:
| `AUTH_USER` | User for Basic Auth |
| `AUTH_PASSWORD` | Password for Basic Auth |
| `ENDPOINTS` | Comma separated list of URLs. Each endpoint will be scraped once per interval. Allows at most one URL per hostname (e.g. `http://node_exporter:9100/metrics`) |
| `HOST_PROC_PATH` | Path to the mounted `/proc` directory |
| `DOCKER_HOST` | Path to docker socket. Defaults to `unix:///var/run/docker.sock` |
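
The at-most-one-URL-per-hostname rule for `ENDPOINTS` can be checked up front. A minimal sketch of such a check, assuming `ENDPOINTS` is set as above (this snippet is illustrative and not part of the pusher itself):

```python
import os
import urllib.parse

endpoints = os.environ["ENDPOINTS"].split(",")
hostnames = [urllib.parse.urlparse(url).hostname for url in endpoints]
if len(hostnames) != len(set(hostnames)):
    raise ValueError("ENDPOINTS must contain at most one URL per hostname")
```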

## Container Exporter

In addition to scraping multiple endpoints, the script also collects per-container performance metrics and pushes them under the job `<INSTANCE_NAME>.container`.
The following metrics are generated:

- `system_cpu_total` All system jiffies spent, including idle.
- `container_cpu_user` Number of jiffies a container spent in user mode.
- `container_cpu_kernel` Number of jiffies a container spent in kernel mode.
- `container_memory_used` Number of memory pages allocated to this container.
- `container_number_processes` Number of processes running inside a container.
- `container_number_threads` Number of threads created by those processes.
- `container_disk_read` Number of bytes read from disk (the `rchar` field of `/proc/<pid>/io`).
- `container_disk_write` Number of bytes written to disk (the `wchar` field of `/proc/<pid>/io`).

All metrics are aggregated over all processes running inside a container.
When a container is restarted, these metrics reset to 0.
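
Because the jiffy counters are cumulative until a restart, CPU utilization has to be derived from the difference between two scrapes. A minimal sketch of that computation (illustrative only; the parameters stand for two successive samples of the exported values):

```python
def cpu_utilization(prev_cpu, curr_cpu, prev_total, curr_total):
    """Fraction of total system CPU time a container used between two samples.

    prev_cpu/curr_cpu: container_cpu_user + container_cpu_kernel per sample.
    prev_total/curr_total: system_cpu_total per sample.
    """
    delta_cpu = curr_cpu - prev_cpu
    delta_total = curr_total - prev_total
    if delta_cpu < 0 or delta_total <= 0:
        # Counter went backwards: the container (or host) restarted.
        return None
    return delta_cpu / delta_total
```
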
44 changes: 44 additions & 0 deletions metrics-pusher/metrics-pusher.py
@@ -6,6 +6,7 @@
import os
import traceback
import urllib.parse
import prometheus_client


def push_metrics(
@@ -51,6 +52,39 @@ def push_metrics(
            )
        except Exception as e:
            traceback.print_exception(e)
        except KeyboardInterrupt:
            break


def push_container_metrics(
    name, pushgateway_url, proc_path, interval, username, password
):
    print("(container) starting exporter")
    import monitor

    # push_to_gateway accepts a custom handler; wrap prometheus_client's
    # basic_auth_handler to attach the configured credentials to each push.
    def auth_handler(url, method, timeout, headers, data):
        from prometheus_client.exposition import basic_auth_handler

        return basic_auth_handler(
            url, method, timeout, headers, data, username, password
        )

    while True:
        # Sleep until the next multiple of `interval` so pushes stay aligned
        # to wall-clock boundaries.
        time_until_next_fetch = interval - (time.time() % interval)
        time.sleep(time_until_next_fetch)

        try:
            monitor.scrape(proc_path)
            prometheus_client.push_to_gateway(
                pushgateway_url,
                job=f"{name}.container",
                handler=auth_handler,
                registry=monitor.registry,
            )
        except Exception as e:
            traceback.print_exception(e)
        except KeyboardInterrupt:
            break


if __name__ == "__main__":
@@ -60,6 +94,7 @@ def push_metrics(
    auth_user = os.environ.get("AUTH_USER")
    auth_pass = os.environ.get("AUTH_PASSWORD")
    urls = os.environ.get("ENDPOINTS")
    proc_path = os.environ.get("HOST_PROC_PATH", "/host/proc")

    if name is None or name == "":
        print("No INSTANCE_NAME provided")
@@ -100,6 +135,15 @@ def push_metrics(
        p.start()
        processes.append(p)

    # The container exporter runs in the main process and blocks in its own
    # scrape loop while the per-endpoint pusher processes run alongside it.
    push_container_metrics(
        name,
        pushgateway_url,
        proc_path,
        scrape_interval,
        auth_user,
        auth_pass,
    )

    for p in processes:
        p.join()
        print("warning:", p.name, "closed")
122 changes: 122 additions & 0 deletions metrics-pusher/monitor.py
@@ -0,0 +1,122 @@
import docker
import traceback

from prometheus_client import Gauge, CollectorRegistry

registry = CollectorRegistry()
cpu_total = Gauge(
    "system_cpu_total",
    "Total Number of Jiffies",
    registry=registry,
)
used_memory = Gauge(
    "container_memory_used",
    "Number of Used Memory Pages",
    ["container"],
    registry=registry,
)
number_processes = Gauge(
    "container_number_processes",
    "Number of Processes",
    ["container"],
    registry=registry,
)
number_threads = Gauge(
    "container_number_threads",
    "Number of Threads",
    ["container"],
    registry=registry,
)
cpu_user = Gauge(
    "container_cpu_user",
    "Number of Jiffies Spent in User Mode",
    ["container"],
    registry=registry,
)
cpu_kernel = Gauge(
    "container_cpu_kernel",
    "Number of Jiffies Spent in Kernel Mode",
    ["container"],
    registry=registry,
)
disk_write = Gauge(
    "container_disk_write",
    "Number of bytes written to disk",
    ["container"],
    registry=registry,
)
disk_read = Gauge(
    "container_disk_read",
    "Number of bytes read from disk",
    ["container"],
    registry=registry,
)

d = docker.from_env()


def scrape(proc_path):
    # The first line of /proc/stat ("cpu  user nice system idle ...") holds
    # the aggregated jiffy counters; summing every field includes idle time.
    f = open(f"{proc_path}/stat", "r")
    total_system_jiffies = sum(int(v) for v in f.readline()[:-1].split()[1:])
    cpu_total.set(total_system_jiffies)
    f.close()

    for container in d.containers.list():
        name = container.name
        # container.top() defaults to `ps -ef` columns (UID, PID, PPID, ...);
        # index 1 is the PID.
        pids = [proc[1] for proc in container.top()["Processes"]]

        _used_mem = 0
        _cpu_user = 0
        _cpu_kernel = 0
        _number_threads = 0
        _number_processes = len(pids)
        _disk_read = 0
        _disk_write = 0

        for pid in pids:
            try:
                # https://www.kernel.org/doc/html/latest/filesystems/proc.html#id10
                # Indices are relative to the field after the ") <state> "
                # prefix stripped below, so utime (field 14 in the docs)
                # lands at index 10.
                utime = 10
                stime = utime + 1
                cutime = utime + 2
                cstime = utime + 3
                num_threads = 16
                rss = 20
                f = open(f"{proc_path}/{pid}/stat", "r")
                stats = [int(s) for s in f.read().split(")")[1][3:].split(" ")]
                f.close()

                _used_mem += stats[rss]
                _cpu_user += stats[utime] + stats[cutime]
                _cpu_kernel += stats[stime] + stats[cstime]
                _number_threads += stats[num_threads]

                # https://www.kernel.org/doc/html/latest/filesystems/proc.html#proc-pid-io-display-the-io-accounting-fields
                # The first two lines of /proc/<pid>/io are "rchar: N" and
                # "wchar: N" (bytes passed through read/write syscalls).
                f = open(f"{proc_path}/{pid}/io", "r")
                rchar = int(f.readline().split(" ")[1])
                wchar = int(f.readline().split(" ")[1])
                f.close()

                _disk_read += rchar
                _disk_write += wchar
            except Exception as e:
                # The process may have exited between container.top() and the
                # /proc reads; drop it from the process count.
                _number_processes -= 1
                traceback.print_exception(e)
        used_memory.labels(container=name).set(_used_mem)
        number_threads.labels(container=name).set(_number_threads)
        number_processes.labels(container=name).set(_number_processes)
        cpu_user.labels(container=name).set(_cpu_user)
        cpu_kernel.labels(container=name).set(_cpu_kernel)
        disk_read.labels(container=name).set(_disk_read)
        disk_write.labels(container=name).set(_disk_write)

        print(
            f"(container) {container.name:<40}",
            _cpu_user,
            _cpu_kernel,
            _used_mem,
            _number_threads,
            _number_processes,
            _disk_read,
            _disk_write,
        )
