From 74fc9445612a81de088146ea61b6ea07f44ca91c Mon Sep 17 00:00:00 2001
From: fewensa <37804932+fewensa@users.noreply.github.com>
Date: Mon, 29 Jul 2024 10:17:26 +0800
Subject: [PATCH] Monitor feature (#8)

* Add monitor

* monitor script

* monitor

* check ansible

* Test monitor
---
 .editorconfig                                 |   2 +-
 .github/workflows/check-ansible.yml           |  58 +++++++
 .github/workflows/monitor.yml                 |  32 ++++
 ansible/inventories/hosts.ini                 |  23 +++
 .../playbooks/monitor/group_vars/monitor.yml  |   7 +
 ansible/playbooks/monitor/playbook.yml        |   4 +
 ansible/roles/monitor/defaults/main.yml       |  24 +++
 ansible/roles/monitor/tasks/main.yml          |  19 +++
 ansible/roles/monitor/templates/crawl.sh      | 168 ++++++++++++++++++
 9 files changed, 336 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/check-ansible.yml
 create mode 100644 .github/workflows/monitor.yml
 create mode 100644 ansible/playbooks/monitor/group_vars/monitor.yml
 create mode 100644 ansible/playbooks/monitor/playbook.yml
 create mode 100644 ansible/roles/monitor/defaults/main.yml
 create mode 100644 ansible/roles/monitor/tasks/main.yml
 create mode 100755 ansible/roles/monitor/templates/crawl.sh

diff --git a/.editorconfig b/.editorconfig
index 24df5de..4de45b4 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -5,7 +5,7 @@ charset = utf-8
 end_of_line = lf
 insert_final_newline = true
 indent_style = space
-indent_size = 4
+indent_size = 2
 trim_trailing_whitespace = true
 
 [*.{yml,yaml}]
diff --git a/.github/workflows/check-ansible.yml b/.github/workflows/check-ansible.yml
new file mode 100644
index 0000000..02d8ad0
--- /dev/null
+++ b/.github/workflows/check-ansible.yml
@@ -0,0 +1,58 @@
+# Pull-request gate: runs the essentials playbook when its inputs changed,
+# then dry-runs (--check --diff) each playbook listed in the matrix.
+name: Check
+
+on:
+  pull_request:
+    branches: [main]
+
+# Exported so playbook variables can read it via lookup('env', ...).
+env:
+  SLACK_INCOMING_WEBHOOK_URL: ${{ secrets.SLACK_INCOMING_WEBHOOK_URL }}
+
+jobs:
+  deploy-essentials:
+    name: Deploy essentials
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Verify essentials file changed
+        uses: tj-actions/changed-files@v37.4.0
+        id: changed_files
+        with:
+          files: |
+            ansible/inventories/hosts.ini
+            ansible/playbooks/_essentials/*
+            ansible/playbooks/essentials.yml
+
+      - name: Deploy essentials
+        id: deploy-essentials
+        if: steps.changed_files.outputs.any_changed == 'true'
+        uses: dawidd6/action-ansible-playbook@v2
+        with:
+          directory: ansible
+          playbook: playbooks/essentials.yml
+          key: "${{ secrets.SSH_PRIVATE_KEY }}"
+          options: --user ansible
+
+  check-playbooks:
+    name: Check playbook
+    runs-on: ubuntu-latest
+    needs: [deploy-essentials]
+    strategy:
+      matrix:
+        playbook:
+          - snapshots_crab
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Run playbook
+        uses: dawidd6/action-ansible-playbook@v2
+        with:
+          directory: ansible
+          playbook: playbooks/${{ matrix.playbook }}/playbook.yml
+          key: "${{ secrets.SSH_PRIVATE_KEY }}"
+          options: --user ansible --verbose --diff --check
diff --git a/.github/workflows/monitor.yml b/.github/workflows/monitor.yml
new file mode 100644
index 0000000..7666d50
--- /dev/null
+++ b/.github/workflows/monitor.yml
@@ -0,0 +1,32 @@
+# Scheduled health probe: every two hours (or on manual dispatch) runs the
+# monitor playbook against the hosts in the [monitor] inventory group.
+name: Monitor
+
+on:
+  schedule:
+    - cron: "0 */2 * * *"
+  workflow_dispatch:
+
+# playbooks/monitor/group_vars/monitor.yml reads the webhook from the runner's
+# environment; without this export the rendered script has no Slack URL.
+env:
+  SLACK_INCOMING_WEBHOOK_URL: ${{ secrets.SLACK_INCOMING_WEBHOOK_URL }}
+
+jobs:
+  monitor:
+    name: monitor
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        playbook:
+          - monitor
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Run playbook
+        uses: dawidd6/action-ansible-playbook@v2
+        with:
+          directory: ansible
+          playbook: playbooks/${{ matrix.playbook }}/playbook.yml
+          key: "${{ secrets.SSH_PRIVATE_KEY }}"
+          options: --user ansible --verbose --diff
diff --git a/ansible/inventories/hosts.ini b/ansible/inventories/hosts.ini
index f69dd59..d651e74 100644
--- a/ansible/inventories/hosts.ini
+++ b/ansible/inventories/hosts.ini
@@ -4,3 +4,26 @@ g1.crab2.darwinia.network
 [darwinia_nodes]
 g1.darwinia2.darwinia.network
 
+[monitor]
+g1.crab2.darwinia.network
+c1.crab2.darwinia.network
+g1.darwinia2.darwinia.network
+c1.darwinia2.darwinia.network
+g1.testnets.darwinia.network
+g2.testnets.darwinia.network
+g3.testnets.darwinia.network
+g1.generic.darwinia.network
+g2.generic.darwinia.network
+
+c1.darwinia-rpc.itering.io
+c2.darwinia-rpc.itering.io
+c1.crab-rpc.itering.io
+c2.crab-rpc.itering.io
+
+c1.collator.itering.io
+c2.collator.itering.io
+c3.collator.itering.io
+c4.collator.itering.io
+
+c5.collator.itering.io
+c6.collator.itering.io
diff --git a/ansible/playbooks/monitor/group_vars/monitor.yml b/ansible/playbooks/monitor/group_vars/monitor.yml
new file mode 100644
index 0000000..4ba9821
--- /dev/null
+++ b/ansible/playbooks/monitor/group_vars/monitor.yml
@@ -0,0 +1,7 @@
+
+# Overrides for the monitor role. The webhook is read from the environment of
+# the machine that runs ansible-playbook.
+# NOTE(review): this redefines the whole `monitor` mapping; the role defaults
+# survive only if hash_behaviour=merge is configured - confirm in ansible.cfg.
+monitor:
+  notify_slack_webhook: "{{ lookup('env', 'SLACK_INCOMING_WEBHOOK_URL') }}"
diff --git a/ansible/playbooks/monitor/playbook.yml b/ansible/playbooks/monitor/playbook.yml
new file mode 100644
index 0000000..fdefde7
--- /dev/null
+++ b/ansible/playbooks/monitor/playbook.yml
@@ -0,0 +1,4 @@
+# Applies the monitor role to every host in the [monitor] inventory group.
+- hosts: monitor
+  roles:
+    - monitor
diff --git a/ansible/roles/monitor/defaults/main.yml b/ansible/roles/monitor/defaults/main.yml
new file mode 100644
index 0000000..59ed131
--- /dev/null
+++ b/ansible/roles/monitor/defaults/main.yml
@@ -0,0 +1,24 @@
+
+# Defaults for the monitor role; the values are rendered into
+# templates/crawl.sh.
+monitor:
+  # Directory on the target host that receives the rendered script.
+  workdir: /tmp/monitor
+  # Slack incoming-webhook URL; must be set for alerts to be delivered.
+  notify_slack_webhook: ''
+  notify_slack_channel: 'darwinia-alert-notification'
+  # Name shown in alerts; empty falls back to the output of `hostname`.
+  server_name: ''
+  # Device-name prefixes matched against the first column of `df`.
+  check_disks:
+    - /dev/sda
+    - /dev/sdb
+  # Percent thresholds: above p2 the metric is reported, above p1 the alert
+  # is also tagged with priority P1.
+  alert_thread_cpu_p2: 90
+  alert_thread_cpu_p1: 98
+  alert_thread_ram_p2: 90
+  alert_thread_ram_p1: 98
+  alert_thread_disk_p2: 90
+  alert_thread_disk_p1: 98
+
diff --git a/ansible/roles/monitor/tasks/main.yml b/ansible/roles/monitor/tasks/main.yml
new file mode 100644
index 0000000..489dfc4
--- /dev/null
+++ b/ansible/roles/monitor/tasks/main.yml
@@ -0,0 +1,19 @@
+
+- name: Creates workdir
+  file:
+    path: "{{ monitor.workdir }}"
+    state: directory
+
+# The rendered script embeds the Slack webhook URL, so keep it owner-only;
+# it is executed through `bash`, so no execute bit is needed.
+- name: Generate scripts file
+  template:
+    src: crawl.sh
+    dest: "{{ monitor.workdir }}/crawl.sh"
+    mode: "0600"
+
+- name: Run monitor script
+  command: bash {{ monitor.workdir }}/crawl.sh
+  # The probe does not modify the host, so never report "changed".
+  changed_when: false
+
diff --git a/ansible/roles/monitor/templates/crawl.sh
b/ansible/roles/monitor/templates/crawl.sh
new file mode 100755
index 0000000..b149b52
--- /dev/null
+++ b/ansible/roles/monitor/templates/crawl.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+#
+# Host health probe rendered by the `monitor` Ansible role.
+#
+# Samples CPU, RAM, disk and TCP figures, prints a one-line summary to stdout
+# and posts a Slack message when a P2/P1 threshold is exceeded. The
+# double-brace placeholders below are substituted from the `monitor` variable
+# when Ansible's `template` module renders this file.
+#
+# Needs on the target host: top, free, df, ss, awk, jq, curl.
+
+# Parse tool output in the C locale so labels and decimal points are stable.
+export LC_ALL=C
+
+DISKS_TO_MONITOR=({{ monitor.check_disks | join(' ') }})
+
+SERVER_NAME='{{ monitor.server_name }}'
+NOTIFY_SLACK_WEBHOOK='{{ monitor.notify_slack_webhook }}'
+NOTIFY_SLACK_CHANNEL='{{ monitor.notify_slack_channel }}'
+
+# Percent thresholds: above P2 a metric is reported, above P1 the alert is
+# additionally tagged with priority P1.
+ALERT_THREAD_CPU_P2={{ monitor.alert_thread_cpu_p2 }}
+ALERT_THREAD_CPU_P1={{ monitor.alert_thread_cpu_p1 }}
+ALERT_THREAD_RAM_P2={{ monitor.alert_thread_ram_p2 }}
+ALERT_THREAD_RAM_P1={{ monitor.alert_thread_ram_p1 }}
+ALERT_THREAD_DISK_P2={{ monitor.alert_thread_disk_p2 }}
+ALERT_THREAD_DISK_P1={{ monitor.alert_thread_disk_p1 }}
+
+# Print the current local time; used to prefix log lines.
+timestamp() {
+  date +"%Y-%m-%d %H:%M:%S"
+}
+
+# Return non-zero (after naming each one on stderr) if any of the given
+# commands is missing from PATH.
+require_commands() {
+  local cmd missing=0
+  for cmd in "$@"; do
+    if ! command -v "$cmd" >/dev/null 2>&1; then
+      echo "$(timestamp) required command not found: ${cmd}" >&2
+      missing=1
+    fi
+  done
+  return "$missing"
+}
+
+# Print user + system CPU percent from one batch-mode `top` sample.
+# Separators are blanked before the line is split because top omits the space
+# after ':' or ',' once a value is five characters wide ("%Cpu(s):100.0 us"),
+# which would otherwise shift positional fields on a saturated host.
+cpu_usage() {
+  top -bn1 | awk '
+    /Cpu\(s\)/ {
+      gsub(/[:,%]/, " ")
+      for (i = 2; i <= NF; i++) {
+        if ($i == "us" || $i == "sy") {
+          total += $(i - 1)
+        }
+      }
+      print total + 0
+      exit
+    }
+  '
+}
+
+# Print used RAM as a percentage of total RAM, from the `Mem:` row of `free`.
+memory_usage() {
+  free | awk '/^Mem:/ { print $3 / $2 * 100.0 }'
+}
+
+# Print "<filesystem> <use-percent>" (no % sign), one line per mounted
+# filesystem whose device name starts with an entry of DISKS_TO_MONITOR, so
+# every mounted partition of a configured disk is evaluated on its own.
+disk_usage() {
+  local disk
+  for disk in "${DISKS_TO_MONITOR[@]}"; do
+    df -P | awk -v prefix="$disk" '
+      index($1, prefix) == 1 {
+        sub(/%/, "", $5)
+        print $1, $5
+      }
+    '
+  done
+}
+
+# Print the second field of the `ss -s` summary line that mentions `estab`.
+# NOTE(review): on iproute2 that field looks like the total TCP socket count,
+# not only established connections - confirm this is the intended figure.
+request_count() {
+  ss -s | grep 'estab' | awk '{print $2}'
+}
+
+# Succeed when numeric $1 is strictly greater than numeric $2. An empty or
+# non-numeric value compares as 0, so a failed probe never raises an alert.
+exceeds() {
+  awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
+}
+
+# Print the JSON array $1 with one Slack field pair appended: a mrkdwn label
+# ($2) followed by its plain-text value ($3).
+append_field() {
+  jq -c --arg label "$2" --arg value "$3" \
+    '. + [{"type":"mrkdwn","text":$label},{"type":"plain_text","text":$value}]' \
+    <<<"$1"
+}
+
+# Print a Slack section block with heading $1 and the fields array $2.
+section_block() {
+  jq -c -n --arg warning "$1" --argjson msg "$2" \
+    '{ "type": "section", "text": {"type": "mrkdwn", "text": $warning}, "fields": $msg }'
+}
+
+# Print the Slack fields array for CPU/RAM: "[]" when neither metric is above
+# its P2 threshold, otherwise the offending metrics plus the TCP count, led by
+# a Priority field when a P1 threshold is exceeded.
+generate_alert_message() {
+  local cpu ram tcp
+  local alert_message="[]"
+  local priority='P2'
+
+  cpu=$(cpu_usage)
+  ram=$(memory_usage)
+  tcp=$(request_count)
+
+  if exceeds "$cpu" "$ALERT_THREAD_CPU_P1" || exceeds "$ram" "$ALERT_THREAD_RAM_P1"; then
+    priority='P1'
+  fi
+  if [[ "P1" == "$priority" ]]; then
+    alert_message=$(append_field "$alert_message" "*Priority*" "$priority")
+  fi
+
+  if exceeds "$cpu" "$ALERT_THREAD_CPU_P2"; then
+    alert_message=$(append_field "$alert_message" "*CPU*" "${cpu}%")
+  fi
+
+  if exceeds "$ram" "$ALERT_THREAD_RAM_P2"; then
+    alert_message=$(append_field "$alert_message" "*RAM*" "${ram}%")
+  fi
+
+  if [[ "$alert_message" != "[]" ]]; then
+    alert_message=$(append_field "$alert_message" "*TCP*" "${tcp:-n/a}")
+  fi
+
+  echo "$alert_message"
+}
+
+# Print the Slack fields array for disks: one pair per filesystem above the P2
+# threshold, followed by a Priority field when any of them is above P1.
+generate_disk_alert_message() {
+  local alert_message="[]"
+  local priority='P2'
+  local disk usage
+
+  while read -r disk usage; do
+    if [[ -z "$usage" ]]; then
+      continue
+    fi
+
+    if exceeds "$usage" "$ALERT_THREAD_DISK_P1"; then
+      priority='P1'
+    fi
+    if exceeds "$usage" "$ALERT_THREAD_DISK_P2"; then
+      alert_message=$(append_field "$alert_message" "*DISK* ($disk)" "${usage}%")
+    fi
+  done < <(disk_usage)
+
+  if [[ "P1" == "$priority" ]]; then
+    alert_message=$(append_field "$alert_message" "*Priority*" "$priority")
+  fi
+
+  echo "$alert_message"
+}
+
+# Gather both alert groups and, when at least one is non-empty, post a single
+# Slack message containing one section block per group.
+check_and_send_alert() {
+  local alert_message disk_alert_message host block data
+  local blocks="[]"
+
+  alert_message=$(generate_alert_message)
+  disk_alert_message=$(generate_disk_alert_message)
+  # Prefer the configured display name; fall back to the machine hostname.
+  host=${SERVER_NAME:-$(hostname)}
+
+  if [[ "$alert_message" != "[]" ]]; then
+    block=$(section_block "[*WARNING*]: New server alert > $host" "$alert_message")
+    blocks=$(jq -c --argjson block "$block" '. + [$block]' <<<"$blocks")
+  fi
+
+  if [[ "$disk_alert_message" != "[]" ]]; then
+    block=$(section_block "[*WARNING*]: New disk alert > $host" "$disk_alert_message")
+    blocks=$(jq -c --argjson block "$block" '. + [$block]' <<<"$blocks")
+  fi
+
+  if [[ "$blocks" != "[]" ]]; then
+    data=$(
+      jq -n \
+        --arg channel "$NOTIFY_SLACK_CHANNEL" \
+        --argjson blocks "$blocks" \
+        '{
+          "username": "ServerBot",
+          "icon_emoji": ":loudspeaker:",
+          "channel": $channel,
+          "blocks": $blocks
+        }'
+    )
+
+    send_alert "$data"
+  fi
+}
+
+# POST the JSON payload $1 to the Slack incoming webhook.
+# Fails with an explicit message when no webhook is configured, instead of
+# letting curl run without a URL.
+send_alert() {
+  local message=$1
+
+  if [[ -z "$NOTIFY_SLACK_WEBHOOK" ]]; then
+    echo "$(timestamp) Slack webhook is not configured; alert not sent: ${message}" >&2
+    return 1
+  fi
+
+  curl --silent --show-error -X POST \
+    -H "Content-type: application/json" \
+    --data "$message" \
+    "$NOTIFY_SLACK_WEBHOOK"
+}
+
+# Entry point: log a one-line snapshot, then evaluate thresholds and notify.
+main() {
+  local cpu ram disk requests
+
+  require_commands awk jq curl || exit 1
+
+  cpu=$(cpu_usage)
+  ram=$(memory_usage)
+  # Fold the per-filesystem lines into "dev=NN% dev=NN%" for a one-line log.
+  disk=$(disk_usage | awk '{ printf "%s%s=%s%%", sep, $1, $2; sep = " " }')
+  requests=$(request_count)
+  echo "$(timestamp) CPU: ${cpu}% RAM: ${ram}% Disk: ${disk:-n/a} Requests: ${requests}"
+
+  check_and_send_alert
+}
+
+main