Netdata: add alerts for cpu, ram, disks
Linting / YAML Lint (push) Successful in 9s
Linting / Ansible Lint (push) Failing after 31s

This commit is contained in:
2026-05-01 13:58:51 +03:00
parent 472c7a984f
commit d25a28c611
4 changed files with 154 additions and 1 deletions
+3 -1
View File
@@ -7,7 +7,9 @@ services:
ports:
- "127.0.0.1:{{ apprise_external_port }}:8000"
networks:
- "web_proxy_network"
web_proxy_network:
aliases:
- "apprise"
volumes:
- "{{ config_dir }}:/config"
environment:
@@ -0,0 +1,67 @@
# Resource alerts for a low-spec home server.
# Overrides stock alerts where thresholds differ; baseline RAM use is ~80%, so stock 80/90% would fire constantly.
# RAM: warning above 92 %, critical above 95 % — by then less than ~200 MB free.
# The percentage is used memory over used + cached + free + buffers, re-evaluated
# every 10 seconds; notifications go to the sysadmin role.
    alarm: ram_in_use
       on: system.ram
    class: Utilization
     type: System
component: Memory
     calc: $used * 100 / ($used + $cached + $free + $buffers)
    units: %
    every: 10s
     warn: $this > 92
     crit: $this > 95
    delay: down 5m multiplier 1.5 max 1h
  summary: System memory utilization
     info: System memory utilization (used / total, excluding reclaimable cache)
       to: sysadmin
# CPU: replace stock 10min_cpu_usage with two windowed alerts.
# The stock alert is neutralized by redefining it under the same name with a
# lookup but no warn/crit expression, so it can never leave the CLEAR state,
# and by routing it to the silent recipient.
# Why not "enabled: no": that is not a health configuration line, and a stanza
# with no lookup, calc, warn or crit is discarded as useless, which leaves the
# stock alert active.
# NOTE(review): confirm on the deployed Netdata version that this user-level
# definition takes precedence over the stock one. The alternative is
# "enabled alarms = !10min_cpu_usage *" in the [health] section of netdata.conf.
alarm: 10min_cpu_usage
on: system.cpu
lookup: average -10m unaligned of user,system,softirq,irq,guest
units: %
every: 1m
to: silent
# Warning tier: the 30-minute average of the listed system.cpu dimensions,
# checked once a minute, raises a warning above 80 %.
    alarm: cpu_warn_30m
       on: system.cpu
    class: Utilization
     type: System
component: CPU
   lookup: average -30m unaligned of user,system,softirq,irq,guest,guest_nice,nice
    units: %
    every: 1m
     warn: $this > 80
    delay: down 30m multiplier 1.5 max 2h
  summary: Sustained CPU load (30m avg)
     info: Average CPU utilization over the last 30 minutes
       to: sysadmin
# Critical tier: the 15-minute average of the listed system.cpu dimensions,
# checked once a minute, goes critical above 95 %.
    alarm: cpu_crit_15m
       on: system.cpu
    class: Utilization
     type: System
component: CPU
   lookup: average -15m unaligned of user,system,softirq,irq,guest,guest_nice,nice
    units: %
    every: 1m
     crit: $this > 95
    delay: down 30m multiplier 1.5 max 2h
  summary: High CPU load (15m avg)
     info: Average CPU utilization over the last 15 minutes
       to: sysadmin
# Disk: warning above 75 %, critical above 90 % on every mounted filesystem.
# Utilization is used space over available + used, checked once a minute.
 template: disk_space_usage
       on: disk.space
    class: Utilization
     type: System
component: Disk
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
     warn: $this > 75
     crit: $this > 90
    delay: down 15m multiplier 1.5 max 1h
  summary: Disk space utilization
     info: Disk space utilization on ${label:mount_point}
       to: sysadmin
@@ -0,0 +1,45 @@
# Override stock health_alarm_notify.conf — route every alert to apprise.
# Stock conf is sourced first; this only sets what differs.
# E-mail delivery is switched off; the custom method (custom_sender) is switched on.
SEND_EMAIL="NO"
SEND_CUSTOM="YES"
# Fallback recipient for the custom method. NOTE(review): presumably this value
# reaches custom_sender() as its argument and ends up as the key in the Apprise
# notify URL — confirm against the calling script.
DEFAULT_RECIPIENT_CUSTOM="server"
# Every role listed here is mapped to the same "server" recipient.
role_recipients_custom[sysadmin]="server"
role_recipients_custom[domainadmin]="server"
role_recipients_custom[dba]="server"
role_recipients_custom[webmaster]="server"
role_recipients_custom[proxyadmin]="server"
# The silent role gets an empty recipient list.
role_recipients_custom[silent]=""
# custom_sender RECIPIENT...
#
# Forward the current alert to the Apprise API, once per recipient. Each
# recipient is used as the configuration key in the notify URL
# (http://apprise:8000/notify/<key>/).
#
# Arguments:
#   $@  recipient keys. A single argument holding a space-separated list is
#       split into individual recipients as well, so several recipients no
#       longer collapse into one malformed URL.
# Reads (expected to be set by the calling script):
#   status, status_message, name, host, alarm, chart, value, units, info,
#   raised_for
# Uses the caller-provided helpers docurl, info and error.
# Returns:
#   0 when at least one recipient was given and every POST answered HTTP 200,
#   1 otherwise.
custom_sender() {
    # Map the alert status onto an Apprise notification type; anything that is
    # not CRITICAL / WARNING / CLEAR stays "info".
    local notif_type="info"
    case "${status}" in
        CRITICAL) notif_type="failure" ;;
        WARNING)  notif_type="warning" ;;
        CLEAR)    notif_type="success" ;;
    esac

    local title="[${status}] ${name} on ${host}"
    local body
    printf -v body '%s: %s\nChart: %s\nValue: %s %s\nInfo: %s\nRaised for: %s' \
        "${status_message}" "${alarm}" "${chart}" "${value}" "${units}" \
        "${info}" "${raised_for}"

    # Split all arguments on whitespace without triggering pathname expansion.
    local -a recipients=()
    read -r -a recipients <<< "${*}"

    local recipient httpcode
    local sent=0 failed=0
    for recipient in "${recipients[@]}"; do
        # docurl prints the HTTP status code of the request on stdout.
        httpcode=$(docurl -X POST \
            --data-urlencode "title=${title}" \
            --data-urlencode "body=${body}" \
            --data-urlencode "type=${notif_type}" \
            "http://apprise:8000/notify/${recipient}/")
        if [ "${httpcode}" = "200" ]; then
            info "sent custom notification for ${name} on ${host}"
            sent=$((sent + 1))
        else
            error "failed to send notification for ${name} on ${host} (HTTP ${httpcode})"
            failed=$((failed + 1))
        fi
    done

    if [ "${sent}" -eq 0 ] && [ "${failed}" -eq 0 ]; then
        error "no recipient given for ${name} on ${host}"
        return 1
    fi

    [ "${failed}" -eq 0 ]
}
+39
View File
@@ -13,6 +13,7 @@
base_dir: "{{ (application_dir, app_name) | path_join }}"
config_dir: "{{ (base_dir, 'config') | path_join }}"
config_go_d_dir: "{{ (config_dir, 'go.d') | path_join }}"
config_health_d_dir: "{{ (config_dir, 'health.d') | path_join }}"
data_dir: "{{ (base_dir, 'data') | path_join }}"
tasks:
@@ -37,6 +38,7 @@
- "{{ data_dir }}"
- "{{ config_dir }}"
- "{{ config_go_d_dir }}"
- "{{ config_health_d_dir }}"
- name: "Copy netdata config file"
ansible.builtin.template:
@@ -75,6 +77,43 @@
loop: "{{ go_d_existing_files.files }}"
when: (item.path | basename) not in (go_d_source_files.files | map(attribute='path') | map('basename') | list)
# Enumerate the health.d sources; the lookup is delegated to localhost and the
# result is kept in health_d_source_files for the tasks that follow.
- name: "Find all health.d config files"
  delegate_to: localhost
  ansible.builtin.find:
    file_type: file
    paths: "files/{{ app_name }}/health.d"
  register: health_d_source_files
# Render each discovered source file into the health.d config directory under
# its own basename, owned by the application user and not world-readable.
- name: "Template all health.d config files"
  loop: "{{ health_d_source_files.files }}"
  ansible.builtin.template:
    src: "{{ item.path }}"
    dest: "{{ config_health_d_dir }}/{{ item.path | basename }}"
    mode: "0640"
    owner: "{{ app_user }}"
    group: "{{ app_user }}"
# List what is currently deployed in the health.d config directory; the result
# is kept in health_d_existing_files.
- name: "Find existing health.d config files on server"
  register: health_d_existing_files
  ansible.builtin.find:
    file_type: file
    paths: "{{ config_health_d_dir }}"
# Prune: delete every deployed file whose basename has no counterpart among
# the discovered source files.
- name: "Remove health.d config files that don't exist in source"
  vars:
    health_d_source_names: "{{ health_d_source_files.files | map(attribute='path') | map('basename') | list }}"
  loop: "{{ health_d_existing_files.files }}"
  when: (item.path | basename) not in health_d_source_names
  ansible.builtin.file:
    state: absent
    path: "{{ item.path }}"
# Render the notification override into the config directory as
# health_alarm_notify.conf, owned by the application user.
- name: "Copy health alarm notify config"
  ansible.builtin.template:
    dest: "{{ config_dir }}/health_alarm_notify.conf"
    src: "files/{{ app_name }}/health_alarm_notify.template.conf"
    mode: "0640"
    owner: "{{ app_user }}"
    group: "{{ app_user }}"
- name: "Grab docker group id."
ansible.builtin.shell:
cmd: |