Netdata: add alerts for cpu, ram, disks
Linting / YAML Lint (push) Successful in 9s
Linting / Ansible Lint (push) Failing after 31s

This commit is contained in:
2026-05-01 13:58:51 +03:00
parent 472c7a984f
commit d25a28c611
4 changed files with 154 additions and 1 deletions
@@ -0,0 +1,67 @@
# Resource alerts for a low-spec home server.
# Overrides stock alerts where thresholds differ; baseline RAM use is ~80%, so stock 80/90% would fire constantly.
# RAM: warn at >92%, crit at >95% — by then less than ~200 MB free.
alarm: ram_in_use
on: system.ram
class: Utilization
type: System
component: Memory
# Percentage of memory in use: $used over the sum of used, cached, free and
# buffers, so cached and buffers enlarge the total but are not counted as used.
calc: $used * 100 / ($used + $cached + $free + $buffers)
units: %
every: 10s
# Hysteresis: the value is an instantaneous sample taken every 10s, so with a
# single fixed threshold the alert would flip state whenever usage hovers
# around the limit. The alert still raises at >92% (warning) and >95%
# (critical), but once raised it only clears after usage falls back below
# 90% and 93% respectively.
warn: $this > (($status >= $WARNING) ? (90) : (92))
crit: $this > (($status == $CRITICAL) ? (93) : (95))
delay: down 5m multiplier 1.5 max 1h
summary: System memory utilization
info: System memory utilization (used / total, excluding reclaimable cache)
to: sysadmin
# CPU: replace stock 10min_cpu_usage with two windowed alerts.
# Neutralise the stock 10min_cpu_usage alert by shadowing it with a rule of the
# same name that never raises and never notifies.
# The previous stanza used `enabled: no`, which does not appear to be a
# recognised health configuration line; with no lookup, calc, warn or crit the
# rule would be discarded and the stock alert would stay active.
# This version has a lookup so it loads, has no warn/crit so it cannot leave
# the CLEAR state, and routes to `silent` so nothing is ever sent.
# NOTE(review): relies on user configuration being loaded before stock
# configuration, so that this rule wins over the stock one of the same name.
# Confirm after a health reload that only this rule is attached to system.cpu.
# Alternative: set `enabled alarms = !10min_cpu_usage *` in the [health]
# section of netdata.conf.
alarm: 10min_cpu_usage
on: system.cpu
class: Utilization
type: System
component: CPU
lookup: average -10m unaligned of user,system,softirq,irq,guest
units: %
every: 1m
summary: System CPU utilization (stock alert, silenced)
info: Placeholder that shadows the stock 10-minute CPU alert; it has no thresholds
to: silent
alarm: cpu_warn_30m
on: system.cpu

# Classification metadata.
type: System
component: CPU
class: Utilization

# Human-readable description of the alert.
summary: Sustained CPU load (30m avg)
info: Average CPU utilization over the last 30 minutes

# Data source: 30-minute average over the listed system.cpu dimensions,
# re-evaluated once per minute.
units: %
every: 1m
lookup: average -30m unaligned of user,system,softirq,irq,guest,guest_nice,nice

# Warning-only rule: there is deliberately no crit line in this stanza.
warn: $this > 80

# Notification handling.
delay: down 30m multiplier 1.5 max 2h
to: sysadmin
alarm: cpu_crit_15m
on: system.cpu

# Classification metadata.
type: System
component: CPU
class: Utilization

# Human-readable description of the alert.
summary: High CPU load (15m avg)
info: Average CPU utilization over the last 15 minutes

# Data source: 15-minute average over the listed system.cpu dimensions,
# re-evaluated once per minute.
units: %
every: 1m
lookup: average -15m unaligned of user,system,softirq,irq,guest,guest_nice,nice

# Critical-only rule: there is deliberately no warn line in this stanza.
crit: $this > 95

# Notification handling.
delay: down 30m multiplier 1.5 max 2h
to: sysadmin
# Disk: warn at >75%, crit at >90% on every mounted filesystem.
template: disk_space_usage
on: disk.space

# Classification metadata.
type: System
component: Disk
class: Utilization

# Human-readable description; the info text interpolates the mount_point label.
summary: Disk space utilization
info: Disk space utilization on ${label:mount_point}

# Value: used space as a percentage of used + available, evaluated each minute.
units: %
every: 1m
calc: $used * 100 / ($avail + $used)

# Thresholds: warning above 75%, critical above 90%.
warn: $this > 75
crit: $this > 90

# Notification handling.
delay: down 15m multiplier 1.5 max 1h
to: sysadmin