Netdata: add alerts for cpu, ram, disks
This commit is contained in:
@@ -7,7 +7,9 @@ services:
|
||||
ports:
|
||||
- "127.0.0.1:{{ apprise_external_port }}:8000"
|
||||
networks:
|
||||
- "web_proxy_network"
|
||||
web_proxy_network:
|
||||
aliases:
|
||||
- "apprise"
|
||||
volumes:
|
||||
- "{{ config_dir }}:/config"
|
||||
environment:
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
# Resource alerts for a low-spec home server.
|
||||
# Overrides stock alerts where thresholds differ; baseline RAM use is ~80%, so stock 80/90% would fire constantly.
|
||||
|
||||
# RAM: warn at >92%, crit at >95% — by then less than ~200 MB free.
# Utilization is used / (used + cached + free + buffers): cache and buffers
# count towards the total but not towards "used".
# Recovery notifications are delayed by 5m (x1.5 per flap, capped at 1h).
    alarm: ram_in_use
       on: system.ram
    class: Utilization
     type: System
component: Memory
     calc: $used * 100 / ($used + $cached + $free + $buffers)
    units: %
    every: 10s
     warn: $this > 92
     crit: $this > 95
    delay: down 5m multiplier 1.5 max 1h
  summary: System memory utilization
     info: System memory utilization (used / total, excluding reclaimable cache)
       to: sysadmin
|
||||
|
||||
# CPU: replace stock 10min_cpu_usage with two windowed alerts.
# NOTE(review): confirm that the installed Netdata version honours an
# `enabled` line in a health entity. The documented ways to turn an alert off
# are `to: silent` on the entity or `enabled alarms = !10min_cpu_usage *` in
# the [health] section of netdata.conf — TODO verify.
    alarm: 10min_cpu_usage
       on: system.cpu
  enabled: no
|
||||
|
||||
# Warning-only alert: average of the listed system.cpu dimensions over the
# last 30 minutes, re-evaluated every minute. No crit threshold is defined here.
    alarm: cpu_warn_30m
       on: system.cpu
    class: Utilization
     type: System
component: CPU
   lookup: average -30m unaligned of user,system,softirq,irq,guest,guest_nice,nice
    units: %
    every: 1m
     warn: $this > 80
    delay: down 30m multiplier 1.5 max 2h
  summary: Sustained CPU load (30m avg)
     info: Average CPU utilization over the last 30 minutes
       to: sysadmin
|
||||
|
||||
# Critical-only alert: average of the listed system.cpu dimensions over the
# last 15 minutes, re-evaluated every minute. No warn threshold is defined here.
    alarm: cpu_crit_15m
       on: system.cpu
    class: Utilization
     type: System
component: CPU
   lookup: average -15m unaligned of user,system,softirq,irq,guest,guest_nice,nice
    units: %
    every: 1m
     crit: $this > 95
    delay: down 30m multiplier 1.5 max 2h
  summary: High CPU load (15m avg)
     info: Average CPU utilization over the last 15 minutes
       to: sysadmin
|
||||
|
||||
# Disk: warn at >75%, crit at >90% on every mounted filesystem.
# Declared as a template so it is applied to each disk.space chart;
# utilization is used / (avail + used), evaluated once a minute.
 template: disk_space_usage
       on: disk.space
    class: Utilization
     type: System
component: Disk
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
     warn: $this > 75
     crit: $this > 90
    delay: down 15m multiplier 1.5 max 1h
  summary: Disk space utilization
     info: Disk space utilization on ${label:mount_point}
       to: sysadmin
|
||||
@@ -0,0 +1,45 @@
|
||||
# Override stock health_alarm_notify.conf — route every alert to apprise.
|
||||
# Stock conf is sourced first; this only sets what differs.
|
||||
|
||||
# Delivery methods: turn e-mail off and hand every alert to custom_sender().
SEND_EMAIL="NO"
SEND_CUSTOM="YES"

# Recipient used when a role has no recipient of its own. The value is passed
# to custom_sender(), which uses it as the apprise notification key.
DEFAULT_RECIPIENT_CUSTOM="server"

# Per-role recipients: every role is routed to the same apprise key.
role_recipients_custom[sysadmin]="server"
role_recipients_custom[domainadmin]="server"
role_recipients_custom[dba]="server"
role_recipients_custom[webmaster]="server"
role_recipients_custom[proxyadmin]="server"

# The silent role is given no recipient.
role_recipients_custom[silent]=""
|
||||
|
||||
# custom_sender RECIPIENTS
#
# Forward the current alert to the apprise API, one POST per recipient.
#
# Arguments:
#   $1  space-separated list of recipients; each one is used as the apprise
#       notification key in http://apprise:8000/notify/<key>/
#
# Reads the alert variables provided by the caller (status, name, host,
# status_message, alarm, chart, value, units, info, raised_for) and relies on
# the caller's docurl, info and error helpers.
#
# Returns 0 when every recipient answered HTTP 200, 1 otherwise.
custom_sender() {
    local recipients="${1}"

    # Map the alert status onto an apprise notification type; anything that is
    # not CRITICAL, WARNING or CLEAR is reported as "info".
    local notif_type="info"
    case "${status}" in
        CRITICAL) notif_type="failure" ;;
        WARNING)  notif_type="warning" ;;
        CLEAR)    notif_type="success" ;;
    esac

    local title="[${status}] ${name} on ${host}"
    # The continuation lines are part of the message text, so they stay
    # unindented.
    local body="${status_message}: ${alarm}
Chart: ${chart}
Value: ${value} ${units}
Info: ${info}
Raised for: ${raised_for}"

    local recipient httpcode failures=0

    # Deliberately unquoted: the list is split on whitespace so that each
    # recipient gets its own request instead of one URL containing spaces.
    for recipient in ${recipients}; do
        # The output of docurl is treated as the HTTP status code.
        httpcode=$(docurl -X POST \
            --data-urlencode "title=${title}" \
            --data-urlencode "body=${body}" \
            --data-urlencode "type=${notif_type}" \
            "http://apprise:8000/notify/${recipient}/")

        if [ "${httpcode}" = "200" ]; then
            info "sent custom notification for ${name} on ${host}"
        else
            error "failed to send notification for ${name} on ${host} (HTTP ${httpcode})"
            failures=$((failures + 1))
        fi
    done

    [ "${failures}" -eq 0 ]
}
|
||||
Reference in New Issue
Block a user