Netdata: add alerts for cpu, ram, disks
This commit is contained in:
@@ -7,7 +7,9 @@ services:
|
||||
ports:
|
||||
- "127.0.0.1:{{ apprise_external_port }}:8000"
|
||||
networks:
|
||||
- "web_proxy_network"
|
||||
web_proxy_network:
|
||||
aliases:
|
||||
- "apprise"
|
||||
volumes:
|
||||
- "{{ config_dir }}:/config"
|
||||
environment:
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
# Resource alerts for a low-spec home server.
|
||||
# Overrides stock alerts where thresholds differ; baseline RAM use is ~80%, so stock 80/90% would fire constantly.
|
||||
|
||||
# RAM: warn at >92%, crit at >95% — by then less than ~200 MB free.
# Same alarm name as the stock RAM alert, with thresholds raised for this host.
    alarm: ram_in_use
       on: system.ram
    class: Utilization
     type: System
component: Memory
# Used memory as a percentage of used + cached + free + buffers.
     calc: $used * 100 / ($used + $cached + $free + $buffers)
    units: %
    every: 10s
     warn: $this > 92
     crit: $this > 95
# Recovery notifications are held back 5 minutes (x1.5 per repeat, capped at 1 hour).
    delay: down 5m multiplier 1.5 max 1h
  summary: System memory utilization
     info: System memory utilization (used / total, excluding reclaimable cache)
       to: sysadmin
|
||||
|
||||
# CPU: replace stock 10min_cpu_usage with two windowed alerts
# (cpu_warn_30m and cpu_crit_15m, defined below).
# NOTE(review): confirm that this Netdata version honours an `enabled:` line
# inside a health entity. If it does not, the stock alert stays active and has
# to be silenced another way (e.g. `to: silent`, or the `enabled alarms`
# option in netdata.conf).
  alarm: 10min_cpu_usage
     on: system.cpu
enabled: no
|
||||
|
||||
# Warning-only CPU alert: fires when the 30-minute average of the listed
# system.cpu dimensions exceeds 80 %. No crit line is set here; critical load
# is handled by cpu_crit_15m.
    alarm: cpu_warn_30m
       on: system.cpu
    class: Utilization
     type: System
component: CPU
   lookup: average -30m unaligned of user,system,softirq,irq,guest,guest_nice,nice
    units: %
    every: 1m
     warn: $this > 80
# Recovery notifications are held back 30 minutes (x1.5 per repeat, capped at 2 hours).
    delay: down 30m multiplier 1.5 max 2h
  summary: Sustained CPU load (30m avg)
     info: Average CPU utilization over the last 30 minutes
       to: sysadmin
|
||||
|
||||
# Critical-only CPU alert: fires when the 15-minute average of the listed
# system.cpu dimensions exceeds 95 %. No warn line is set here; the warning
# level is handled by cpu_warn_30m.
    alarm: cpu_crit_15m
       on: system.cpu
    class: Utilization
     type: System
component: CPU
   lookup: average -15m unaligned of user,system,softirq,irq,guest,guest_nice,nice
    units: %
    every: 1m
     crit: $this > 95
# Recovery notifications are held back 30 minutes (x1.5 per repeat, capped at 2 hours).
    delay: down 30m multiplier 1.5 max 2h
  summary: High CPU load (15m avg)
     info: Average CPU utilization over the last 15 minutes
       to: sysadmin
|
||||
|
||||
# Disk: warn at >75%, crit at >90% on every mounted filesystem.
# Declared as a template, so it attaches to each disk.space chart.
 template: disk_space_usage
       on: disk.space
    class: Utilization
     type: System
component: Disk
# Used space as a percentage of used + available.
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
     warn: $this > 75
     crit: $this > 90
# Recovery notifications are held back 15 minutes (x1.5 per repeat, capped at 1 hour).
    delay: down 15m multiplier 1.5 max 1h
  summary: Disk space utilization
     info: Disk space utilization on ${label:mount_point}
       to: sysadmin
|
||||
@@ -0,0 +1,45 @@
|
||||
# Override stock health_alarm_notify.conf — route every alert to apprise.
|
||||
# Stock conf is sourced first; this only sets what differs.
|
||||
|
||||
# E-mail delivery is switched off; only the custom sender is enabled.
SEND_EMAIL="NO"
SEND_CUSTOM="YES"

# Default recipient for the custom notification method.
DEFAULT_RECIPIENT_CUSTOM="server"

# Every alerting role is mapped to the same "server" recipient. The loop
# variable is private to this file and unset afterwards so that nothing leaks
# into the script that sources this configuration.
for _apprise_role in sysadmin domainadmin dba webmaster proxyadmin; do
    role_recipients_custom[${_apprise_role}]="server"
done
unset _apprise_role

# The silent role gets an empty recipient list.
role_recipients_custom[silent]=""
|
||||
|
||||
# Forward one Netdata alert to the Apprise API.
#
# Arguments:
#   $1 - recipient list: one or more keys separated by whitespace. Each key is
#        appended to http://apprise:8000/notify/ and receives its own POST.
# Reads (set by the calling notification script): status, name, host,
#   status_message, alarm, chart, value, units, info, raised_for.
# Returns:
#   0 when at least one recipient was contacted and every one answered
#   HTTP 200; 1 otherwise (including an empty recipient list).
custom_sender() {
    # Split the recipient list without pathname expansion.
    local -a recipients
    read -r -a recipients <<< "${1}"

    # Map the alert status onto an Apprise notification type; any status not
    # listed here is reported as "info".
    local notif_type="info"
    case "${status}" in
        CRITICAL) notif_type="failure" ;;
        WARNING)  notif_type="warning" ;;
        CLEAR)    notif_type="success" ;;
    esac

    local title="[${status}] ${name} on ${host}"
    # Continuation lines start at column 0 on purpose: anything before them
    # would become part of the message text.
    local body="${status_message}: ${alarm}
Chart: ${chart}
Value: ${value} ${units}
Info: ${info}
Raised for: ${raised_for}"

    local recipient apprise_url httpcode
    local delivered=0 failed=0
    for recipient in "${recipients[@]}"; do
        apprise_url="http://apprise:8000/notify/${recipient}/"

        # Assigned separately from `local` so the declaration does not mask
        # the exit status of the command substitution.
        httpcode=$(docurl -X POST \
            --data-urlencode "title=${title}" \
            --data-urlencode "body=${body}" \
            --data-urlencode "type=${notif_type}" \
            "${apprise_url}")

        if [ "${httpcode}" = "200" ]; then
            info "sent custom notification for ${name} on ${host}"
            delivered=$((delivered + 1))
        else
            error "failed to send notification for ${name} on ${host} (HTTP ${httpcode})"
            failed=$((failed + 1))
        fi
    done

    # Success only when something was sent and nothing failed.
    [ "${delivered}" -gt 0 ] && [ "${failed}" -eq 0 ]
}
|
||||
@@ -13,6 +13,7 @@
|
||||
base_dir: "{{ (application_dir, app_name) | path_join }}"
|
||||
config_dir: "{{ (base_dir, 'config') | path_join }}"
|
||||
config_go_d_dir: "{{ (config_dir, 'go.d') | path_join }}"
|
||||
config_health_d_dir: "{{ (config_dir, 'health.d') | path_join }}"
|
||||
data_dir: "{{ (base_dir, 'data') | path_join }}"
|
||||
|
||||
tasks:
|
||||
@@ -37,6 +38,7 @@
|
||||
- "{{ data_dir }}"
|
||||
- "{{ config_dir }}"
|
||||
- "{{ config_go_d_dir }}"
|
||||
- "{{ config_health_d_dir }}"
|
||||
|
||||
- name: "Copy netdata config file"
|
||||
ansible.builtin.template:
|
||||
@@ -75,6 +77,43 @@
|
||||
loop: "{{ go_d_existing_files.files }}"
|
||||
when: (item.path | basename) not in (go_d_source_files.files | map(attribute='path') | map('basename') | list)
|
||||
|
||||
# List the health.d source files on localhost; the registered result drives
# both the templating loop and the stale-file cleanup.
- name: "Find all health.d config files"
  delegate_to: localhost
  register: health_d_source_files
  ansible.builtin.find:
    file_type: file
    paths: "files/{{ app_name }}/health.d"
|
||||
|
||||
# Render every source file found above into the health.d config directory,
# keeping its original file name.
- name: "Template all health.d config files"
  loop: "{{ health_d_source_files.files }}"
  ansible.builtin.template:
    dest: "{{ config_health_d_dir }}/{{ item.path | basename }}"
    src: "{{ item.path }}"
    mode: "0640"
    owner: "{{ app_user }}"
    group: "{{ app_user }}"
|
||||
|
||||
# List what is currently deployed in the health.d config directory so that
# files no longer present in the source tree can be removed.
- name: "Find existing health.d config files on server"
  register: health_d_existing_files
  ansible.builtin.find:
    file_type: file
    paths: "{{ config_health_d_dir }}"
|
||||
|
||||
# Delete deployed files whose base name has no counterpart among the source
# files, so the directory mirrors the repository.
- name: "Remove health.d config files that don't exist in source"
  loop: "{{ health_d_existing_files.files }}"
  when: (item.path | basename) not in (health_d_source_files.files | map(attribute='path') | map('basename') | list)
  ansible.builtin.file:
    state: absent
    path: "{{ item.path }}"
|
||||
|
||||
# Install the notification override as health_alarm_notify.conf in the config
# directory. The source goes through the template module, so any Jinja syntax
# in it is rendered.
- name: "Copy health alarm notify config"
  ansible.builtin.template:
    dest: "{{ config_dir }}/health_alarm_notify.conf"
    src: "files/{{ app_name }}/health_alarm_notify.template.conf"
    mode: "0640"
    owner: "{{ app_user }}"
    group: "{{ app_user }}"
|
||||
|
||||
- name: "Grab docker group id."
|
||||
ansible.builtin.shell:
|
||||
cmd: |
|
||||
|
||||
Reference in New Issue
Block a user