Netdata: add alerts for cpu, ram, disks
This commit is contained in:
@@ -7,7 +7,9 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "127.0.0.1:{{ apprise_external_port }}:8000"
|
- "127.0.0.1:{{ apprise_external_port }}:8000"
|
||||||
networks:
|
networks:
|
||||||
- "web_proxy_network"
|
web_proxy_network:
|
||||||
|
aliases:
|
||||||
|
- "apprise"
|
||||||
volumes:
|
volumes:
|
||||||
- "{{ config_dir }}:/config"
|
- "{{ config_dir }}:/config"
|
||||||
environment:
|
environment:
|
||||||
|
|||||||
@@ -0,0 +1,67 @@
|
|||||||
|
# Resource alerts for a low-spec home server.
|
||||||
|
# Overrides stock alerts where thresholds differ; baseline RAM use is ~80%, so stock 80/90% would fire constantly.
|
||||||
|
|
||||||
|
# RAM utilization alert on the system.ram chart.
# Raises WARNING above 92% and CRITICAL above 95% — by then less than ~200 MB free.
# The percentage is used / (used + cached + free + buffers), evaluated every 10 seconds.
    alarm: ram_in_use
       on: system.ram
    class: Utilization
     type: System
component: Memory
     calc: $used * 100 / ($used + $cached + $free + $buffers)
    units: %
    every: 10s
     warn: $this > 92
     crit: $this > 95
    delay: down 5m multiplier 1.5 max 1h
  summary: System memory utilization
     info: System memory utilization (used / total, excluding reclaimable cache)
       to: sysadmin
|
||||||
|
|
||||||
|
# CPU: replace stock 10min_cpu_usage with two windowed alerts.
# Netdata health entities have no "enabled" line, and an entity without a
# lookup or calc is discarded when the config is loaded, so it cannot shadow
# the stock definition (confirm against the installed Netdata version).
# This shadow keeps the stock name on system.cpu with a valid lookup, defines
# no warn/crit expression so it never changes status, and routes to "silent"
# so nothing is ever sent for it.
    alarm: 10min_cpu_usage
       on: system.cpu
    class: Utilization
     type: System
component: CPU
   lookup: average -10m unaligned of user,system,softirq,irq,guest
    units: %
    every: 1m
       to: silent
|
||||||
|
|
||||||
|
# Sustained-load warning: 30-minute average of the busy CPU dimensions on
# system.cpu, re-evaluated once a minute. WARNING only (no crit expression).
    alarm: cpu_warn_30m
       on: system.cpu
    class: Utilization
     type: System
component: CPU
   lookup: average -30m unaligned of user,system,softirq,irq,guest,guest_nice,nice
    units: %
    every: 1m
     warn: $this > 80
    delay: down 30m multiplier 1.5 max 2h
  summary: Sustained CPU load (30m avg)
     info: Average CPU utilization over the last 30 minutes
       to: sysadmin
|
||||||
|
|
||||||
|
# High-load critical: 15-minute average of the busy CPU dimensions on
# system.cpu, re-evaluated once a minute. CRITICAL only (no warn expression).
    alarm: cpu_crit_15m
       on: system.cpu
    class: Utilization
     type: System
component: CPU
   lookup: average -15m unaligned of user,system,softirq,irq,guest,guest_nice,nice
    units: %
    every: 1m
     crit: $this > 95
    delay: down 30m multiplier 1.5 max 2h
  summary: High CPU load (15m avg)
     info: Average CPU utilization over the last 15 minutes
       to: sysadmin
|
||||||
|
|
||||||
|
# Disk space template applied to every disk.space chart.
# Raises WARNING above 75% and CRITICAL above 90% of used / (avail + used),
# checked once a minute.
 template: disk_space_usage
       on: disk.space
    class: Utilization
     type: System
component: Disk
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
     warn: $this > 75
     crit: $this > 90
    delay: down 15m multiplier 1.5 max 1h
  summary: Disk space utilization
     info: Disk space utilization on ${label:mount_point}
       to: sysadmin
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
# Override stock health_alarm_notify.conf — route every alert to apprise.
# Stock conf is sourced first; this only sets what differs.

# E-mail is switched off; the custom sender is the only enabled method here.
SEND_EMAIL="NO"
SEND_CUSTOM="YES"
DEFAULT_RECIPIENT_CUSTOM="server"

# Every role used by the alerts resolves to the single "server" recipient.
role_recipients_custom[sysadmin]="server"
role_recipients_custom[domainadmin]="server"
role_recipients_custom[dba]="server"
role_recipients_custom[webmaster]="server"
role_recipients_custom[proxyadmin]="server"

# An empty recipient list is not "nobody": the notify script falls back to
# DEFAULT_RECIPIENT_CUSTOM when a role has no recipients. The keyword
# "disabled" is what actually suppresses delivery for a role.
role_recipients_custom[silent]="disabled"
|
||||||
|
|
||||||
|
# custom_sender RECIPIENTS
#
# Posts the current alert to the Apprise endpoint at http://apprise:8000.
# ${1} is the space-separated list of custom recipients resolved for the
# alert's role; each one becomes the path segment after /notify/ and gets its
# own POST. With a single recipient this is the same single request as before.
#
# Reads the alert variables exported by the calling notify script (status,
# name, host, status_message, alarm, chart, value, units, info, raised_for)
# and uses its docurl/info/error helpers.
#
# Returns 0 when every POST answered HTTP 200, 1 if any of them did not.
custom_sender() {
    local recipients="${1}"

    # Translate the alert status into the value sent in the "type" form field;
    # anything other than the three known states is reported as "info".
    local notif_type="info"
    case "${status}" in
        CRITICAL) notif_type="failure" ;;
        WARNING)  notif_type="warning" ;;
        CLEAR)    notif_type="success" ;;
    esac

    local title="[${status}] ${name} on ${host}"

    # Built with printf so the message lines carry no source indentation.
    local body
    printf -v body '%s\n%s\n%s\n%s\n%s' \
        "${status_message}: ${alarm}" \
        "Chart: ${chart}" \
        "Value: ${value} ${units}" \
        "Info: ${info}" \
        "Raised for: ${raised_for}"

    local recipient apprise_url httpcode
    local failed=0

    # Intentionally unquoted: split the recipient list on whitespace.
    for recipient in ${recipients}; do
        apprise_url="http://apprise:8000/notify/${recipient}/"

        httpcode=$(docurl -X POST \
            --data-urlencode "title=${title}" \
            --data-urlencode "body=${body}" \
            --data-urlencode "type=${notif_type}" \
            "${apprise_url}")

        if [ "${httpcode}" = "200" ]; then
            info "sent custom notification for ${name} on ${host}"
        else
            error "failed to send notification for ${name} on ${host} (HTTP ${httpcode})"
            failed=1
        fi
    done

    return "${failed}"
}
|
||||||
@@ -13,6 +13,7 @@
|
|||||||
base_dir: "{{ (application_dir, app_name) | path_join }}"
|
base_dir: "{{ (application_dir, app_name) | path_join }}"
|
||||||
config_dir: "{{ (base_dir, 'config') | path_join }}"
|
config_dir: "{{ (base_dir, 'config') | path_join }}"
|
||||||
config_go_d_dir: "{{ (config_dir, 'go.d') | path_join }}"
|
config_go_d_dir: "{{ (config_dir, 'go.d') | path_join }}"
|
||||||
|
config_health_d_dir: "{{ (config_dir, 'health.d') | path_join }}"
|
||||||
data_dir: "{{ (base_dir, 'data') | path_join }}"
|
data_dir: "{{ (base_dir, 'data') | path_join }}"
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
@@ -37,6 +38,7 @@
|
|||||||
- "{{ data_dir }}"
|
- "{{ data_dir }}"
|
||||||
- "{{ config_dir }}"
|
- "{{ config_dir }}"
|
||||||
- "{{ config_go_d_dir }}"
|
- "{{ config_go_d_dir }}"
|
||||||
|
- "{{ config_health_d_dir }}"
|
||||||
|
|
||||||
- name: "Copy netdata config file"
|
- name: "Copy netdata config file"
|
||||||
ansible.builtin.template:
|
ansible.builtin.template:
|
||||||
@@ -75,6 +77,43 @@
|
|||||||
loop: "{{ go_d_existing_files.files }}"
|
loop: "{{ go_d_existing_files.files }}"
|
||||||
when: (item.path | basename) not in (go_d_source_files.files | map(attribute='path') | map('basename') | list)
|
when: (item.path | basename) not in (go_d_source_files.files | map(attribute='path') | map('basename') | list)
|
||||||
|
|
||||||
|
# List the health.d source files for this app on the controller; the result
# drives both the templating loop and the stale-file cleanup.
- name: Find all health.d config files
  delegate_to: localhost
  register: health_d_source_files
  ansible.builtin.find:
    file_type: "file"
    paths: "files/{{ app_name }}/health.d"
|
||||||
|
|
||||||
|
# Render each discovered health.d source file into the health.d config
# directory under its original file name, owned by the app user.
- name: Template all health.d config files
  loop: "{{ health_d_source_files.files }}"
  ansible.builtin.template:
    src: "{{ item.path }}"
    dest: "{{ config_health_d_dir }}/{{ item.path | basename }}"
    mode: "0640"
    owner: "{{ app_user }}"
    group: "{{ app_user }}"
|
||||||
|
|
||||||
|
# Collect the files currently present in the health.d config directory so
# leftovers from earlier runs can be detected.
- name: Find existing health.d config files on server
  register: health_d_existing_files
  ansible.builtin.find:
    file_type: "file"
    paths: "{{ config_health_d_dir }}"
|
||||||
|
|
||||||
|
# Delete any file in the health.d config directory whose base name has no
# counterpart among the source files found earlier.
- name: 'Remove health.d config files that don''t exist in source'
  loop: "{{ health_d_existing_files.files }}"
  when: >-
    (item.path | basename) not in
    (health_d_source_files.files | map(attribute='path') | map('basename') | list)
  ansible.builtin.file:
    state: absent
    path: "{{ item.path }}"
|
||||||
|
|
||||||
|
# Render the notification override into the config directory as
# health_alarm_notify.conf, owned by the app user.
- name: Copy health alarm notify config
  ansible.builtin.template:
    dest: "{{ config_dir }}/health_alarm_notify.conf"
    src: "files/{{ app_name }}/health_alarm_notify.template.conf"
    mode: "0640"
    owner: "{{ app_user }}"
    group: "{{ app_user }}"
|
||||||
|
|
||||||
- name: "Grab docker group id."
|
- name: "Grab docker group id."
|
||||||
ansible.builtin.shell:
|
ansible.builtin.shell:
|
||||||
cmd: |
|
cmd: |
|
||||||
|
|||||||
Reference in New Issue
Block a user