diff --git a/files/apprise/docker-compose.template.yml b/files/apprise/docker-compose.template.yml
index a26ecab..94126ab 100644
--- a/files/apprise/docker-compose.template.yml
+++ b/files/apprise/docker-compose.template.yml
@@ -7,7 +7,10 @@ services:
     ports:
       - "127.0.0.1:{{ apprise_external_port }}:8000"
     networks:
-      - "web_proxy_network"
+      # Explicit alias: other containers on this network reach the service as "apprise".
+      web_proxy_network:
+        aliases:
+          - "apprise"
     volumes:
       - "{{ config_dir }}:/config"
     environment:
diff --git a/files/netdata/health.d/system_resources.conf b/files/netdata/health.d/system_resources.conf
new file mode 100644
index 0000000..ab00f01
--- /dev/null
+++ b/files/netdata/health.d/system_resources.conf
@@ -0,0 +1,72 @@
+# Resource alerts for a low-spec home server.
+# Overrides stock alerts where thresholds differ; baseline RAM use is ~80%, so stock 80/90% would fire constantly.
+
+# RAM: warn at >92%, crit at >95% — by then less than ~200 MB free.
+    alarm: ram_in_use
+       on: system.ram
+    class: Utilization
+     type: System
+component: Memory
+     calc: $used * 100 / ($used + $cached + $free + $buffers)
+    units: %
+    every: 10s
+     warn: $this > 92
+     crit: $this > 95
+    delay: down 5m multiplier 1.5 max 1h
+  summary: System memory utilization
+     info: System memory utilization (used / total, excluding reclaimable cache)
+       to: sysadmin
+
+# CPU: replace stock 10min_cpu_usage with two windowed alerts.
+# The stock alert is overridden by name with a definition that has no warn/crit
+# expression and goes to the `silent` recipient, so it never raises or notifies.
+# NOTE(review): alternatively exclude it via `enabled alarms` in netdata.conf [health].
+    alarm: 10min_cpu_usage
+       on: system.cpu
+   lookup: average -10m unaligned of user,system,softirq,irq,guest
+    every: 1m
+       to: silent
+
+    alarm: cpu_warn_30m
+       on: system.cpu
+    class: Utilization
+     type: System
+component: CPU
+   lookup: average -30m unaligned of user,system,softirq,irq,guest,guest_nice,nice
+    units: %
+    every: 1m
+     warn: $this > 80
+    delay: down 30m multiplier 1.5 max 2h
+  summary: Sustained CPU load (30m avg)
+     info: Average CPU utilization over the last 30 minutes
+       to: sysadmin
+
+    alarm: cpu_crit_15m
+       on: system.cpu
+    class: Utilization
+     type: System
+component: CPU
+   lookup: average -15m unaligned of user,system,softirq,irq,guest,guest_nice,nice
+    units: %
+    every: 1m
+     crit: $this > 95
+    delay: down 30m multiplier 1.5 max 2h
+  summary: High CPU load (15m avg)
+     info: Average CPU utilization over the last 15 minutes
+       to: sysadmin
+
+# Disk: warn at >75%, crit at >90% on every mounted filesystem.
+ template: disk_space_usage
+       on: disk.space
+    class: Utilization
+     type: System
+component: Disk
+     calc: $used * 100 / ($avail + $used)
+    units: %
+    every: 1m
+     warn: $this > 75
+     crit: $this > 90
+    delay: down 15m multiplier 1.5 max 1h
+  summary: Disk space utilization
+     info: Disk space utilization on ${label:mount_point}
+       to: sysadmin
diff --git a/files/netdata/health_alarm_notify.template.conf b/files/netdata/health_alarm_notify.template.conf
new file mode 100644
index 0000000..5523017
--- /dev/null
+++ b/files/netdata/health_alarm_notify.template.conf
@@ -0,0 +1,45 @@
+# Override stock health_alarm_notify.conf — route every alert to apprise.
+# Stock conf is sourced first; this only sets what differs.
+
+SEND_EMAIL="NO"
+SEND_CUSTOM="YES"
+DEFAULT_RECIPIENT_CUSTOM="server"
+
+role_recipients_custom[sysadmin]="server"
+role_recipients_custom[domainadmin]="server"
+role_recipients_custom[dba]="server"
+role_recipients_custom[webmaster]="server"
+role_recipients_custom[proxyadmin]="server"
+role_recipients_custom[silent]="disabled"  # an empty value would fall back to DEFAULT_RECIPIENT_CUSTOM
+
+# custom_sender <key>: POST the alert to the apprise config <key> (${1}); returns 0 only on HTTP 200, else 1.
+custom_sender() {
+    local httpcode notif_type="info"
+    local apprise_url="http://apprise:8000/notify/${1}/"
+
+    case "${status}" in
+        CRITICAL) notif_type="failure" ;;
+        WARNING)  notif_type="warning" ;;
+        CLEAR)    notif_type="success" ;;
+    esac
+
+    local title="[${status}] ${name} on ${host}"
+    local body="${status_message}: ${alarm}
+Chart: ${chart}
+Value: ${value} ${units}
+Info: ${info}
+Raised for: ${raised_for}"
+
+    httpcode=$(docurl -X POST \
+        --data-urlencode "title=${title}" \
+        --data-urlencode "body=${body}" \
+        --data-urlencode "type=${notif_type}" \
+        "${apprise_url}")
+
+    if [ "${httpcode}" = "200" ]; then
+        info "sent custom notification for ${name} on ${host}"
+        return 0
+    fi
+    error "failed to send notification for ${name} on ${host} (HTTP ${httpcode})"
+    return 1
+}
diff --git a/playbook-netdata.yml b/playbook-netdata.yml
index 8bbd9e2..b75f027 100644
--- a/playbook-netdata.yml
+++ b/playbook-netdata.yml
@@ -13,6 +13,7 @@
     base_dir: "{{ (application_dir, app_name) | path_join }}"
     config_dir: "{{ (base_dir, 'config') | path_join }}"
     config_go_d_dir: "{{ (config_dir, 'go.d') | path_join }}"
+    config_health_d_dir: "{{ (config_dir, 'health.d') | path_join }}"
     data_dir: "{{ (base_dir, 'data') | path_join }}"
 
   tasks:
@@ -37,6 +38,7 @@
         - "{{ data_dir }}"
         - "{{ config_dir }}"
         - "{{ config_go_d_dir }}"
+        - "{{ config_health_d_dir }}"
 
     - name: "Copy netdata config file"
       ansible.builtin.template:
@@ -75,6 +77,45 @@
       loop: "{{ go_d_existing_files.files }}"
       when: (item.path | basename) not in (go_d_source_files.files | map(attribute='path') | map('basename') | list)
 
+    # Sync health.d from the repo: template every source file, then remove server files with no source counterpart.
+    - name: "Find all health.d config files"
+      ansible.builtin.find:
+        paths: "files/{{ app_name }}/health.d"
+        file_type: file
+      delegate_to: localhost
+      register: health_d_source_files
+
+    - name: "Template all health.d config files"
+      ansible.builtin.template:
+        src: "{{ item.path }}"
+        dest: "{{ config_health_d_dir }}/{{ item.path | basename }}"
+        owner: "{{ app_user }}"
+        group: "{{ app_user }}"
+        mode: "0640"
+      loop: "{{ health_d_source_files.files }}"
+
+    - name: "Find existing health.d config files on server"
+      ansible.builtin.find:
+        paths: "{{ config_health_d_dir }}"
+        file_type: file
+      register: health_d_existing_files
+
+    - name: "Remove health.d config files that don't exist in source"
+      ansible.builtin.file:
+        path: "{{ item.path }}"
+        state: absent
+      loop: "{{ health_d_existing_files.files }}"
+      when: (item.path | basename) not in (health_d_source_files.files | map(attribute='path') | map('basename') | list)
+
+    # Notification settings; the template defines custom_sender, which forwards alerts to apprise.
+    - name: "Copy health alarm notify config"
+      ansible.builtin.template:
+        src: "files/{{ app_name }}/health_alarm_notify.template.conf"
+        dest: "{{ config_dir }}/health_alarm_notify.conf"
+        owner: "{{ app_user }}"
+        group: "{{ app_user }}"
+        mode: "0640"
+
     - name: "Grab docker group id."
       ansible.builtin.shell:
         cmd: |