Monitoring stack
The monitoring stack with Grafana, Prometheus, and Alertmanager delivers efficient system and application insights. Grafana visualizes metrics, Prometheus monitors and alerts on time series data, and Alertmanager handles alert notifications. This setup ensures proactive infrastructure management.
version : '3.3'
networks :
monitoring :
driver : bridge
volumes :
prometheus_data : {}
services :
node-exporter :
image : prom/node-exporter:latest
container_name : node-exporter
restart : unless-stopped
volumes :
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command :
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
ports :
- 9100:9100
networks :
- monitoring
prometheus :
image : prom/prometheus:latest
user : "1001"
environment :
- PUID=1001
- PGID=1001
container_name : prometheus
restart : unless-stopped
volumes :
- /home/1001/promgrafnode/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- /home/1001/promgrafnode/prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml
- /home/1001/promgrafnode/prometheus:/prometheus
command :
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
ports :
- 9090:9090
networks :
- monitoring
grafana :
image : grafana/grafana:latest
user : "1001"
container_name : grafana
ports :
- 3000:3000
restart : unless-stopped
volumes :
- /home/1001/promgrafnode/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
- /home/1001/promgrafnode/grafana:/var/lib/grafana
networks :
- monitoring
alertmanager :
image : prom/alertmanager:latest
command :
- '--config.file=/etc/alertmanager/config.yml'
ports :
- '9093:9093'
volumes :
- '/home/alertmanager/:/etc/alertmanager'
networks :
- alertmanager-net
networks :
alertmanager-net :
cadvisor :
image : gcr.io/cadvisor/cadvisor:latest
container_name : cadvisor
ports :
- 8081:8080
networks :
- monitoring
volumes :
- /:/rootfs:ro
- /var/run:/var/run:rw
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
depends_on :
- redis
redis :
image : redis:latest
container_name : redis
ports :
- 6379:6379
networks :
- monitoring
```
## prometheus.yml
``` yaml linenums="1"
global :
scrape_interval : 1m
rule_files :
- alert.rules.yml
alerting :
alertmanagers :
- static_configs :
- targets : [ 'alto.merox.cloud:9093' ]
scrape_configs :
- job_name : "prometheus"
scrape_interval : 1m
static_configs :
- targets : [ "localhost:9090" ]
- job_name : "AltoVM"
static_configs :
- targets : [ "node-exporter:9100" ]
- job_name : "CirrusVM"
static_configs :
- targets : [ "cirrus.merox.cloud:9100" ]
- job_name : "cadvisor_alto"
scrape_interval : 5s
static_configs :
- targets : [ "alto.merox.cloud:8081" ]
- job_name : "cadvisor_cirrus"
scrape_interval : 5s
static_configs :
- targets : [ "cirrus.merox.cloud:8081" ]
# - job_name: "cadvisor_tower"
# scrape_interval: 5s
# static_configs:
# - targets: ["media.merox.cloud:8081"]
#Windows machines
- job_name : "Tower"
scrape_interval : 5s
static_configs :
- targets : [ "tower.merox.cloud:9182" ]
- job_name : "Wserver"
scrape_interval : 5s
static_configs :
- targets : [ "wserver.merox.cloud:9182" ]
#Proxmox
- job_name : 'pve-exporter'
static_configs :
- targets :
- nexus.merox.cloud:9221
metrics_path : /pve
params :
module : [ default ]
alert.rules.yml
groups :
- name : alert.rules
rules :
- alert : InstanceDown
expr : up == 0
for : 1m
labels :
severity : "critical"
annotations :
summary : "Endpoint {{ $labels.instance }} down"
description : "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
- alert : HostOutOfMemory
expr : node_memory_MemAvailable / node_memory_MemTotal * 100 < 25
for : 5m
labels :
severity : warning
annotations :
summary : "Host out of memory (instance {{ $labels.instance }})"
description : "Node memory is filling up (< 25% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert : HostOutOfDiskSpace
expr : (node_filesystem_avail{mountpoint="/"} * 100) / node_filesystem_size{mountpoint="/"} < 50
for : 1s
labels :
severity : warning
annotations :
summary : "Host out of disk space (instance {{ $labels.instance }})"
description : "Disk is almost full (< 50% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert : HostHighCpuLoad
expr : (sum by (instance) (irate(node_cpu{job="node_exporter_metrics",mode="idle"}[5m]))) > 80
for : 5m
labels :
severity : warning
annotations :
summary : "Host high CPU load (instance {{ $labels.instance }})"
description : "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert : HighHTTPRequestRate
expr : rate(http_requests_total[1m]) > 1000
for : 1m
labels :
severity : warning
annotations :
summary : "Possible DDoS Attack Detected"
description : "High rate of HTTP requests detected. Possible DDoS attack."
docker-compose -f /path/to/your/docker-compose-file.yml up -d
February 29, 2024
February 29, 2024