Alerts


/etc/prometheus/rules/server_alerts.yml > AllInstances
EspaceDisqueFull (4 active)
# EspaceDisqueFull: a writable filesystem has less than 10% of its size
# available, and the condition has held for 2 minutes.
alert: EspaceDisqueFull
# tmpfs is excluded here so that both disk-space rules (this one and
# EspaceDisqueFull24Hours) ignore the same filesystem type.
# The trailing `* on(instance) group_left(alias)` join copies the `alias`
# label of node_exporter_build_info onto every result.
# NOTE(review): matching is per (instance, device, mountpoint), so one device
# mounted at several mountpoints produces one alert per mountpoint — confirm
# that this is intended.
expr: >-
  ((node_filesystem_avail_bytes{fstype!~"tmpfs"} * 100) / node_filesystem_size_bytes < 10
  and on(instance, device, mountpoint) node_filesystem_readonly == 0)
  * on(instance) group_left(alias) node_exporter_build_info
for: 2m
labels:
  severity: warning
annotations:
  summary: "Serveur à court d'espace disque ({{ $labels.alias }})"
  description: |
    Le disque est bientôt plein (< 10% left)
Labels State Active Since Value
alertname="EspaceDisqueFull" alias="airflow-metrics" device="/dev/sda1" fstype="ext4" instance="51.75.12.216:9100" job="Airflow" mountpoint="/" severity="warning" firing 2026-04-06 10:34:37.195354459 +0000 UTC 6.330396656671675
alertname="EspaceDisqueFull" alias="media.belive.ai" device="/dev/sda1" fstype="ext4" instance="5.196.234.26:9100" job="Others" mountpoint="/mnt/data" severity="warning" firing 2026-04-06 06:07:37.195354459 +0000 UTC 6.679054782030816
alertname="EspaceDisqueFull" alias="dev-server2" device="/dev/sdb1" fstype="ext4" instance="5.196.234.244:9100" job="server-dev" mountpoint="/" severity="warning" firing 2026-04-06 22:00:37.195354459 +0000 UTC 9.887404090957734
alertname="EspaceDisqueFull" alias="media.belive.ai" device="/dev/sda1" fstype="ext4" instance="5.196.234.26:9100" job="Others" mountpoint="/var/jail/mediauser/data" severity="warning" firing 2026-04-06 06:07:37.195354459 +0000 UTC 6.679054782030816
EspaceDisqueFull24Hours (0 active)
# EspaceDisqueFull24Hours: fires after 2 minutes when all of the following
# hold for the same (instance, device, mountpoint):
#   - available space is below 10% of the filesystem size;
#   - a linear extrapolation of the last hour of available bytes (non-tmpfs
#     only), projected 24 * 3600 seconds ahead, is negative;
#   - the filesystem is not read-only.
alert: EspaceDisqueFull24Hours
# The final join copies the `alias` label from node_exporter_build_info.
expr: >-
  ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
  and on(instance, device, mountpoint)
  predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0
  and on(instance, device, mountpoint) node_filesystem_readonly == 0)
  * on(instance) group_left(alias) node_exporter_build_info
for: 2m
labels:
  severity: warning
annotations:
  summary: "Disque plein dans les prochaines 24 heures ({{ $labels.alias }})"
  description: |
    Le système de fichiers devrait manquer d'espace dans les prochaines 24 heures au taux d'écriture actuel
GPUTemperatureHigh (0 active)
# GPUTemperatureHigh: nvidia_gpu_temperature_celsius stays above 80 for
# 2 minutes.
alert: GPUTemperatureHigh
# The join copies the `alias` label from node_exporter_build_info.
# NOTE(review): the join matches on `instance` only; confirm the GPU metric
# and node_exporter_build_info carry identical `instance` values, otherwise
# this expression returns nothing.
expr: >-
  (nvidia_gpu_temperature_celsius > 80)
  * on(instance) group_left(alias) node_exporter_build_info
for: 2m
labels:
  severity: critical
annotations:
  summary: "Surchauffe GPU sur {{ $labels.alias }}"
  description: "La température du GPU est de {{ $value }}°C."
GPUUsageHigh (0 active)
# GPUUsageHigh: nvidia_gpu_duty_cycle stays above 90 for 5 minutes.
alert: GPUUsageHigh
# The join copies the `alias` label from node_exporter_build_info.
# NOTE(review): the join matches on `instance` only; confirm the GPU metric
# and node_exporter_build_info carry identical `instance` values.
expr: >-
  (nvidia_gpu_duty_cycle > 90)
  * on(instance) group_left(alias) node_exporter_build_info
# Keep this duration in sync with the "5 minutes" quoted in the description.
for: 5m
labels:
  severity: warning
annotations:
  summary: "Utilisation GPU élevée sur {{ $labels.alias }}"
  description: "L'utilisation du GPU est supérieure à 90% depuis 5 minutes."
GPUVramHigh (0 active)
# GPUVramHigh: used GPU memory exceeds 90% of total GPU memory for 2 minutes.
alert: GPUVramHigh
# The ratio is scaled by 100 before the comparison, so the alert value is a
# percentage. The join copies the `alias` label from node_exporter_build_info.
# NOTE(review): the join matches on `instance` only; confirm the GPU metrics
# and node_exporter_build_info carry identical `instance` values.
expr: >-
  ((nvidia_gpu_memory_used_bytes / nvidia_gpu_memory_total_bytes) * 100 > 90)
  * on(instance) group_left(alias) node_exporter_build_info
for: 2m
labels:
  severity: critical
annotations:
  summary: "VRAM GPU presque pleine sur {{ $labels.alias }}"
  # The value is rendered with two decimals.
  description: >-
    L'utilisation de la mémoire vidéo est à
    {{ $value | printf "%.2f" }}%.
MemoryUnderMemoryPressure (0 active)
# MemoryUnderMemoryPressure: the per-second rate of node_vmstat_pgmajfault
# over the last minute stays above 1000 for 1 minute.
alert: MemoryUnderMemoryPressure
# The join copies the `alias` label from node_exporter_build_info.
expr: >-
  (rate(node_vmstat_pgmajfault[1m]) > 1000)
  * on(instance) group_left(alias) node_exporter_build_info
for: 1m
labels:
  severity: warning
annotations:
  summary: "RAM sous pression ({{ $labels.alias }})"
  # Literal block with the trailing newline stripped; the two-space indent
  # on the VALUE / LABELS lines is part of the rendered text.
  description: |-
    RAM sous pression - Risque de crash
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
MysqlDown (0 active)
# MysqlDown: mysql_up equals 0. No `for:` clause is set on this rule.
alert: MysqlDown
# The join copies the `alias` label from node_exporter_build_info.
# NOTE(review): the join matches on `instance` only; confirm mysql_up and
# node_exporter_build_info carry identical `instance` values, otherwise this
# expression returns nothing and the alert can never fire.
expr: >-
  (mysql_up == 0)
  * on(instance) group_left(alias) node_exporter_build_info
labels:
  severity: critical
annotations:
  summary: "MySQL down ({{ $labels.alias }})"
  description: "MySQL est down sur {{ $labels.alias }}"
MysqlReplicationStopped (0 active)
# MysqlReplicationStopped: either replication thread metric (IO or SQL)
# equals 0 for 1 minute, restricted to series whose label set matches a
# mysql_slave_status_slave_io_running series with alias="replica-shared".
alert: MysqlReplicationStopped
# Unlike most rules in this group there is no join with
# node_exporter_build_info; the `alias` label is read from the MySQL metric.
expr: >-
  ((mysql_slave_status_slave_io_running == 0
  or mysql_slave_status_slave_sql_running == 0)
  and mysql_slave_status_slave_io_running{alias="replica-shared"})
for: 1m
labels:
  severity: critical
annotations:
  summary: "Réplication MySQL arrêtée ({{ $labels.instance }})"
  description: >-
    La réplication MySQL est arrêtée sur {{ $labels.instance }}.
    Vérifiez l’état du replica.
NTP_Decalee (0 active)
# NTP_Decalee: the clock offset is beyond ±0.05 s and is not moving back
# towards zero, sustained for 2 minutes.
alert: NTP_Decalee
# Positive branch: offset > 0.05 with a non-negative 5-minute derivative.
# Negative branch: offset < -0.05 with a non-positive 5-minute derivative.
# The join copies the `alias` label from node_exporter_build_info.
expr: >-
  ((node_timex_offset_seconds > 0.05
  and deriv(node_timex_offset_seconds[5m]) >= 0)
  or (node_timex_offset_seconds < -0.05
  and deriv(node_timex_offset_seconds[5m]) <= 0))
  * on(instance) group_left(alias) node_exporter_build_info
for: 2m
labels:
  severity: warning
annotations:
  summary: "Heure Décalée ({{ $labels.alias }})"
  description: |
    Horloge non synchronisée. Vérifiez la config NTP.
NTP_Off (0 active)
# NTP_Off: node_timex_sync_status was 0 at some point in the last minute
# (its 1-minute minimum is 0) while node_timex_maxerror_seconds is at least
# 16, sustained for 2 minutes.
alert: NTP_Off
# The join copies the `alias` label from node_exporter_build_info.
expr: >-
  (min_over_time(node_timex_sync_status[1m]) == 0
  and node_timex_maxerror_seconds >= 16)
  * on(instance) group_left(alias) node_exporter_build_info
for: 2m
labels:
  severity: warning
annotations:
  summary: "Heure non synchronisée ({{ $labels.alias }})"
  description: |
    Horloge non synchronisée. Vérifiez la config NTP.
RAMHigh (0 active)
# RAMHigh: memory in use, computed as (MemTotal - MemAvailable) / MemTotal
# and scaled to a percentage, stays above 95 for 1 minute.
alert: RAMHigh
# The join copies the `alias` label from node_exporter_build_info.
expr: >-
  (((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
  / node_memory_MemTotal_bytes) * 100 > 95)
  * on(instance) group_left(alias) node_exporter_build_info
for: 1m
labels:
  severity: critical
  # This is the only rule in the group that sets a `team` label.
  team: devops
annotations:
  summary: "RAM usage high on {{ $labels.alias }}"
  description: "RAM usage is above 95%."
SaturationReseau (0 active)
ServeurManquant (0 active)
# ServeurManquant: a scrape target reports up == 0. No `for:` clause is set,
# so no hold duration is configured for this rule.
alert: ServeurManquant
expr: up == 0
labels:
  severity: critical
annotations:
  # Unlike most rules in this group, the expression is not joined with
  # node_exporter_build_info, so `alias` is available only when the `up`
  # series itself carries it. Fall back to the instance address when it is
  # empty, so the summary never renders as "()".
  summary: >-
    Cible Prometheus manquante
    ({{ if $labels.alias }}{{ $labels.alias }}{{ else }}{{ $labels.instance }}{{ end }})
  description: |
    Une cible Prometheus a disparu. Un serveur a peut-être crash.