| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: ServeurManquant
expr: up == 0
labels:
severity: critical
annotations:
description: |
Une cible Prometheus a disparu. Un serveur a peut-être crashé.
summary: Cible Prometheus manquante ({{ $labels.alias }})
|
ok
|
|
23.33s ago
|
1.349ms |
| alert: EspaceDisqueFull
expr: ((node_filesystem_avail_bytes
* 100) / node_filesystem_size_bytes < 10 and on(instance, device, mountpoint)
node_filesystem_readonly == 0) * on(instance) group_left(alias) node_exporter_build_info
for: 2m
labels:
severity: warning
annotations:
description: |
Le disque est bientôt plein (< 10% restant)
summary: Serveur à court d'espace disque ({{ $labels.alias }})
|
ok
|
|
23.328s ago
|
18.54ms |
| alert: EspaceDisqueFull24Hours
expr: ((node_filesystem_avail_bytes
* 100) / node_filesystem_size_bytes < 10 and on(instance, device, mountpoint)
predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600)
< 0 and on(instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance)
group_left(alias) node_exporter_build_info
for: 2m
labels:
severity: warning
annotations:
description: |
Le système de fichiers devrait manquer d'espace dans les prochaines 24 heures au taux d'écriture actuel
summary: Disque plein dans les prochaines 24 heures ({{ $labels.alias }})
|
ok
|
|
23.31s ago
|
21.62ms |
| alert: SaturationReseau
expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+ rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]))
/ node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} >
0.8) * on(instance) group_left(alias) node_exporter_build_info
for: 1m
labels:
severity: warning
annotations:
description: |
L'interface réseau "{{ $labels.device }}" est surchargée.
summary: Interface réseau saturée ({{ $labels.alias }})
|
ok
|
|
23.289s ago
|
8.545ms |
| alert: NTP_Decalee
expr: ((node_timex_offset_seconds
> 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds
< -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left(alias)
node_exporter_build_info
for: 2m
labels:
severity: warning
annotations:
description: |
Horloge décalée de plus de 50 ms. Vérifiez la config NTP.
summary: Heure Décalée ({{ $labels.alias }})
|
ok
|
|
23.281s ago
|
4.442ms |
| alert: NTP_Off
expr: (min_over_time(node_timex_sync_status[1m])
== 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left(alias)
node_exporter_build_info
for: 2m
labels:
severity: warning
annotations:
description: |
Horloge non synchronisée. Vérifiez la config NTP.
summary: Heure non synchronisée ({{ $labels.alias }})
|
ok
|
|
23.276s ago
|
2.662ms |
| alert: MemoryUnderMemoryPressure
expr: (rate(node_vmstat_pgmajfault[1m])
> 1000) * on(instance) group_left(alias) node_exporter_build_info
for: 1m
labels:
severity: warning
annotations:
description: |-
RAM sous pression - Risque de crash
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: RAM sous pression ({{ $labels.alias }})
|
ok
|
|
23.274s ago
|
1.599ms |
| alert: MysqlDown
expr: (mysql_up
== 0) * on(instance) group_left(alias) node_exporter_build_info
labels:
severity: critical
annotations:
description: MySQL est down sur {{ $labels.alias }}
summary: MySQL down ({{ $labels.alias }})
|
ok
|
|
23.272s ago
|
873.2us |
| alert: RAMHigh
expr: (((node_memory_MemTotal_bytes
- node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100 > 95) *
on(instance) group_left(alias) node_exporter_build_info
for: 1m
labels:
severity: critical
team: devops
annotations:
description: RAM usage is above 95%.
summary: RAM usage high on {{ $labels.alias }}
|
ok
|
|
23.272s ago
|
3.803ms |
| alert: MysqlReplicationStopped
expr: ((mysql_slave_status_slave_io_running
== 0 or mysql_slave_status_slave_sql_running == 0) and mysql_slave_status_slave_io_running{alias="replica-shared"})
for: 1m
labels:
severity: critical
annotations:
description: La réplication MySQL est arrêtée sur {{ $labels.instance }}. Vérifiez
l’état du replica.
summary: Réplication MySQL arrêtée ({{ $labels.instance }})
|
ok
|
|
23.268s ago
|
220.4us |
| alert: GPUUsageHigh
expr: (nvidia_gpu_duty_cycle
> 90) * on(instance) group_left(alias) node_exporter_build_info
for: 5m
labels:
severity: warning
annotations:
description: L'utilisation du GPU est supérieure à 90% depuis 5 minutes.
summary: Utilisation GPU élevée sur {{ $labels.alias }}
|
ok
|
|
23.268s ago
|
850.3us |
| alert: GPUVramHigh
expr: ((nvidia_gpu_memory_used_bytes
/ nvidia_gpu_memory_total_bytes) * 100 > 90) * on(instance) group_left(alias)
node_exporter_build_info
for: 2m
labels:
severity: critical
annotations:
description: L'utilisation de la mémoire vidéo est à {{ $value | printf "%.2f"
}}%.
summary: VRAM GPU presque pleine sur {{ $labels.alias }}
|
ok
|
|
23.267s ago
|
874.8us |
| alert: GPUTemperatureHigh
expr: (nvidia_gpu_temperature_celsius
> 80) * on(instance) group_left(alias) node_exporter_build_info
for: 2m
labels:
severity: critical
annotations:
description: La température du GPU est de {{ $value }}°C.
summary: Surchauffe GPU sur {{ $labels.alias }}
|
ok
|
|
23.267s ago
|
929.6us |