Alerts


/etc/prometheus/rules/alerts.yml > Blackbox exporter alerts
HTTPSNotUsed (0 active)
# Warns when probe_http_ssl is 0 for the blackbox "https" / "https_ipv6"
# modules, sustained for 10 minutes.
alert: HTTPSNotUsed
expr: probe_http_ssl{job="blackbox",module=~"https(_ipv6)?"} == 0
for: 10m
labels:
  severity: warning
annotations:
  summary: The HTTP server at {{ $labels.instance }} did not force SSL
  generic_summary: HTTPS not used
  description: >-
    The HTTP server at {{ $labels.instance }} did not redirect to HTTPS,
    or SSL failed.
ProbeFailure (0 active)
# Critical alert for any blackbox target whose probe_success has been 0
# for 10 minutes.
alert: ProbeFailure
expr: probe_success{job="blackbox"} == 0
for: 10m
labels:
  severity: critical
annotations:
  summary: The probe to {{ $labels.instance }} has failed
  generic_summary: Blackbox probe failed
  description: >-
    The {{ $labels.module }} probe to {{ $labels.instance }} has failed
    due to protocol errors or failed checks.
SSLCertExpiringSoon (0 active)
# Warns when probe_ssl_earliest_cert_expiry minus the current time is below
# 86400 * 15 (15 days expressed in seconds), sustained for 10 minutes.
alert: SSLCertExpiringSoon
expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 15
for: 10m
labels:
  severity: warning
annotations:
  # $value is the left-hand side of the comparison (time remaining).
  # humanizeDuration renders its own units, so no literal "days" is appended.
  description: >-
    The SSL certificate at {{ $labels.instance }} will expire in
    {{ humanizeDuration $value }}.
  generic_summary: SSL certificate expiring soon
  summary: The SSL certificate at {{ $labels.instance }} will expire soon
SSLCertExpiringSoon (0 active)
# Critical variant: fires when probe_ssl_earliest_cert_expiry minus the
# current time is below 86400 * 7 (7 days expressed in seconds), sustained
# for 10 minutes.
alert: SSLCertExpiringSoon
expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 7
for: 10m
labels:
  severity: critical
annotations:
  # $value is the left-hand side of the comparison (time remaining).
  # humanizeDuration renders its own units, so no literal "days" is appended.
  description: >-
    The SSL certificate at {{ $labels.instance }} will expire in
    {{ humanizeDuration $value }}.
  generic_summary: SSL certificate expiring VERY soon
  summary: The SSL certificate at {{ $labels.instance }} will expire VERY soon
/etc/prometheus/rules/alerts.yml > Cronjob alerts
FailedCronJob (0 active)
# Warns when batch_last_finish_seconds has exceeded
# batch_last_success_seconds for 5 minutes (reported as a failed last run).
alert: FailedCronJob
expr: batch_last_finish_seconds > batch_last_success_seconds
for: 5m
labels:
  severity: warning
annotations:
  summary: Cronjob {{ $labels.job }} in {{ $labels.instance }} has failed
  generic_summary: Cronjob failed
  description: >-
    The last run of cronjob {{ $labels.job }} in {{ $labels.instance }}
    has failed.
MissingCronJob (0 active)
# Warns when the time elapsed since batch_last_start_seconds has been larger
# than batch_period_seconds for 5 minutes.
alert: MissingCronJob
expr: time() - batch_last_start_seconds > batch_period_seconds
for: 5m
labels:
  severity: warning
annotations:
  summary: Cronjob {{ $labels.job }} in {{ $labels.instance }} has not run
  generic_summary: Cronjob missing
  description: >-
    The cronjob {{ $labels.job }} in {{ $labels.instance }} has not run
    in the expected period.
SlowCronJob (0 active)
# Informational alert: batch_running_time_seconds above 7200 (the 2 hours
# quoted in the description) for 5 minutes.
alert: SlowCronJob
expr: batch_running_time_seconds > 7200
for: 5m
labels:
  severity: info
annotations:
  summary: Cronjob {{ $labels.job }} in {{ $labels.instance }} is too slow
  generic_summary: Cronjob too slow
  description: >-
    The last run of cronjob {{ $labels.job }} in {{ $labels.instance }}
    has taken more than 2 hours.
StuckCronJob (0 active)
# Warning alert: batch_running_time_seconds above 14400 (the 4 hours quoted
# in the description) for 5 minutes; the job is then treated as stuck.
alert: StuckCronJob
expr: batch_running_time_seconds > 14400
for: 5m
labels:
  severity: warning
annotations:
  summary: Cronjob {{ $labels.job }} in {{ $labels.instance }} is stuck
  generic_summary: Cronjob stuck
  description: >-
    The last run of cronjob {{ $labels.job }} in {{ $labels.instance }}
    has taken more than 4 hours, and it is considered stuck/hung.
/etc/prometheus/rules/alerts.yml > General alerts
InstanceDown (1 active)
# Critical alert when either up or pg_up has been 0 for 5 minutes.
alert: InstanceDown
expr: up == 0 or pg_up == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: Instance {{ $labels.instance }} down
  generic_summary: Service down
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 5 minutes.
Labels State Active Since Value
alertname="InstanceDown" env="test" instance="miller.infra.assekuransa.com:9100" job="node" severity="critical" firing 2025-10-03 14:06:34.863625018 +0000 UTC 0
/etc/prometheus/rules/alerts.yml > Node alerts
FSFull (0 active)
# Warns when the available/size filesystem ratio, scaled to a percentage,
# has been at or below 1 for 5 minutes.
# NOTE(review): the description previously said "less than 5%", which did not
# match the "<= 1" threshold below. The text now follows the expression; if
# 5% was the intended threshold, change the expression instead.
alert: FSFull
expr: >-
  instance:node_filesystem_avail_bytes_per_node_filesystem_size_bytes:ratio{job="node"}
  * 100 <= 1
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    The {{ $labels.mountpoint }} filesystem in {{ $labels.instance }} has
    1% or less available space.
  generic_summary: Filesystem almost full
  summary: >-
    Filesystem {{ $labels.mountpoint }} in {{ $labels.instance }} is almost
    full
FSFull (0 active)
# Critical variant: the same ratio, scaled to a percentage, at or below 0.5
# for 5 minutes.
alert: FSFull
expr: >-
  instance:node_filesystem_avail_bytes_per_node_filesystem_size_bytes:ratio{job="node"}
  * 100 <= 0.5
for: 5m
labels:
  severity: critical
annotations:
  summary: Filesystem {{ $labels.mountpoint }} in {{ $labels.instance }} is full
  generic_summary: Filesystem full
  description: >-
    The {{ $labels.mountpoint }} filesystem in {{ $labels.instance }} is
    full.
FSFullSoon (0 active)
# Informational alert: a linear extrapolation of the last 12h of available
# bytes, projected 24 * 3600 seconds ahead, is at or below 0 for 5 minutes.
alert: FSFullSoon
expr: >-
  predict_linear(instance:node_filesystem_avail_bytes:sum{job="node"}[12h],
  24 * 3600) <= 0
for: 5m
labels:
  severity: info
annotations:
  summary: >-
    Filesystem {{ $labels.mountpoint }} in {{ $labels.instance }} will fill
    soon
  generic_summary: Filesystem full soon
  description: >-
    The {{ $labels.mountpoint }} filesystem in {{ $labels.instance }} will
    be full in 24 hours at the current rate.
FSFullSoon (0 active)
# Warning variant: a linear extrapolation of the last 4h of available bytes,
# projected 4 * 3600 seconds ahead, is at or below 0 for 30 minutes.
alert: FSFullSoon
expr: >-
  predict_linear(instance:node_filesystem_avail_bytes:sum{job="node"}[4h],
  4 * 3600) <= 0
for: 30m
labels:
  severity: warning
annotations:
  summary: >-
    Filesystem {{ $labels.mountpoint }} in {{ $labels.instance }} will fill
    VERY soon
  generic_summary: Filesystem full VERY soon
  description: >-
    The {{ $labels.mountpoint }} filesystem in {{ $labels.instance }} will
    be full in 4 hours at the current rate.
HighCpuUsage (0 active)
# Informational alert: (1 - idle-mode 5m average rate) * 100 above 90 for
# 5 minutes.
alert: HighCpuUsage
expr: >-
  (1 - instance:node_cpu_seconds_total:avg_rate5m{job="node",mode="idle"})
  * 100 > 90
for: 5m
labels:
  severity: info
annotations:
  summary: CPU usage in {{ $labels.instance }} is too high
  generic_summary: CPU usage too high
  description: >-
    The CPU usage in {{ $labels.instance }} has been over 90% for more
    than 5 minutes.
HighLoadAvg (0 active)
# Informational alert: node_load15 above 100 for 5 minutes.
alert: HighLoadAvg
expr: node_load15{job="node"} > 100
for: 5m
labels:
  severity: info
annotations:
  summary: The load average in {{ $labels.instance }} is too high
  generic_summary: Load average too high
  description: >-
    The 15-minute load average in {{ $labels.instance }} has been over
    100 for more than 5 minutes.
MemFull (0 active)
# Informational alert: the MemUsed/MemTotal ratio, scaled to a percentage,
# above 90 for 15 minutes.
alert: MemFull
expr: >-
  instance:node_memory_MemUsed_bytes_per_node_memory_MemTotal_bytes:ratio{job="node"}
  * 100 > 90
for: 15m
labels:
  severity: info
annotations:
  summary: Memory usage in {{ $labels.instance }} is too high
  generic_summary: Memory usage too high
  description: >-
    The memory usage in {{ $labels.instance }} has been over 90% for more
    than 15 minutes.
MemFull (0 active)
# Warning variant: the MemUsed/MemTotal ratio, scaled to a percentage,
# above 95 for 5 minutes.
alert: MemFull
expr: >-
  instance:node_memory_MemUsed_bytes_per_node_memory_MemTotal_bytes:ratio{job="node"}
  * 100 > 95
for: 5m
labels:
  severity: warning
annotations:
  summary: Memory usage in {{ $labels.instance }} is critical
  generic_summary: Memory usage critical
  description: >-
    The memory usage in {{ $labels.instance }} has been over 95% for more
    than 5 minutes.
MemFullSoon (0 active)
# Informational alert: a linear extrapolation of the last 12h of the
# MemUsed/MemTotal ratio, projected 24 * 3600 seconds ahead and scaled to a
# percentage, is above 99 for 5 minutes.
alert: MemFullSoon
expr: >-
  predict_linear(instance:node_memory_MemUsed_bytes_per_node_memory_MemTotal_bytes:ratio{job="node"}[12h],
  24 * 3600) * 100 > 99
for: 5m
labels:
  severity: info
annotations:
  # The instance label is interpolated once; the earlier text repeated
  # "in {{ $labels.instance }}" twice in a row.
  description: >-
    The memory usage in {{ $labels.instance }} will reach 100% in 24 hours
    at the current rate.
  generic_summary: Memory full soon
  summary: Memory in {{ $labels.instance }} will fill in 24h
MemFullSoon (0 active)
# Warning variant: a linear extrapolation of the last 8h of the
# MemUsed/MemTotal ratio, projected 4 * 3600 seconds ahead and scaled to a
# percentage, is above 99 for 30 minutes.
alert: MemFullSoon
expr: >-
  predict_linear(instance:node_memory_MemUsed_bytes_per_node_memory_MemTotal_bytes:ratio{job="node"}[8h],
  4 * 3600) * 100 > 99
for: 30m
labels:
  severity: warning
annotations:
  # The instance label is interpolated once; the earlier text repeated
  # "in {{ $labels.instance }}" twice in a row.
  description: >-
    The memory usage in {{ $labels.instance }} will reach 100% in 4 hours
    at the current rate.
  generic_summary: Memory full VERY soon
  summary: Memory in {{ $labels.instance }} will fill in 4h
ProcessNearFDLimits (0 active)
# Warns when process_open_fds / process_max_fds, scaled to a percentage,
# has been above 80 for 5 minutes.
alert: ProcessNearFDLimits
expr: process_open_fds / process_max_fds * 100 > 80
for: 5m
labels:
  severity: warning
annotations:
  # $value is a raw float; print it with one decimal so the notification
  # does not show a long fractional tail.
  description: >-
    The process for {{ $labels.job }} in {{ $labels.instance }} has
    {{ printf "%.1f" $value }}% of available file descriptors in use.
  generic_summary: Too many files open
  # No trailing period, matching the other summary annotations.
  summary: The process in {{ $labels.instance }} has too many files open