# Prometheus alerting rules, organised into named rule groups.
groups:
  # Alerts built on the cadvisor scrape job and container_* metrics.
  - name: ContainerHealthAlerts
    rules:
      # Fires when the `up` series for the cadvisor job reports 0, i.e. the
      # scrape of that target failed. `for: 0m` makes the (previously
      # implicit) zero pending period explicit, matching the other rules.
      - alert: CadvisorContainerDown
        expr: 'up{job="cadvisor"} == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          # Include the instance and the VALUE/LABELS diagnostics like every
          # other rule in this file, so the notification identifies the target.
          summary: 'Cadvisor container is down (instance {{ $labels.instance }})'
          description: |-
            The Cadvisor container is down or not responding.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Fires when a container_last_seen timestamp is more than 60 seconds
      # older than the evaluation time. No pending period (`for: 0m`), so the
      # alert fires on the first evaluation that matches.
      - alert: ContainerKilled
        expr: time() - container_last_seen > 60
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Container killed (instance {{ $labels.instance }})'
          description: |-
            A container has disappeared
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Fires when no container_last_seen series exists at all for 5 minutes.
      # absent() returns a single series with value 1 and, because the selector
      # has no label matchers, no labels. The annotations therefore must not
      # reference {{ $labels.instance }} (it would always render empty), and
      # the text describes the real condition: container metrics are missing
      # entirely, not one specific container.
      - alert: ContainerAbsent
        expr: absent(container_last_seen)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Container metrics absent'
          description: >-
            No container_last_seen series has been reported for 5 min.
            Check that cAdvisor is running and being scraped.

      # Working-set memory as a percentage of the memory limit, aggregated per
      # (instance, name); fires when it stays above 80 for 2 minutes.
      # The `> 0` filter drops limit series whose value is 0, so those never
      # take part in the division.
      - alert: ContainerHighMemoryUsage
        expr: >-
          (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name)
          / sum(container_spec_memory_limit_bytes > 0) BY (instance, name)
          * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Container High Memory usage (instance {{ $labels.instance }})'
          description: |-
            Container Memory usage is above 80%
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Fires when the per-second rate of
      # container_cpu_cfs_throttled_seconds_total, computed over a 3-minute
      # window, stays above 1 for 2 minutes.
      - alert: ContainerHighThrottleRate
        expr: "rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Container high throttle rate (instance {{ $labels.instance }})'
          description: |-
            Container is being throttled
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Informational right-sizing hint: the 3-minute rate of
      # container_cpu_usage_seconds_total, summed per (instance, name) and
      # multiplied by 100, must stay below 20 for a full 7 days before firing.
      - alert: ContainerLowCpuUtilization
        expr: >-
          (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name)
          * 100) < 20
        for: 7d
        labels:
          severity: info
        annotations:
          summary: 'Container Low CPU utilization (instance {{ $labels.instance }})'
          description: |-
            Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Informational right-sizing hint: same ratio as ContainerHighMemoryUsage
      # (working set / memory limit * 100, per instance and name), but fires
      # when it stays below 20 for 7 days.
      - alert: ContainerLowMemoryUsage
        expr: >-
          (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name)
          / sum(container_spec_memory_limit_bytes > 0) BY (instance, name)
          * 100) < 20
        for: 7d
        labels:
          severity: info
        annotations:
          summary: 'Container Low Memory usage (instance {{ $labels.instance }})'
          description: |-
            Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

  # Alerts built on mysql_* metrics (presumably exported by mysqld_exporter;
  # confirm against the scrape configuration).
  - name: MySQLDatabaseHealth
    rules:
      # Fires immediately (`for: 0m`) when mysql_up reports 0.
      - alert: MysqlDown
        expr: mysql_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'MySQL down (instance {{ $labels.instance }})'
          description: |-
            MySQL instance is down on {{ $labels.instance }}
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Peak connected threads over the last minute as a percentage of
      # max_connections; fires when it stays above 80 for 2 minutes.
      # The alert name is kept verbatim (and quoted, since it contains
      # punctuation) because external routing may match on it.
      - alert: 'MysqlTooManyConnections(>80%)'
        expr: >-
          max_over_time(mysql_global_status_threads_connected[1m])
          / mysql_global_variables_max_connections * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'MySQL too many connections (> 80%) (instance {{ $labels.instance }})'
          description: |-
            More than 80% of MySQL connections are in use on {{ $labels.instance }}
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Peak running threads over the last minute as a percentage of
      # max_connections; fires when it stays above 60 for 2 minutes.
      - alert: MysqlHighThreadsRunning
        expr: >-
          max_over_time(mysql_global_status_threads_running[1m])
          / mysql_global_variables_max_connections * 100 > 60
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'MySQL high threads running (instance {{ $labels.instance }})'
          description: |-
            More than 60% of MySQL connections are in running state on {{ $labels.instance }}
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Fires when the slow-query counter has increased within the last minute
      # and keeps doing so for 2 minutes.
      # NOTE(review): a 1m range needs at least two samples to yield a value;
      # confirm the scrape interval for this target is well under 1 minute.
      - alert: MysqlSlowQueries
        expr: increase(mysql_global_status_slow_queries[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'MySQL slow queries (instance {{ $labels.instance }})'
          # Reworded from the garbled "MySQL server mysql has some new slow
          # query." and now names the affected instance.
          description: |-
            MySQL server on {{ $labels.instance }} has logged new slow queries.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Fires immediately (`for: 0m`) when the per-second rate of
      # mysql_global_status_innodb_log_waits over a 15-minute window exceeds 10.
      - alert: MysqlInnodbLogWaits
        expr: "rate(mysql_global_status_innodb_log_waits[15m]) > 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'MySQL InnoDB log waits (instance {{ $labels.instance }})'
          description: |-
            MySQL innodb log writes stalling
              VALUE = {{ $value }}
              LABELS = {{ $labels }}

      # Informational: fires immediately (`for: 0m`) while
      # mysql_global_status_uptime is below 60, which the description reads
      # as a restart within the last minute.
      - alert: MysqlRestarted
        expr: mysql_global_status_uptime < 60
        for: 0m
        labels:
          severity: info
        annotations:
          summary: 'MySQL restarted (instance {{ $labels.instance }})'
          description: |-
            MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}