Production monitoring rules

A record of the monitoring rules currently running in production:

  • exporter monitoring;
  • node / VM level;
  • pod level;
  • prometheus level;
  • volume level;
  • website level;
root@hk-eks-ctl:/home/monitor/prometheus# cat prometheus-rules.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: monitor
data:
  general.rules: |
    groups:
    - name: general.rules
      rules:
      - alert: InstanceDown
        expr: |
          up{job=~"aws-hk-monitor|k8s-nodes"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} 停止工作"
          description: "{{ $labels.instance }} job {{ $labels.job }} 已经停止1分钟以上."

  node.rules: |
    groups:
    - name: node.rules
      rules:
      - alert: NodeFilesystemUsage
        expr: |
          100 - (node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
          description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }})"
      - alert: NodeMemoryUsage
        expr: |
          100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} 内存使用率过高"
          description: "{{ $labels.instance }}内存使用大于85% (当前值: {{ $value }})"
      - alert: NodeCPUUsage
        expr: |
          100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} CPU使用率过高"
          description: "{{ $labels.instance }}CPU使用大于85% (当前值: {{ $value }})"
      - alert: TCP_Estab
        expr: |
          node_netstat_Tcp_CurrEstab > 1500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} TCP_Estab链接过高"
          description: "{{ $labels.instance }} TCP_Estab链接过高!(当前值: {{ $value }})"
      - alert: TCP_TIME_WAIT
        expr: |
          node_sockstat_TCP_tw > 1500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} TCP_TIME_WAIT过高"
          description: "{{ $labels.instance }} TCP_TIME_WAIT过高!(当前值: {{ $value }})"
      - alert: KubeNodeNotReady
        expr: |
          kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.node }} has been NotReady for 1 minute.'
      - alert: UnusualDiskReadRate
        expr: |
          sum by (job,instance) (irate(node_disk_read_bytes_total[5m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} disk read rate has stayed above 50 MB/s for 5 minutes (current value: {{ $value }})'
      - alert: UnusualDiskWriteRate
        expr: |
          sum by (job,instance) (irate(node_disk_written_bytes_total[5m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} disk write rate has stayed above 50 MB/s for 5 minutes (current value: {{ $value }})'
      - alert: UnusualNetworkThroughputIn
        expr: |
          sum by (job,instance) (irate(node_network_receive_bytes_total{job=~"aws-hk-monitor|k8s-nodes"}[5m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} network receive throughput has stayed above 50 MB/s for 5 minutes (current value: {{ $value }})'
      - alert: UnusualNetworkThroughputOut
        expr: |
          sum by (job,instance) (irate(node_network_transmit_bytes_total{job=~"aws-hk-monitor|k8s-nodes"}[5m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} network transmit throughput has stayed above 50 MB/s for 5 minutes (current value: {{ $value }})'
      - alert: SystemdServiceCrashed
        expr: |
          node_systemd_unit_state{state="failed"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: 'The {{ $labels.name }} service on {{ $labels.instance }} has been in a failed state for 5 minutes; please handle it promptly.'
  prometheus.rules: |
    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusErrorSendingAlertsToAnyAlertmanagers
        expr: |
           (rate(prometheus_notifications_errors_total{instance="localhost:9090", job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{instance="localhost:9090", job="prometheus"}[5m])) * 100 > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: |
           max_over_time(prometheus_notifications_alertmanagers_discovered{instance="localhost:9090", job="prometheus"}[5m]) < 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: "Prometheus {{$labels.namespace}}/{{$labels.pod}} 链接alertmanager异常!"
      - alert: PrometheusRuleFailures
        expr: |
           increase(prometheus_rule_evaluation_failures_total{instance="localhost:9090", job="prometheus"}[5m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          description: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} had {{ printf "%.0f" $value }} rule evaluation failures in the last 5 minutes.'
  website.rules: |
    groups:
    - name: website.rules
      rules:
      - alert: "ssl证书过期警告"
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          description: 'The certificate for {{$labels.instance}} expires in {{ printf "%.1f" $value }} days; please renew it as soon as possible.'
          summary: "SSL certificate expiring soon"
      - alert: blackbox_network_stats
        expr: probe_success == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "接口/主机/端口/域名 {{ $labels.instance }} 不能访问"
          description: "接口/主机/端口/域名 {{ $labels.instance }} 不能访问,请尽快检测!"
      - alert: curlHttpStatus
        expr: probe_http_status_code{job="blackbox-http"} >= 422 and probe_success{job="blackbox-http"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Business alert: website unreachable'
          description: '{{$labels.instance}} is unreachable; please check promptly. Current status code: {{$value}}'
  pod.rules: |
    groups:
    - name: pod.rules
      rules:
      - alert: PodCPUUsage
        expr: |
           sum(rate(container_cpu_usage_seconds_total{image!=""}[5m]) * 100) by (pod, namespace) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} CPU使用大于80% (当前值: {{ $value }})"
      - alert: PodMemoryUsage
        expr: |
           sum(container_memory_rss{image!=""}) by(pod, namespace) / sum(container_spec_memory_limit_bytes{image!=""}) by(pod, namespace) * 100 != +inf > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 内存使用大于80% (当前值: {{ $value }})"
      - alert: KubeDeploymentError
        expr: |
           kube_deployment_spec_replicas{job="kubernetes-service-endpoints"} != kube_deployment_status_replicas_available{job="kubernetes-service-endpoints"}
        for: 3m
        labels:
          severity: warning
        annotations:
          description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }}控制器与实际数量不相符 (当前值: {{ $value }})"
      - alert: coreDnsError
        expr: |
           kube_pod_container_status_running{container="coredns"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} coreDns服务异常 (当前值: {{ $value }})"
      - alert: kubeProxyError
        expr: |
           kube_pod_container_status_running{container="kube-proxy"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} kube-proxy服务异常 (当前值: {{ $value }})"
      - alert: filebeatError
        expr: |
           kube_pod_container_status_running{container="filebeat"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} filebeat服务异常 (当前值: {{ $value }})"
      - alert: PodNetworkReceive
        expr: |
           sum(rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace) > 30000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 入口流量大于30MB/s (当前值: {{ $value }}K/s)"
      - alert: PodNetworkTransmit
        expr: |
           sum(rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace) > 30000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 出口流量大于30MB/s (当前值: {{ $value }}/K/s)"
      - alert: PodRestart
        expr: |
           sum(changes(kube_pod_container_status_restarts_total[1m])) by (pod,namespace) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod重启 (当前值: {{ $value }})"
      - alert: PodFailed
        expr: |
           sum(kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
        for: 5s
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Failed (当前值: {{ $value }})"
      - alert: PodPending
        expr: |
           sum(kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Pending (当前值: {{ $value }})"

      - alert: PodErrImagePull
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态ErrImagePull (当前值: {{ $value }})"
      - alert: PodImagePullBackOff
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态ImagePullBackOff (当前值: {{ $value }})"
      - alert: PodCrashLoopBackOff
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态CrashLoopBackOff (当前值: {{ $value }})"
      - alert: PodInvalidImageName
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="InvalidImageName"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态InvalidImageName (当前值: {{ $value }})"
      - alert: PodCreateContainerConfigError
        expr: |
           sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"}) == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态CreateContainerConfigError (当前值: {{ $value }})"
  volume.rules: |
    groups:
    - name: volume.rules
      rules:
      - alert: PersistentVolumeClaimLost
        expr: |
           sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Lost"}) == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is lost\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: PersistentVolumeClaimPending
        expr: |
           sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Pending"}) == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pendig\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: PersistentVolumeFailed
        expr: |
           sum(kube_persistentvolume_status_phase{phase="Failed",job="kubernetes-service-endpoints"}) by (persistentvolume) == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "Persistent volume is failed state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: PersistentVolumePending
        expr: |
           sum(kube_persistentvolume_status_phase{phase="Pending",job="kubernetes-service-endpoints"}) by (persistentvolume) == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "Persistent volume is pending state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"