线上监控rules
记录下当前线上的监控rules
- exporter监控;
- node/虚机层面;
- pod层面;
- prometheus层面;
- volume层面;
- website层面;
root@hk-eks-ctl:/home/monitor/prometheus# cat prometheus-rules.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-rules
namespace: monitor
data:
general.rules: |
groups:
- name: general.rules
rules:
- alert: InstanceDown
expr: |
up{job=~"aws-hk-monitor|k8s-nodes"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Instance {{ $labels.instance }} 停止工作"
description: "{{ $labels.instance }} job {{ $labels.job }} 已经停止1分钟以上."
node.rules: |
groups:
- name: node.rules
rules:
- alert: NodeFilesystemUsage
expr: |
100 - (node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 > 80
for: 1m
labels:
severity: warning
annotations:
summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }})"
- alert: NodeMemoryUsage
expr: |
100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "Instance {{ $labels.instance }} 内存使用率过高"
description: "{{ $labels.instance }}内存使用大于85% (当前值: {{ $value }})"
- alert: NodeCPUUsage
expr: |
100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 85
for: 10m
labels:
severity: warning
annotations:
summary: "Instance {{ $labels.instance }} CPU使用率过高"
description: "{{ $labels.instance }}CPU使用大于85% (当前值: {{ $value }})"
- alert: TCP_Estab
expr: |
node_netstat_Tcp_CurrEstab > 1500
for: 5m
labels:
severity: warning
annotations:
summary: "Instance {{ $labels.instance }} TCP_Estab链接过高"
description: "{{ $labels.instance }} TCP_Estab链接过高!(当前值: {{ $value }})"
- alert: TCP_TIME_WAIT
expr: |
node_sockstat_TCP_tw > 1500
for: 5m
labels:
severity: warning
annotations:
summary: "Instance {{ $labels.instance }} TCP_TIME_WAIT过高"
description: "{{ $labels.instance }} TCP_TIME_WAIT过高!(当前值: {{ $value }})"
- alert: KubeNodeNotReady
expr: |
kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1m
labels:
severity: critical
annotations:
description: '{{ $labels.node }} NotReady已经1分钟.'
- alert: UnusualDiskReadRate
expr: |
sum by (job,instance) (irate(node_disk_read_bytes_total[5m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} 持续5分钟磁盘读取数据(> 50 MB/s) (当前值: {{ $value }})'
- alert: UnusualDiskWriteRate
expr: |
sum by (job,instance) (irate(node_disk_written_bytes_total[4m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} 持续5分钟磁盘写入数据(> 50 MB/s) (当前值: {{ $value }})'
- alert: UnusualNetworkThroughputIn
expr: |
sum by (job,instance) (irate(node_network_receive_bytes_total{job=~"aws-hk-monitor|k8s-nodes"}[5m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} 持续5分钟网络带宽接收数据(> 50 MB/s) (当前值: {{ $value }})'
- alert: UnusualNetworkThroughputOut
expr: |
sum by (job,instance) (irate(node_network_transmit_bytes_total{job=~"aws-hk-monitor|k8s-nodes"}[5m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} 持续5分钟网络带宽发送数据(> 50 MB/s) (当前值: {{ $value }})'
- alert: SystemdServiceCrashed
expr: |
node_systemd_unit_state{state="failed"} == 1
for: 5m
labels:
severity: warning
annotations:
description: '{{ $labels.instance }} 上的{{$labels.name}}服务有问题已经5分钟,请及时处理'
prometheus.rules: |
groups:
- name: prometheus.rules
rules:
- alert: PrometheusErrorSendingAlertsToAnyAlertmanagers
expr: |
(rate(prometheus_notifications_errors_total{instance="localhost:9090", job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{instance="localhost:9090", job="prometheus"}[5m])) * 100 > 3
for: 5m
labels:
severity: warning
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
- alert: PrometheusNotConnectedToAlertmanagers
expr: |
max_over_time(prometheus_notifications_alertmanagers_discovered{instance="localhost:9090", job="prometheus"}[5m]) < 1
for: 5m
labels:
severity: critical
annotations:
description: "Prometheus {{$labels.namespace}}/{{$labels.pod}} 链接alertmanager异常!"
- alert: PrometheusRuleFailures
expr: |
increase(prometheus_rule_evaluation_failures_total{instance="localhost:9090", job="prometheus"}[5m]) > 0
for: 5m
labels:
severity: critical
annotations:
description: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} 在5分钟执行失败的规则次数 {{ printf "%.0f" $value }}'
website.rules: |
groups:
- name: website.rules
rules:
- alert: "ssl证书过期警告"
expr: (probe_ssl_earliest_cert_expiry - time())/86400 <30
for: 1h
labels:
severity: warn
annotations:
description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
summary: "ssl证书过期警告"
- alert: blackbox_network_stats
expr: probe_success == 0
for: 1m
labels:
severity: critical
annotations:
summary: "接口/主机/端口/域名 {{ $labels.instance }} 不能访问"
description: "接口/主机/端口/域名 {{ $labels.instance }} 不能访问,请尽快检测!"
- alert: curlHttpStatus
expr: probe_http_status_code{job="blackbox-http"}>=422 and probe_success{job="blackbox-http"}==0
for: 1m
labels:
severity: critical
annotations:
summary: '业务报警: 网站不可访问'
description: '{{$labels.instance}} 不可访问,请及时查看,当前状态码为{{$value}}'
pod.rules: |
groups:
- name: pod.rules
rules:
- alert: PodCPUUsage
expr: |
sum(rate(container_cpu_usage_seconds_total{image!=""}[5m]) * 100) by (pod, namespace) > 80
for: 5m
labels:
severity: warning
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} CPU使用大于80% (当前值: {{ $value }})"
- alert: PodMemoryUsage
expr: |
sum(container_memory_rss{image!=""}) by(pod, namespace) / sum(container_spec_memory_limit_bytes{image!=""}) by(pod, namespace) * 100 != +inf > 80
for: 5m
labels:
severity: critical
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 内存使用大于80% (当前值: {{ $value }})"
- alert: KubeDeploymentError
expr: |
kube_deployment_spec_replicas{job="kubernetes-service-endpoints"} != kube_deployment_status_replicas_available{job="kubernetes-service-endpoints"}
for: 3m
labels:
severity: warning
annotations:
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }}控制器与实际数量不相符 (当前值: {{ $value }})"
- alert: coreDnsError
expr: |
kube_pod_container_status_running{container="coredns"} == 0
for: 1m
labels:
severity: critical
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} coreDns服务异常 (当前值: {{ $value }})"
- alert: kubeProxyError
expr: |
kube_pod_container_status_running{container="kube-proxy"} == 0
for: 1m
labels:
severity: critical
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} kube-proxy服务异常 (当前值: {{ $value }})"
- alert: filebeatError
expr: |
kube_pod_container_status_running{container="filebeat"} == 0
for: 1m
labels:
severity: critical
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} filebeat服务异常 (当前值: {{ $value }})"
- alert: PodNetworkReceive
expr: |
sum(rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace) > 30000
for: 5m
labels:
severity: warning
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 入口流量大于30MB/s (当前值: {{ $value }}K/s)"
- alert: PodNetworkTransmit
expr: |
sum(rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace) > 30000
for: 5m
labels:
severity: warning
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 出口流量大于30MB/s (当前值: {{ $value }}/K/s)"
- alert: PodRestart
expr: |
sum(changes(kube_pod_container_status_restarts_total[1m])) by (pod,namespace) > 1
for: 1m
labels:
severity: warning
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod重启 (当前值: {{ $value }})"
- alert: PodFailed
expr: |
sum(kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
for: 5s
labels:
severity: critical
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Failed (当前值: {{ $value }})"
- alert: PodPending
expr: |
sum(kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
for: 1m
labels:
severity: critical
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Pending (当前值: {{ $value }})"
- alert: PodErrImagePull
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
for: 1m
labels:
severity: warning
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态ErrImagePull (当前值: {{ $value }})"
- alert: PodImagePullBackOff
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
for: 1m
labels:
severity: warning
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态ImagePullBackOff (当前值: {{ $value }})"
- alert: PodCrashLoopBackOff
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
for: 1m
labels:
severity: warning
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态CrashLoopBackOff (当前值: {{ $value }})"
- alert: PodInvalidImageName
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="InvalidImageName"}) == 1
for: 1m
labels:
severity: warning
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态InvalidImageName (当前值: {{ $value }})"
- alert: PodCreateContainerConfigError
expr: |
sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"}) == 1
for: 1m
labels:
severity: warning
annotations:
description: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态CreateContainerConfigError (当前值: {{ $value }})"
volume.rules: |
groups:
- name: volume.rules
rules:
- alert: PersistentVolumeClaimLost
expr: |
sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Lost"}) == 1
for: 2m
labels:
severity: warning
annotations:
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PersistentVolumeClaimPendig
expr: |
sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Pendig"}) == 1
for: 2m
labels:
severity: warning
annotations:
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pendig\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PersistentVolume Failed
expr: |
sum(kube_persistentvolume_status_phase{phase="Failed",job="kubernetes-service-endpoints"}) by (persistentvolume) == 1
for: 2m
labels:
severity: warning
annotations:
description: "Persistent volume is failed state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PersistentVolume Pending
expr: |
sum(kube_persistentvolume_status_phase{phase="Pending",job="kubernetes-service-endpoints"}) by (persistentvolume) == 1
for: 2m
labels:
severity: warning
annotations:
description: "Persistent volume is pending state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!