y75.第四章 Prometheus大厂监控体系及实战 -- prometheus报警设置(六)
发布时间
阅读量:
阅读量
6.prometheus报警设置
6.1 prometheus报警设置
通过Prometheus平台触发的一条告警流程如下所示:
当Prometheus检测到系统达到阈值时,在超过持续时间后会通过AlertManager将该告警分组/抑制/静默地发送至指定的媒体类型(如邮件、钉钉或微信等)。
分组(group): 将类似性质的警报合并为单个通知,比如网络通知、主机通知、服务通知。
静默(silences): 是一种简单的特定时间静音的机制,例如:服务器要升级维护可以先设置这个时间段告警静默。
抑制(inhibition): 当警报发出后,停止重复发送由此警报引发的其他警报,即合并由同一个故障引起的多个报警事件,可以消除冗余告警。
AI助手

6.2 下载并启动报警组件alertmanager
https://prometheus.io/download/#alertmanager
root@node2:~# cd /apps/
root@node2:/apps# wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
root@node2:/apps# tar xf alertmanager-0.24.0.linux-amd64.tar.gz
root@node2:/apps# ln -sv /apps/alertmanager-0.24.0.linux-amd64 /apps/alertmanager
'/apps/alertmanager' -> '/apps/alertmanager-0.24.0.linux-amd64'
root@node2:/apps# vim /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus Alertmanager
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/apps/alertmanager
ExecStart=/apps/alertmanager/alertmanager
[Install]
WantedBy=multi-user.target
root@node2:/apps# systemctl enable --now alertmanager
AI助手
6.3 配置alertmanager
https://prometheus.io/docs/alerting/configuration/ #官方配置文档
global:
smtp_from: #发件人邮箱地址
smtp_smarthost: #邮箱smtp地址。
smtp_auth_username: #发件人的登陆用户名,默认和发件人地址一致。
smtp_auth_password: #发件人的登陆密码,有时候是授权码。
smtp_require_tls: #是否需要tls协议。默认是true。
wechat_api_url: #企业微信API地址。
wechat_api_secret: #企业微信API secret。
wechat_api_corp_id: #企业微信corp id信息。
resolve_timeout: #在指定时间内没有产生新的事件就发送恢复通知
AI助手
6.3.1 配置示例
root@node2:/apps# cd alertmanager
root@node2:/apps/alertmanager# cat alertmanager.yml
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.qq.com:465'
smtp_from: '88563128@qq.com'
smtp_auth_username: '88563128@qq.com'
smtp_auth_password: 'xxxx' #邮箱授权码
smtp_hello: '@qq.com'
smtp_require_tls: false
route: #route用来设置报警的分发策略
group_by: ['alertname'] #采用哪个标签来作为分组依据
group_wait: 10s #组告警等待时间。也就是告警产生后等待10s,如果有同组告警一起发出
group_interval: 10s #同一分组已发送过通知后,组内出现新告警时,再次发送通知前的等待时间
repeat_interval: 2m #重复告警的间隔时间,减少相同邮件的发送频率
receiver: 'web.hook' #设置接收人
receivers:
- name: 'web.hook'
#webhook_configs:
#- url: 'http://127.0.0.1:5001/'
email_configs:
- to: '88563128@qq.com'
inhibit_rules: #抑制的规则
- source_match: #源告警匹配条件:当匹配该条件的告警触发并发出通知后,匹配target_match的其他告警通知将被抑制
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
root@node2:/apps/alertmanager# ./amtool check-config alertmanager.yml
Checking 'alertmanager.yml' SUCCESS
Found:
- global config
- route
- 1 inhibit rules
- 1 receivers
- 0 templates
AI助手
6.3.2 重启alertmanager服务并验证
root@node2:/apps/alertmanager# systemctl restart alertmanager
#验证alertmanager的9093端口已经监听
root@node2:/apps/alertmanager# lsof -i:9093
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
alertmana 3546 root 8u IPv6 50436 0t0 TCP *:9093 (LISTEN)
AI助手

6.3.3 配置prometheus报警规则
root@prometheus1:/apps/prometheus# mkdir roles
root@prometheus1:/apps/prometheus# vim prometheus.yml
...
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- 172.31.2.182:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "/apps/prometheus/roles/*.yml" #指定规则文件
AI助手
6.3.4 创建报警规则文件
root@prometheus1:/apps/prometheus# vim roles/roles.yml
groups:
- name: alertmanager_pod.rules
rules:
- alert: Pod_all_cpu_usage
expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
for: 2m
labels:
severity: critical
service: pods
annotations:
description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }})
summary: Dev CPU 负载告警
- alert: Pod_all_memory_usage
#expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10 #内存大于10%
expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2147483648 #内存大于2G(2*1024*1024*1024)
for: 2m
labels:
severity: critical
annotations:
#description: 容器 {{ $labels.name }} Memory 资源利用率大于 10% , (current value is {{ $value }})
description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
summary: Dev Memory 负载告警
- alert: Pod_all_network_receive_usage
expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 52428800 #网络利用率大于50m(50*1024*1024)
for: 2m
labels:
severity: critical
annotations:
description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M ,(current value is {{ $value }})
- alert: node内存可用大小
expr: node_memory_MemFree_bytes < 4294967296 #node内存小于4G(4*1024*1024*1024)
for: 2m
labels:
severity: critical
annotations:
description: node可用内存小于4G
AI助手
6.3.5 报警规则验证
#验证报警规则设置:
root@prometheus1:/apps/prometheus# ./promtool check rules roles/roles.yml
Checking roles/roles.yml
SUCCESS: 4 rules found
AI助手
6.3.6 重启prometheus
root@prometheus1:/apps/prometheus# systemctl restart prometheus
AI助手
6.3.7 验证报警规则匹配
root@node2:/apps/alertmanager# ./amtool alert --alertmanager.url=http://172.31.2.182:9093
Alertname Starts At Summary State
node内存可用大小 2022-05-27 14:47:17 UTC active
node内存可用大小 2022-05-27 14:47:17 UTC active
node内存可用大小 2022-05-27 14:47:17 UTC active
node内存可用大小 2022-05-27 14:47:17 UTC active
node内存可用大小 2022-05-27 14:47:17 UTC active
node内存可用大小 2022-05-27 14:47:17 UTC active
AI助手
6.3.8 prometheus首页状态
prometheus报警状态
· Inactive:没有异常。
· Pending:已触发阈值,但未满足告警持续时间(即rule中的for字段)
· Firing:已触发阈值且满足条件并发送至alertmanager
AI助手

6.3.9 prometheus web界面验证报警规则
http://172.31.2.101:9090/rules

6.3.10 Alertmanager界面验证报警信息

6.3.11 验证收到的报警邮件

全部评论 (0)
还没有任何评论哟~
