I. Alert Notifications
1. Email notifications
- Configuration
# Configuration
### Config file: alertmanager.yml
/prometheus/alertmanager/alertmanager.yml
touch alertmanager.yml
global: # global options
resolve_timeout: 5m # resolution timeout, default 5m
smtp_smarthost: 'smtp.163.com:465'
smtp_from: 'hollysys_test@163.com'
smtp_auth_username: 'hollysys_test@163.com'
smtp_auth_password: 'XXXXXXXX' # SMTP authorization code: XXXXXXXX
smtp_require_tls: false
templates: # notification templates
- 'template/*.tmpl' # template path
route: # routing
group_by: ['alertname'] # labels to group alerts by
group_wait: 10s # how long to wait before sending a group's first notification
group_interval: 10s # how long to wait before sending notifications about new alerts added to a group
repeat_interval: 1h # how long to wait before re-sending a notification
receiver: 'mail' # default receiver
receivers: # alert receivers
- name: 'mail' # receiver name
email_configs:
- to: '' # recipient email address
html: '' # notification template
send_resolved: true
inhibit_rules: # alert inhibition
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
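After editing, the configuration can be validated and Alertmanager reloaded without a restart (amtool ships with Alertmanager; the file path is the one used above):
# validate the config file
amtool check-config /prometheus/alertmanager/alertmanager.yml
# hot-reload the running Alertmanager
curl -X POST http://localhost:9093/-/reload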
- Custom email template (see the sketch below)
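A minimal template sketch, assuming a file under template/ that matches the templates glob above; the define name email.to.html is an arbitrary choice and must match whatever is referenced in the html field of email_configs:
# template/email.tmpl (illustrative sketch)
{{ define "email.to.html" }}
{{ range .Alerts }}
<p>Status: {{ .Status }}</p>
<p>Alert: {{ .Labels.alertname }}</p>
<p>Instance: {{ .Labels.instance }}</p>
<p>Summary: {{ .Annotations.summary }}</p>
<p>Description: {{ .Annotations.description }}</p>
<p>Started: {{ .StartsAt }}</p>
{{ end }}
{{ end }}
# then reference it in email_configs:
# html: '{{ template "email.to.html" . }}'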
2. DingTalk notifications
2.1. Get the DingTalk robot webhook
- Configuration
2.2. prometheus-webhook-dingtalk
# prometheus-webhook-dingtalk
# timonwong/prometheus-webhook-dingtalk
https://github.com/timonwong/prometheus-webhook-dingtalk
https://hub.docker.com/r/timonwong/prometheus-webhook-dingtalk
# Webhook
https://oapi.dingtalk.com/robot/send?access_token=d8e443b4fe8512dba6c764afad94bd361fbf71c6f612c8de3bcf88d8ae545ed53
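To verify the robot before wiring it into Alertmanager, a test message can be posted straight to the webhook (text message format of the DingTalk custom-robot API; if the robot has a keyword security setting, the content must contain that keyword):
curl -H 'Content-Type: application/json' \
  -d '{"msgtype": "text", "text": {"content": "monitor: webhook test"}}' \
  'https://oapi.dingtalk.com/robot/send?access_token=<your_access_token>'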
# pull the image
docker pull timonwong/prometheus-webhook-dingtalk
# run the container
docker run -d -p 8060:8060 --name webhook-dingding timonwong/prometheus-webhook-dingtalk:latest \
--ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=d8e443b4fe8512dba6c764afad94bd361fbf71c6f612c8de3bcf88d8ae545ed53"
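Once the container is running, Alertmanager can post to http://<host>:8060/dingtalk/webhook1/send. A rough smoke test with a minimal Alertmanager-style body is sketched below (field names follow the webhook payload shown in section 3; whether such a stripped-down payload renders depends on the template, so treat it as an assumption):
curl -H 'Content-Type: application/json' \
  -d '{"version": "4", "status": "firing", "alerts": [{"status": "firing", "labels": {"alertname": "test"}, "annotations": {"summary": "test alert"}}]}' \
  'http://127.0.0.1:8060/dingtalk/webhook1/send'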
2.3. Configuration
# Configuration
### Config file: alertmanager.yml
/prometheus/alertmanager/alertmanager.yml
touch alertmanager.yml
global: # global options
resolve_timeout: 5m # resolution timeout, default 5m
route: # routing
receiver: webhook
group_by: ['alertname'] # labels to group alerts by
group_wait: 10s # how long to wait before sending a group's first notification
group_interval: 10s # how long to wait before sending notifications about new alerts added to a group
repeat_interval: 1h # how long to wait before re-sending a notification
routes:
- receiver: webhook
group_wait: 10s
receivers:
- name: webhook
webhook_configs:
- url: http://172.17.88.22:8060/dingtalk/webhook1/send
send_resolved: true
2.4. Custom template (a sketch follows)
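A minimal custom-template sketch, assuming a legacy prometheus-webhook-dingtalk release whose default template defines ding.link.title and ding.link.content (check the default.tmpl shipped with your version; newer 1.x releases configure templates through config.yml, so the names and flags below are assumptions):
# template/dingtalk.tmpl (illustrative sketch)
{{ define "ding.link.title" }}[{{ .Status }}] {{ .GroupLabels.alertname }}{{ end }}
{{ define "ding.link.content" }}
{{ range .Alerts }}
- Alert: {{ .Labels.alertname }}
  Instance: {{ .Labels.instance }}
  Summary: {{ .Annotations.summary }}
  Description: {{ .Annotations.description }}
{{ end }}
{{ end }}
# mount it into the container and point the exporter at it, e.g.:
# docker run ... -v /prometheus/alertmanager/template:/tmp/template \
#   timonwong/prometheus-webhook-dingtalk:latest \
#   --ding.profile="webhook1=..." --template.file=/tmp/template/dingtalk.tmpl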
3. Custom webhook
- Configuration
# Configuration
### Config file: alertmanager.yml
/prometheus/alertmanager/alertmanager.yml
touch alertmanager.yml
global: # global options
resolve_timeout: 5m # resolution timeout, default 5m
route: # routing
group_by: ['alertname'] # labels to group alerts by
group_wait: 10s # how long to wait before sending a group's first notification
group_interval: 10s # how long to wait before sending notifications about new alerts added to a group
repeat_interval: 1h # how long to wait before re-sending a notification
receiver: webhook
receivers:
- name: webhook
webhook_configs:
- url: http://172.17.88.22:8888/monitor
- Custom endpoint
# AlertController
@Slf4j
@Controller
@RequestMapping("/")
public class AlertController {
    // Receives the Alertmanager webhook POST and logs the raw JSON payload
    @RequestMapping(value = "/monitor", produces = "application/json;charset=UTF-8")
    @ResponseBody
    public String monitor(@RequestBody String json) {
        log.info("alert notify params: {}", json);
        // Build a simple JSON response for Alertmanager
        Map<String, Object> result = new HashMap<>();
        result.put("msg", "alert failed");
        result.put("code", 0);
        return JSON.toJSONString(result);
    }
}
# Endpoint URL
url: http://172.17.88.22:8888/monitor
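To work with the payload as an object rather than a raw string, the request body can be bound to a small DTO; a minimal sketch using fastjson (matching JSON.toJSONString above) and Lombok, with class and field names mirroring the payload shown below:
// AlertNotification.java (illustrative sketch)
@Data
public class AlertNotification {
    private String receiver;
    private String status;                      // "firing" or "resolved"
    private List<Alert> alerts;
    private Map<String, String> groupLabels;
    private Map<String, String> commonLabels;
    private Map<String, String> commonAnnotations;
    private String externalURL;

    @Data
    public static class Alert {
        private String status;
        private Map<String, String> labels;
        private Map<String, String> annotations;
        private String startsAt;
        private String endsAt;
        private String generatorURL;
        private String fingerprint;
    }
}
// in the controller:
// AlertNotification n = JSON.parseObject(json, AlertNotification.class);
// n.getAlerts().forEach(a -> log.info("{} {}", a.getStatus(), a.getAnnotations().get("summary")));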
# Alert payload data
# Firing alert payload:
{
"receiver": "webhook",
"status": "firing",
"alerts": [{
"status": "firing",
"labels": {
"alertname": "node-up",
"instance": "node-3",
"job": "centos-3",
"severity": "1",
"team": "node"
},
"annotations": {
"description": "node-3 检测到异常停止!请重点关注!!!",
"summary": "node-3 已停止运行! alert"
},
"startsAt": "2020-08-20T07:09:35.987923059Z",
"endsAt": "0001-01-01T00:00:00Z",
"generatorURL": "http://test-1:9090/graph?g0.expr=up%7Bjob%3D%22centos-3%22%7D+%3D%3D+0\u0026g0.tab=1",
"fingerprint": "d0412b7cebb1a039"
}],
"groupLabels": {
"alertname": "node-up"
},
"commonLabels": {
"alertname": "node-up",
"instance": "node-3",
"job": "centos-3",
"severity": "1",
"team": "node"
},
"commonAnnotations": {
"description": "node-3 检测到异常停止!请重点关注!!!",
"summary": "node-3 已停止运行! alert"
},
"externalURL": "http://test-1:9093",
"version": "4",
"groupKey": "{}:{alertname=\"node-up\"}",
"truncatedAlerts": 0
}
# Resolved alert payload:
{
"receiver": "webhook",
"status": "resolved",
"alerts": [{
"status": "resolved",
"labels": {
"alertname": "node-up",
"instance": "node-3",
"job": "centos-3",
"severity": "1",
"team": "node"
},
"annotations": {
"description": "node-3 检测到异常停止!请重点关注!!!",
"summary": "node-3 已停止运行! alert"
},
"startsAt": "2020-08-20T07:09:35.987923059Z",
"endsAt": "2020-08-20T07:14:05.987923059Z",
"generatorURL": "http://test-1:9090/graph?g0.expr=up%7Bjob%3D%22centos-3%22%7D+%3D%3D+0\u0026g0.tab=1",
"fingerprint": "d0412b7cebb1a039"
}],
"groupLabels": {
"alertname": "node-up"
},
"commonLabels": {
"alertname": "node-up",
"instance": "node-3",
"job": "centos-3",
"severity": "1",
"team": "node"
},
"commonAnnotations": {
"description": "node-3 检测到异常停止!请重点关注!!!",
"summary": "node-3 已停止运行! alert"
},
"externalURL": "http://test-1:9093",
"version": "4",
"groupKey": "{}:{alertname=\"node-up\"}",
"truncatedAlerts": 0
}
II. Alert Rules
1. Prometheus
# File
prometheus.yml
groups:
- name: Prometheus # alert rule group name
rules:
- alert: PrometheusTargetMissing
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Prometheus target missing (instance {{ $labels.instance }})"
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAllTargetsMissing
expr: count by (job) (up) == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Prometheus all targets missing (instance {{ $labels.instance }})"
description: "A Prometheus job does not have a living target anymore.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 1m
labels:
severity: critical
annotations:
summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 1m
labels:
severity: critical
annotations:
summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
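For these groups to actually fire, the Prometheus server must load the rule files and know where Alertmanager lives; a minimal fragment (file paths and the Alertmanager address are illustrative and should match your layout):
# prometheus.yml (fragment)
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']
rule_files:
  - 'rules/prometheus.yml'
  - 'rules/node-exporter.yml'
  - 'rules/cadvisor.yml'
  # ...one entry per rule file listed in the sections below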
2. Linux
# File
node-exporter.yml
groups:
- name: CentOS # alert rule group name
rules:
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of memory (instance {{ $labels.instance }})"
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of disk space (instance {{ $labels.instance }})"
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostDiskWillFillIn4Hours
expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of inodes (instance {{ $labels.instance }})"
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual disk read latency (instance {{ $labels.instance }})"
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual disk write latency (instance {{ $labels.instance }})"
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host high CPU load (instance {{ $labels.instance }})"
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# 1000 context switches is an arbitrary number.
# Alert threshold depends on nature of application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitching
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Host context switching (instance {{ $labels.instance }})"
description: "Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host swap is filling up (instance {{ $labels.instance }})"
description: "Swap is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Host SystemD service crashed (instance {{ $labels.instance }})"
description: "SystemD service crashed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: "Host physical component too hot (instance {{ $labels.instance }})"
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: node_hwmon_temp_alarm == 1
for: 5m
labels:
severity: critical
annotations:
summary: "Host node overtemperature alarm (instance {{ $labels.instance }})"
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Host RAID array got inactive (instance {{ $labels.instance }})"
description: "RAID array is in degraded state due to one or more disk failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostRaidDiskFailure
expr: node_md_disks{state="fail"} > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host RAID disk failure (instance {{ $labels.instance }})"
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "Host kernel version deviations (instance {{ $labels.instance }})"
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host OOM kill detected (instance {{ $labels.instance }})"
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[5m]) > 0
for: 5m
labels:
severity: info
annotations:
summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})"
description: "{{ $labels.instance }} has had correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})"
description: "{{ $labels.instance }} has had uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: increase(node_network_receive_errs_total[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host Network Receive Errors (instance {{ $labels.instance }})"
description: "{{ $labels.instance }} interface has encountered receive errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: increase(node_network_transmit_errs_total[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host Network Transmit Errors (instance {{ $labels.instance }})"
description: "{{ $labels.instance }} interface has encountered transmit errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
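The disk-space and inode rules above filter on mountpoint="/rootfs", which assumes node_exporter runs in a container with the host filesystem mounted at /rootfs; on a plain host install the root filesystem is usually labelled "/", for example:
# host-install variant of HostOutOfDiskSpace (illustrative)
(node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < 10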
3. Docker
# File
cadvisor.yml
groups:
- name: Docker # alert rule group name
rules:
- alert: ContainerKilled
expr: time() - container_last_seen > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Container killed (instance {{ $labels.instance }})"
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerCpuUsage
expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container CPU usage (instance {{ $labels.instance }})"
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerMemoryUsage
expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container Memory usage (instance {{ $labels.instance }})"
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerVolumeUsage
expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container Volume usage (instance {{ $labels.instance }})"
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerVolumeIoUsage
expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container Volume IO usage (instance {{ $labels.instance }})"
description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerHighThrottleRate
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "Container high throttle rate (instance {{ $labels.instance }})"
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
4. Nginx
# File
nginx-vts-exporter.yml
groups:
- name: Nginx # alert rule group name
rules:
- alert: NginxHighHttp4xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 5m
labels:
severity: critical
annotations:
summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: NginxHighHttp5xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 5m
labels:
severity: critical
annotations:
summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: NginxLatencyHigh
expr: histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[30m])) by (host, node)) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Nginx latency high (instance {{ $labels.instance }})"
description: "Nginx p99 latency is higher than 10 seconds\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
5. Redis
# File
redis-exporter.yml
groups:
- name: Redis # alert rule group name
rules:
- alert: RedisDown
expr: redis_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Redis down (instance {{ $labels.instance }})"
description: "Redis instance is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisMissingMaster
expr: count(redis_instance_info{role="master"}) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Redis missing master (instance {{ $labels.instance }})"
description: "Redis cluster has no node marked as master.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisTooManyMasters
expr: count(redis_instance_info{role="master"}) > 1
for: 5m
labels:
severity: critical
annotations:
summary: "Redis too many masters (instance {{ $labels.instance }})"
description: "Redis cluster has too many nodes marked as master.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisDisconnectedSlaves
expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1
for: 5m
labels:
severity: critical
annotations:
summary: "Redis disconnected slaves (instance {{ $labels.instance }})"
description: "Redis not replicating for all slaves. Consider reviewing the redis replication status.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisReplicationBroken
expr: delta(redis_connected_slaves[1m]) < 0
for: 5m
labels:
severity: critical
annotations:
summary: "Redis replication broken (instance {{ $labels.instance }})"
description: "Redis instance lost a slave\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisClusterFlapping
expr: changes(redis_connected_slaves[5m]) > 2
for: 5m
labels:
severity: critical
annotations:
summary: "Redis cluster flapping (instance {{ $labels.instance }})"
description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisMissingBackup
expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
for: 5m
labels:
severity: critical
annotations:
summary: "Redis missing backup (instance {{ $labels.instance }})"
description: "Redis has not been backed up for 24 hours\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisOutOfMemory
expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Redis out of memory (instance {{ $labels.instance }})"
description: "Redis is running out of memory (> 90%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisTooManyConnections
expr: redis_connected_clients > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Redis too many connections (instance {{ $labels.instance }})"
description: "Redis instance has too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisNotEnoughConnections
expr: redis_connected_clients < 5
for: 5m
labels:
severity: warning
annotations:
summary: "Redis not enough connections (instance {{ $labels.instance }})"
description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisRejectedConnections
expr: increase(redis_rejected_connections_total[1m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Redis rejected connections (instance {{ $labels.instance }})"
description: "Some connections to Redis have been rejected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
6. PostgreSQL
# File
postgres-exporter.yml
groups:
- name: PostgreSQL # alert rule group name
rules:
- alert: PostgresqlDown
expr: pg_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql down (instance {{ $labels.instance }})"
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlRestarted
expr: time() - pg_postmaster_start_time_seconds < 60
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql restarted (instance {{ $labels.instance }})"
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlExporterError
expr: pg_exporter_last_scrape_error > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql exporter error (instance {{ $labels.instance }})"
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlReplicationLag
expr: (pg_replication_lag) > 10 and ON(instance) (pg_replication_is_replica == 1)
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql replication lag (instance {{ $labels.instance }})"
description: "PostgreSQL replication lag is going up (> 10s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlTableNotVaccumed
expr: time() - pg_stat_user_tables_last_autovacuum > 60 * 60 * 24
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql table not vacuumed (instance {{ $labels.instance }})"
description: "Table has not been vacuumed for 24 hours\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlTableNotAnalyzed
expr: time() - pg_stat_user_tables_last_autoanalyze > 60 * 60 * 24
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql table not analyzed (instance {{ $labels.instance }})"
description: "Table has not been analyzed for 24 hours\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlTooManyConnections
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.9
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql too many connections (instance {{ $labels.instance }})"
description: "PostgreSQL instance has too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlNotEnoughConnections
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql not enough connections (instance {{ $labels.instance }})"
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlDeadLocks
expr: rate(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql dead locks (instance {{ $labels.instance }})"
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlSlowQueries
expr: pg_slow_queries > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql slow queries (instance {{ $labels.instance }})"
description: "PostgreSQL executes slow queries\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlHighRollbackRate
expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql high rollback rate (instance {{ $labels.instance }})"
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlCommitRateLow
expr: rate(pg_stat_database_xact_commit[1m]) < 10
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql commit rate low (instance {{ $labels.instance }})"
description: "Postgres seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlLowXidConsumption
expr: rate(pg_txid_current[1m]) < 5
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql low XID consumption (instance {{ $labels.instance }})"
description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqllowXlogConsumption
expr: rate(pg_xlog_position_bytes[1m]) < 100
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql low XLOG consumption (instance {{ $labels.instance }})"
description: "Postgres seems to be consuming XLOG very slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlWaleReplicationStopped
expr: rate(pg_xlog_position_bytes[1m]) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql WALE replication stopped (instance {{ $labels.instance }})"
description: "WAL-E replication seems to be stopped\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlHighRateStatementTimeout
expr: rate(postgresql_errors_total{type="statement_timeout"}[5m]) > 3
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql high rate statement timeout (instance {{ $labels.instance }})"
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlHighRateDeadlock
expr: rate(postgresql_errors_total{type="deadlock_detected"}[1m]) * 60 > 1
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql high rate deadlock (instance {{ $labels.instance }})"
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlReplicationLabBytes
expr: (pg_xlog_position_bytes and pg_replication_is_replica == 0) - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1) > 1e+09
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql replication lag bytes (instance {{ $labels.instance }})"
description: "Postgres Replication lag (in bytes) is high\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlUnusedReplicationSlot
expr: pg_replication_slots_active == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql unused replication slot (instance {{ $labels.instance }})"
description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlTooManyDeadTuples
expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql too many dead tuples (instance {{ $labels.instance }})"
description: "PostgreSQL number of dead tuples is too large\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlSplitBrain
expr: count(pg_replication_is_replica == 0) != 1
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql split brain (instance {{ $labels.instance }})"
description: "Split Brain, too many primary Postgresql databases in read-write mode\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlPromotedNode
expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql promoted node (instance {{ $labels.instance }})"
description: "Postgresql standby server has been promoted as primary node\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlConfigurationChanged
expr: {__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql configuration changed (instance {{ $labels.instance }})"
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlSslCompressionActive
expr: sum(pg_stat_ssl_compression) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql SSL compression active (instance {{ $labels.instance }})"
description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PostgresqlTooManyLocksAcquired
expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql too many locks acquired (instance {{ $labels.instance }})"
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
7. MySQL
# File
mysqld-exporter.yml
groups:
- name: MySQL # alert rule group name
rules:
- alert: MysqlDown
expr: mysql_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "MySQL down (instance {{ $labels.instance }})"
description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: MysqlTooManyConnections
expr: avg by (instance) (max_over_time(mysql_global_status_threads_connected[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "MySQL too many connections (instance {{ $labels.instance }})"
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: MysqlHighThreadsRunning
expr: avg by (instance) (max_over_time(mysql_global_status_threads_running[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60
for: 5m
labels:
severity: warning
annotations:
summary: "MySQL high threads running (instance {{ $labels.instance }})"
description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: MysqlSlaveIoThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
for: 5m
labels:
severity: critical
annotations:
summary: "MySQL Slave IO thread not running (instance {{ $labels.instance }})"
description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: MysqlSlaveSqlThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
for: 5m
labels:
severity: critical
annotations:
summary: "MySQL Slave SQL thread not running (instance {{ $labels.instance }})"
description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: MysqlSlaveReplicationLag
expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 300
for: 5m
labels:
severity: warning
annotations:
summary: "MySQL Slave replication lag (instance {{ $labels.instance }})"
description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: MysqlSlowQueries
expr: mysql_global_status_slow_queries > 0
for: 5m
labels:
severity: warning
annotations:
summary: "MySQL slow queries (instance {{ $labels.instance }})"
description: "MySQL server is having some slow queries.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: MysqlRestarted
expr: mysql_global_status_uptime < 60
for: 5m
labels:
severity: warning
annotations:
summary: "MySQL restarted (instance {{ $labels.instance }})"
description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
8. RabbitMQ
# File
rabbitmq-exporter.yml
groups:
- name: Rabbitmq # alert rule group name
rules:
- alert: RabbitmqDown
expr: rabbitmq_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Rabbitmq down (instance {{ $labels.instance }})"
description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqClusterDown
expr: sum(rabbitmq_running) < 3
for: 5m
labels:
severity: critical
annotations:
summary: "Rabbitmq cluster down (instance {{ $labels.instance }})"
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqClusterPartition
expr: rabbitmq_partitions > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Rabbitmq cluster partition (instance {{ $labels.instance }})"
description: "Cluster partition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqOutOfMemory
expr: rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Rabbitmq out of memory (instance {{ $labels.instance }})"
description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqTooManyConnections
expr: rabbitmq_connectionsTotal > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Rabbitmq too many connections (instance {{ $labels.instance }})"
description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqDeadLetterQueueFillingUp
expr: rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10
for: 5m
labels:
severity: critical
annotations:
summary: "Rabbitmq dead letter queue filling up (instance {{ $labels.instance }})"
description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqTooManyMessagesInQueue
expr: rabbitmq_queue_messages_ready{queue="my-queue"} > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Rabbitmq too many messages in queue (instance {{ $labels.instance }})"
description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqSlowQueueConsuming
expr: time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Rabbitmq slow queue consuming (instance {{ $labels.instance }})"
description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqNoConsumer
expr: rabbitmq_queue_consumers == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Rabbitmq no consumer (instance {{ $labels.instance }})"
description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqTooManyConsumers
expr: rabbitmq_queue_consumers > 1
for: 5m
labels:
severity: critical
annotations:
summary: "Rabbitmq too many consumers (instance {{ $labels.instance }})"
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RabbitmqUnactiveExchange
expr: rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5
for: 5m
labels:
severity: warning
annotations:
summary: "Rabbitmq unactive exchange (instance {{ $labels.instance }})"
description: "Exchange receives less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
9. JVM
# File
jvm.yml
groups:
- name: JVM # alert rule group name
rules:
- alert: JvmMemoryFillingUp
expr: jvm_memory_bytes_used / jvm_memory_bytes_max{area="heap"} > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: "JVM memory filling up (instance {{ $labels.instance }})"
description: "JVM memory is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
10. Elasticsearch
# File
elasticsearch-exporter.yml
groups:
- name: Elasticsearch # alert rule group name
rules:
- alert: ElasticsearchHeapUsageTooHigh
expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
for: 5m
labels:
severity: critical
annotations:
summary: "Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})"
description: "The heap usage is over 90% for 5m\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchHeapUsageWarning
expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Elasticsearch Heap Usage warning (instance {{ $labels.instance }})"
description: "The heap usage is over 80% for 5m\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchDiskSpaceLow
expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20
for: 5m
labels:
severity: warning
annotations:
summary: "Elasticsearch disk space low (instance {{ $labels.instance }})"
description: "The disk usage is over 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchDiskOutOfSpace
expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10
for: 5m
labels:
severity: critical
annotations:
summary: "Elasticsearch disk out of space (instance {{ $labels.instance }})"
description: "The disk usage is over 90%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchClusterRed
expr: elasticsearch_cluster_health_status{color="red"} == 1
for: 5m
labels:
severity: critical
annotations:
summary: "Elasticsearch Cluster Red (instance {{ $labels.instance }})"
description: "Elastic Cluster Red status\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchClusterYellow
expr: elasticsearch_cluster_health_status{color="yellow"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Elasticsearch Cluster Yellow (instance {{ $labels.instance }})"
description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchHealthyNodes
expr: elasticsearch_cluster_health_number_of_nodes < number_of_nodes
for: 5m
labels:
severity: critical
annotations:
summary: "Elasticsearch Healthy Nodes (instance {{ $labels.instance }})"
description: "Number of healthy nodes is less than number_of_nodes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchHealthyDataNodes
expr: elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes
for: 5m
labels:
severity: critical
annotations:
summary: "Elasticsearch Healthy Data Nodes (instance {{ $labels.instance }})"
description: "Number of healthy data nodes is less than number_of_data_nodes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchRelocationShards
expr: elasticsearch_cluster_health_relocating_shards > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Elasticsearch relocation shards (instance {{ $labels.instance }})"
description: "Number of relocation shards for 20 min\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchInitializingShards
expr: elasticsearch_cluster_health_initializing_shards > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Elasticsearch initializing shards (instance {{ $labels.instance }})"
description: "Number of initializing shards for 10 min\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchUnassignedShards
expr: elasticsearch_cluster_health_unassigned_shards > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Elasticsearch unassigned shards (instance {{ $labels.instance }})"
description: "Number of unassigned shards for 2 min\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchPendingTasks
expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Elasticsearch pending tasks (instance {{ $labels.instance }})"
description: "Number of pending tasks for 10 min. Cluster works slowly.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ElasticsearchNoNewDocuments
expr: rate(elasticsearch_indices_docs{es_data_node="true"}[10m]) < 1
for: 5m
labels:
severity: warning
annotations:
summary: "Elasticsearch no new documents (instance {{ $labels.instance }})"
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
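Rule files can be checked before reloading; promtool ships with Prometheus, and the reload endpoint requires the server to be started with --web.enable-lifecycle (the file path is illustrative):
# validate a rule file
promtool check rules rules/node-exporter.yml
# reload the running Prometheus
curl -X POST http://localhost:9090/-/reload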