level=warn ts=2021-06-25T02:52:34.335650745Z caller=manager.go:339 component=“rule manager” group=alert.rules msg=“Evaluating rule failed” rule=“alert: DM_binlog_file_gap_between_relay_syncer (阿里云p)\ expr: dm_relay_binlog_file{node=“relay”} - on(instance, job) dm_syncer_binlog_file{node=“syncer”}\ > 1\ for: 10m\ labels:\ env: production-cluster\ expr: dm_relay_binlog_file{node=“relay”} - ON(instance, job) dm_syncer_binlog_file{node=“syncer”}\ > 1\ level: critical\ annotations:\ description: ‘cluster: production-cluster, instance: {{ $labels.instance }}, task:\ {{ $labels.task }}, values: {{ $value }}’\ summary: dm syncer binlog file not catch up relay exceed 10 min (阿里云p)\ value: ‘{{ $value }}’\ ” err=“many-to-many matching not allowed: matching labels must be unique on one side”
# Prometheus alerting rules for the DM relay component (test cluster).
# NOTE(review): reconstructed from a paste that had lost its indentation,
# had ASCII quotes replaced by typographic quotes, and had the `*`
# operators stripped from the size threshold — confirm against the
# deployed rule file before reloading Prometheus.
groups:
  - name: alert.rules
    rules:
      # Fires when the available relay-log space drops below
      # 10*1024*1024*1024 (presumably bytes, i.e. 10 GiB — TODO confirm unit).
      - alert: DM_remain_storage_of_relay_log (测试环境)
        expr: 'dm_relay_space{type="available"} < 10*1024*1024*1024'
        labels:
          env: test-cluster
          level: critical
          # Copy of the rule expression carried as a label; keep it in sync
          # with `expr` above.
          expr: 'dm_relay_space{type="available"} < 10*1024*1024*1024'
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM remain storage of relay log

      # Fires when dm_relay_exit_with_error_count changed within the last
      # 5 minutes.
      - alert: DM_relay_process_exits_with_error (测试环境)
        expr: 'changes(dm_relay_exit_with_error_count[5m]) > 0'
        labels:
          env: test-cluster
          level: critical
          # Copy of the rule expression carried as a label; keep it in sync
          # with `expr` above.
          expr: 'changes(dm_relay_exit_with_error_count[5m]) > 0'
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM relay process exits with error
-
部分告警规则
- 集群是用什么方式部署的?是 TiUP 吗?版本号是多少?
- 修改的是哪个目录下的哪个文件?
- 具体的操作过程是什么?是否对某个组件执行了 reload 或重启?
