【Problem Encountered】While reproducing https://asktug.com/t/topic/573034 by following https://docs.pingcap.com/zh/tidb/stable/online-unsafe-recovery, unsafe remove-failed-stores was not triggered.
【TiDB Version】v5.4.0
【TiDB Environment】Test environment, CentOS 7 VMs
Topology file:
global:
  user: tidb
  ssh_port: 22
  deploy_dir: /data/tidb-deploy
  data_dir: /data/tidb-data/
  os: linux
  arch: amd64
monitored:
  node_exporter_port: 39100
  blackbox_exporter_port: 39115
  deploy_dir: /data/tidb-deploy/monitor-39100
  data_dir: /data/tidb-data/monitor_data
  log_dir: /data/tidb-deploy/monitor-39100/log
server_configs:
  tidb:
    oom-use-tmp-storage: true
    performance.max-procs: 0
    performance.txn-total-size-limit: 2097152
    prepared-plan-cache.enabled: true
    tikv-client.copr-cache.capacity-mb: 128.0
    tikv-client.max-batch-wait-time: 0
    tmp-storage-path: /data/tidb-data/tmp_oom
    split-table: true
  tikv:
    coprocessor.split-region-on-table: true
    readpool.coprocessor.use-unified-pool: true
    readpool.storage.use-unified-pool: false
    server.grpc-compression-type: none
    storage.block-cache.shared: true
  pd:
    enable-cross-table-merge: false
    replication.enable-placement-rules: true
    schedule.leader-schedule-limit: 4
    schedule.region-schedule-limit: 2048
    schedule.replica-schedule-limit: 64
    replication.location-labels: ["dc","logic","rack","host"]
  tiflash: {}
  tiflash-learner: {}
  pump: {}
  drainer: {}
  cdc: {}
tidb_servers:
  - host: 192.168.8.11
    ssh_port: 22
    port: 4000
    status_port: 10080
    deploy_dir: /data/tidb-deploy/tidb-4000
  - host: 192.168.8.12
    ssh_port: 22
    port: 4000
    status_port: 10080
    deploy_dir: /data/tidb-deploy/tidb-4000
  - host: 192.168.8.13
    ssh_port: 22
    port: 4000
    status_port: 10080
    deploy_dir: /data/tidb-deploy/tidb-4000
tikv_servers:
  - host: 192.168.8.11
    ssh_port: 22
    port: 20160
    status_port: 20180
    deploy_dir: /data/tidb-deploy/tikv-20160
    data_dir: /data/tidb-data/tikv_data
    config:
      server.labels: { dc: "dc1", logic: "logic1", rack: "r1", host: "192_168_8_11" }
  - host: 192.168.8.12
    ssh_port: 22
    port: 20160
    status_port: 20180
    deploy_dir: /data/tidb-deploy/tikv-20160
    data_dir: /data/tidb-data/tikv_data
    config:
      server.labels: { dc: "dc1", logic: "logic2", rack: "r1", host: "192_168_8_12" }
  - host: 192.168.8.13
    ssh_port: 22
    port: 20160
    status_port: 20180
    deploy_dir: /data/tidb-deploy/tikv-20160
    data_dir: /data/tidb-data/tikv_data
    config:
      server.labels: { dc: "dc2", logic: "logic3", rack: "r1", host: "192_168_8_13" }
  - host: 192.168.8.13
    ssh_port: 22
    port: 20161
    status_port: 20181
    deploy_dir: /data/tidb-deploy/tikv-20161
    data_dir: /data/tidb-data/tikv_data-20161
    config:
      server.labels: { dc: "dc2", logic: "logic4", rack: "r1", host: "192_168_8_13" }
pd_servers:
  - host: 192.168.8.11
    ssh_port: 22
    name: pd-192.168.8.11-2379
    client_port: 2379
    peer_port: 2380
    deploy_dir: /data/tidb-deploy/pd-2379
    data_dir: /data/tidb-data/pd_data
  - host: 192.168.8.12
    ssh_port: 22
    name: pd-192.168.8.12-2379
    client_port: 2379
    peer_port: 2380
    deploy_dir: /data/tidb-deploy/pd-2379
    data_dir: /data/tidb-data/pd_data
  - host: 192.168.8.13
    ssh_port: 22
    name: pd-192.168.8.13-2379
    client_port: 2379
    peer_port: 2380
    deploy_dir: /data/tidb-deploy/pd-2379
    data_dir: /data/tidb-data/pd_data
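To rule out a labeling problem, the store labels actually registered in PD can be compared against the server.labels above. A quick check, assuming this pd-ctl build supports the documented --jq filter:

# Each TiKV store should report the dc/logic/rack/host labels configured in the topology
tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 store --jq=".stores[].store | {id, address, labels}"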
rule.json
[
  {
    "group_id": "pd",
    "group_index": 0,
    "group_override": false,
    "rules": [
      {
        "group_id": "pd",
        "id": "logic1",
        "start_key": "",
        "end_key": "",
        "role": "voter",
        "count": 1,
        "location_labels": ["dc", "logic", "rack", "host"],
        "label_constraints": [{"key": "logic", "op": "in", "values": ["logic1"]}]
      },
      {
        "group_id": "pd",
        "id": "logic2",
        "start_key": "",
        "end_key": "",
        "role": "voter",
        "count": 1,
        "location_labels": ["dc", "logic", "rack", "host"],
        "label_constraints": [{"key": "logic", "op": "in", "values": ["logic2"]}]
      },
      {
        "group_id": "pd",
        "id": "logic3",
        "start_key": "",
        "end_key": "",
        "role": "voter",
        "count": 1,
        "location_labels": ["dc", "logic", "rack", "host"],
        "label_constraints": [{"key": "logic", "op": "in", "values": ["logic3"]}]
      },
      {
        "group_id": "pd",
        "id": "logic4",
        "start_key": "",
        "end_key": "",
        "role": "learner",
        "count": 1,
        "location_labels": ["dc", "logic", "rack", "host"],
        "label_constraints": [{"key": "logic", "op": "in", "values": ["logic4"]}]
      }
    ]
  }
]
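For completeness, whether the rule bundle was actually loaded by PD can be verified from pd-ctl before the failover test; this is an extra sanity check, not part of the original reproduction:

# List all placement rules currently in effect
tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 config placement-rules show
# Or dump the whole rule bundle of group "pd"
tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 config placement-rules rule-bundle get pd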
【After the DR failover】Cluster status
[root@centos3 ~]# tiup cluster display dr-auto-sync
tiup is checking updates for component cluster ...timeout!
Starting component `cluster`: /root/.tiup/components/cluster/v1.10.2/tiup-cluster display dr-auto-sync
Cluster type: tidb
Cluster name: dr-auto-sync
Cluster version: v5.4.0
Deploy user: tidb
SSH type: builtin
Dashboard URL: http://192.168.8.13:2379/dashboard
ID Role Host Ports OS/Arch Status Data Dir Deploy Dir
-- ---- ---- ----- ------- ------ -------- ----------
192.168.8.13:2379 pd 192.168.8.13 2379/2380 linux/x86_64 Up|L|UI /data/tidb-data/pd_data /data/tidb-deploy/pd-2379
192.168.8.13:4000 tidb 192.168.8.13 4000/10080 linux/x86_64 Down - /data/tidb-deploy/tidb-4000
192.168.8.13:20160 tikv 192.168.8.13 20160/20180 linux/x86_64 Up /data/tidb-data/tikv_data /data/tidb-deploy/tikv-20160
192.168.8.13:20161 tikv 192.168.8.13 20161/20181 linux/x86_64 Up /data/tidb-data/tikv_data-20161 /data/tidb-deploy/tikv-20161
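At this point only the hosts in dc2 (192.168.8.13) are still up. Before calling unsafe remove-failed-stores, the store IDs of the dead TiKV nodes can be confirmed from PD; the IDs 1 and 6 passed below are assumed to correspond to the Down/Disconnected stores on 192.168.8.11 and 192.168.8.12:

# List all stores with their IDs and states; the failed ones are the stores on 192.168.8.11 / 192.168.8.12
tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 store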
【Reproduction Steps】
Commands executed:
tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 config placement-rules rule-bundle save --in=rules_dr.json
tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 config set replication-mode majority
tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 unsafe remove-failed-stores 1,6
tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 unsafe remove-failed-stores show
[
"No on-going operation."
]
# It looks like unsafe recovery was not triggered after PD restarted
tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 unsafe remove-failed-stores history
[
"No unasfe recover has been triggered since PD restarted."
]
【Symptom and Impact】
Problem: after running tiup ctl:v5.4.0 pd -u http://192.168.8.13:2379 unsafe remove-failed-stores 1,6,
no unsafe remove-failed-stores operation is triggered.
pd.log
[2022/07/20 13:36:17.224 +08:00] [WARN] [forwarder.go:106] ["Unable to resolve connection address since no alive TiDB instance"]
[2022/07/20 13:36:17.224 +08:00] [ERROR] [tidb_requests.go:64] ["fail to send schema request"] [component=TiDB] [error=error.tidb.no_alive_tidb]
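The two log lines above only show the Dashboard failing to reach a live TiDB instance and look unrelated to unsafe recovery. To check whether the unsafe recovery controller on the PD leader logged anything at all, its log can be searched; the path below assumes the default tiup layout under the deploy_dir configured in the topology:

grep -i "unsafe" /data/tidb-deploy/pd-2379/log/pd.log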