昨晚又找了一个 4.0 集群做升级,问题复现了。
19:40 开始升级集群,21 点左右升级完毕;期间 18.141 节点(123.59.18.141:20160,store 459653)evict leader 失败。
在 PD 上的操作如下:
[root@emarsys105016 logs]# date && pd-ctl -u http://pd:2479 scheduler add evict-leader-scheduler 459653
Tue Jul 6 20:06:09 CST 2021
Success!
[root@emarsys105016 logs]# date && pd-ctl -u http://pd:2479 store 459653
Tue Jul 6 20:08:34 CST 2021
{
  "store": {
    "id": 459653,
    "address": "123.59.18.141:20160",
    "labels": [
      {
        "key": "rack",
        "value": "5F-A8-01"
      }
    ],
    "version": "4.0.13",
    "status_address": "123.59.18.141:20180",
    "git_hash": "a448d617f79ddf545be73931525bb41af0f790f3",
    "start_timestamp": 1623481939,
    "deploy_path": "/",
    "last_heartbeat": 1625573310245948904,
    "state_name": "Up"
  },
  "status": {
    "capacity": "892.6GiB",
    "available": "623.3GiB",
    "used_size": "51.59GiB",
    "leader_count": 1374,
    "leader_weight": 4,
    "leader_score": 343.5,
    "leader_size": 92893,
    "region_count": 5163,
    "region_weight": 4,
    "region_score": 86283.75,
    "region_size": 345135,
    "start_ts": "2021-06-12T07:12:19Z",
    "last_heartbeat_ts": "2021-07-06T12:08:30.245948904Z",
    "uptime": "580h56m11.245948904s"
  }
}
[root@emarsys105016 logs]# date && pd-ctl -u http://pd:2479 store 459653
Tue Jul 6 20:13:01 CST 2021
{
  "store": {
    "id": 459653,
    "address": "123.59.18.141:20160",
    "labels": [
      {
        "key": "rack",
        "value": "5F-A8-01"
      }
    ],
    "version": "4.0.13",
    "status_address": "123.59.18.141:20180",
    "git_hash": "a448d617f79ddf545be73931525bb41af0f790f3",
    "start_timestamp": 1623481939,
    "deploy_path": "/",
    "last_heartbeat": 1625573580283732716,
    "state_name": "Up"
  },
  "status": {
    "capacity": "892.6GiB",
    "available": "625.1GiB",
    "used_size": "51.5GiB",
    "leader_count": 1344,
    "leader_weight": 4,
    "leader_score": 336,
    "leader_size": 90629,
    "region_count": 5162,
    "region_weight": 4,
    "region_score": 86080.25,
    "region_size": 344321,
    "start_ts": "2021-06-12T07:12:19Z",
    "last_heartbeat_ts": "2021-07-06T12:13:00.283732716Z",
    "uptime": "581h0m41.283732716s"
  }
}
[root@emarsys105016 logs]# date && pd-ctl -u http://pd:2479 store 459653
Tue Jul 6 20:15:02 CST 2021
{
  "store": {
    "id": 459653,
    "address": "123.59.18.141:20160",
    "labels": [
      {
        "key": "rack",
        "value": "5F-A8-01"
      }
    ],
    "version": "4.0.13",
    "status_address": "123.59.18.141:20180",
    "git_hash": "a448d617f79ddf545be73931525bb41af0f790f3",
    "start_timestamp": 1623481939,
    "deploy_path": "/",
    "last_heartbeat": 1625573700334779338,
    "state_name": "Up"
  },
  "status": {
    "capacity": "892.6GiB",
    "available": "625GiB",
    "used_size": "51.61GiB",
    "leader_count": 1345,
    "leader_weight": 4,
    "leader_score": 336.25,
    "leader_size": 90625,
    "region_count": 5162,
    "region_weight": 4,
    "region_score": 86004.5,
    "region_size": 344018,
    "start_ts": "2021-06-12T07:12:19Z",
    "last_heartbeat_ts": "2021-07-06T12:15:00.334779338Z",
    "uptime": "581h2m41.334779338s"
  }
}
等了约 10 分钟,该 store 的 leader 数基本没有变化(1374 → 1344 → 1345),就先去处理别的 TiKV 了。
异常 TiKV 的日志如下:
tikvlog.gz (2.1 MB)