TiDB 4.0.0: TiKV leader count suddenly dropped to 0, and no leaders are scheduled to it afterward

Version: TiDB 4.0.0

We did not perform any scale-in operation, but the leader count on one TiKV suddenly dropped to 0, and no leaders have been scheduled to this TiKV server since. What could be going on?
This is a production environment.

The TiKV store state is Up:

{
  "store": {
    "id": 223229,
    "address": "192.168.192.27:20160",
    "version": "4.0.0",
    "status_address": "192.168.192.27:20180",
    "git_hash": "198a2cea01734ce8f46d55a29708f123f9133944",
    "start_timestamp": 1596157805,
    "last_heartbeat": 1596437562910927155,
    "state_name": "Up"
  },
  "status": {
    "capacity": "388.2GiB",
    "available": "362.3GiB",
    "used_size": "18.05GiB",
    "leader_count": 0,
    "leader_weight": 1,
    "leader_score": 0,
    "leader_size": 0,
    "region_count": 3644,
    "region_weight": 1,
    "region_score": 95623,
    "region_size": 95623,
    "start_ts": "2020-07-31T09:10:05+08:00",
    "last_heartbeat_ts": "2020-08-03T14:52:42.910927155+08:00",
    "uptime": "77h42m37.910927155s"
  }
}

pd-ctl config show all

Starting component `ctl`:  pd -u http://192.168.192.32:2379 config show all
{
  "client-urls": "http://0.0.0.0:2379",
  "peer-urls": "http://192.168.192.32:2380",
  "advertise-client-urls": "http://192.168.192.32:2379",
  "advertise-peer-urls": "http://192.168.192.32:2380",
  "name": "pd_huirui-32",
  "data-dir": "/home/tidb/deploy/data.pd",
  "force-new-cluster": false,
  "enable-grpc-gateway": true,
  "initial-cluster": "pd_huirui-31=http://192.168.192.31:2380,pd_huirui-32=http://192.168.192.32:2380,pd_huirui-33=http://192.168.192.33:2380",
  "initial-cluster-state": "new",
  "join": "",
  "lease": 3,
  "log": {
    "level": "info",
    "format": "text",
    "disable-timestamp": false,
    "file": {
      "filename": "/home/tidb/deploy/log/pd.log",
      "max-size": 300,
      "max-days": 0,
      "max-backups": 0
    },
    "development": false,
    "disable-caller": false,
    "disable-stacktrace": false,
    "disable-error-verbose": true,
    "sampling": null
  },
  "tso-save-interval": "3s",
  "metric": {
    "job": "pd_huirui-32",
    "address": "",
    "interval": "15s"
  },
  "schedule": {
    "max-snapshot-count": 3,
    "max-pending-peer-count": 16,
    "max-merge-region-size": 20,
    "max-merge-region-keys": 200000,
    "split-merge-interval": "1h0m0s",
    "enable-one-way-merge": "false",
    "enable-cross-table-merge": "false",
    "patrol-region-interval": "100ms",
    "max-store-down-time": "30m0s",
    "leader-schedule-limit": 4,
    "leader-schedule-policy": "count",
    "region-schedule-limit": 4,
    "replica-schedule-limit": 8,
    "merge-schedule-limit": 8,
    "hot-region-schedule-limit": 4,
    "hot-region-cache-hits-threshold": 3,
    "store-balance-rate": 15,
    "tolerant-size-ratio": 5,
    "low-space-ratio": 0.8,
    "high-space-ratio": 0.6,
    "scheduler-max-waiting-operator": 3,
    "enable-remove-down-replica": "true",
    "enable-replace-offline-replica": "true",
    "enable-make-up-replica": "true",
    "enable-remove-extra-replica": "true",
    "enable-location-replacement": "true",
    "enable-debug-metrics": "false",
    "schedulers-v2": [
      {
        "type": "balance-region",
        "args": null,
        "disable": false,
        "args-payload": ""
      },
      {
        "type": "balance-leader",
        "args": null,
        "disable": false,
        "args-payload": ""
      },
      {
        "type": "hot-region",
        "args": null,
        "disable": false,
        "args-payload": ""
      },
      {
        "type": "label",
        "args": null,
        "disable": false,
        "args-payload": ""
      },
      {
        "type": "evict-leader",
        "args": [
          "223229"
        ],
        "disable": false,
        "args-payload": ""
      }
    ],
    "schedulers-payload": {
      "balance-hot-region-scheduler": "null",
      "balance-leader-scheduler": "{\"name\":\"balance-leader-scheduler\",\"ranges\":[{\"start-key\":\"\",\"end-key\":\"\"}]}",
      "balance-region-scheduler": "{\"name\":\"balance-region-scheduler\",\"ranges\":[{\"start-key\":\"\",\"end-key\":\"\"}]}",
      "evict-leader-scheduler": "{\"store-id-ranges\":{\"223229\":[{\"start-key\":\"\",\"end-key\":\"\"}]}}",
      "label-scheduler": "{\"name\":\"label-scheduler\",\"ranges\":[{\"start-key\":\"\",\"end-key\":\"\"}]}"
    },
    "store-limit-mode": "manual"
  },
  "replication": {
    "max-replicas": 3,
    "location-labels": "",
    "strictly-match-label": "false",
    "enable-placement-rules": "false"
  },
  "pd-server": {
    "use-region-storage": "true",
    "max-gap-reset-ts": "24h0m0s",
    "key-type": "table",
    "runtime-services": "",
    "metric-storage": "http://192.168.192.33:9090",
    "dashboard-address": "http://192.168.192.33:2379"
  },
  "cluster-version": "4.0.0",
  "quota-backend-bytes": "8GiB",
  "auto-compaction-mode": "periodic",
  "auto-compaction-retention-v2": "1h",
  "TickInterval": "500ms",
  "ElectionInterval": "3s",
  "PreVote": true,
  "security": {
    "cacert-path": "",
    "cert-path": "",
    "key-path": "",
    "cert-allowed-cn": null
  },
  "label-property": {},
  "WarningMsgs": [
    "Config contains undefined item: namespace-classifier"
  ],
  "DisableStrictReconfigCheck": false,
  "HeartbeatStreamBindInterval": "1m0s",
  "LeaderPriorityCheckInterval": "1m0s",
  "dashboard": {
    "tidb_cacert_path": "",
    "tidb_cert_path": "",
    "tidb_key_path": "",
    "public_path_prefix": "/dashboard"
  },
  "replication-mode": {
    "replication-mode": "majority",
    "dr-auto-sync": {
      "label-key": "",
      "primary": "",
      "dr": "",
      "primary-replicas": 0,
      "dr-replicas": 0,
      "wait-store-timeout": "1m0s",
      "wait-sync-timeout": "1m0s"
    }
  }
}

What restart command did you use? If you perform a rolling_update with tidb-ansible, it adds this scheduler on the TiKV node and removes it again after the restart succeeds. Did you run into any errors during the restart?

For now, you can run pd-ctl scheduler remove evict-leader-scheduler 223229 to resolve this.
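
For reference, a quick way to confirm whether the evict-leader scheduler is still registered (a sketch, using the same PD endpoint as elsewhere in this thread):

# List the schedulers currently registered in PD. An evict-leader entry
# here means PD is actively keeping leaders off the listed store.
tiup ctl pd -u http://192.168.192.32:2379 scheduler show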

Thanks!
Context: this is a TiDB 4.0.0 cluster deployed with TiUP.
1. TiKV was not restarted.


2. Is it enough to just run the following command?

 tiup ctl pd -u http://192.168.192.32:2379 scheduler remove evict-leader-scheduler 223229

3. I just scaled out another TiKV node; it also has no leaders assigned and cannot be seen in the monitoring. Does it also need pd-ctl scheduler remove evict-leader-scheduler?

Starting component `ctl`:  pd -u http://192.168.192.32:2379 store 763660
{
  "store": {
    "id": 763660,
    "address": "192.168.192.38:20160",
    "version": "4.0.0",
    "status_address": "192.168.192.38:20180",
    "git_hash": "198a2cea01734ce8f46d55a29708f123f9133944",
    "start_timestamp": 1596438961,
    "deploy_path": "/home/tidb/deploy/tikv-20160/bin",
    "last_heartbeat": 1596442591610212055,
    "state_name": "Up"
  },
  "status": {
    "capacity": "388GiB",
    "available": "385.6GiB",
    "used_size": "31.5MiB",
    "leader_count": 0,
    "leader_weight": 1,
    "leader_score": 0,
    "leader_size": 0,
    "region_count": 17,
    "region_weight": 1,
    "region_score": 839,
    "region_size": 839,
    "start_ts": "2020-08-03T15:16:01+08:00",
    "last_heartbeat_ts": "2020-08-03T16:16:31.610212055+08:00",
    "uptime": "1h0m30.610212055s"
  }
}

The judgment is mainly based on this information.
A newly scaled-out TiKV will not have this scheduler added. You can use store to check whether its region count and leader count are growing, and observe for a while; leader and region scheduling is driven by cluster load and the corresponding scores.
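
For example, a simple way to watch those counters over time (a sketch; it assumes the same tiup ctl invocation used elsewhere in this thread and that grep is available on the control machine):

# Print the leader/region counters of the new store once a minute.
while true; do
  tiup ctl pd -u http://192.168.192.32:2379 store 763660 \
    | grep -E '"(leader_count|region_count)"'
  sleep 60
done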

:innocent: 1. I ran this command, but leaders are still not being scheduled:

tiup ctl pd -u http://192.168.192.32:2379 scheduler remove evict-leader-scheduler 223229
Starting component `ctl`:  pd -u http://192.168.192.32:2379 scheduler remove evict-leader-scheduler 223229
Usage:
  pd-ctl scheduler remove <scheduler> [flags]

Global Flags:
      --cacert string   path of file that contains list of trusted SSL CAs
      --cert string     path of file that contains X509 certificate in PEM format
  -h, --help            help message
      --key string      path of file that contains X509 key in PEM format
  -u, --pd string       address of pd (default "http://127.0.0.1:2379")
<nil>

2. The regions on the newly added TiKV store are increasing, but very slowly; the leader count is still 0, and Prometheus monitoring still has no data for it.

Please post the config show all output again.

Did the output contain a "Success" message?

The above is the result of running tiup ctl pd -u http://192.168.192.32:2379 scheduler remove evict-leader-scheduler 223229.
2. tiup ctl pd -u http://192.168.192.32:2379 config show all

Starting component `ctl`:  pd -u http://192.168.192.32:2379 config show all
(The returned output was identical to the config show all dump earlier in this thread; the evict-leader entry for store 223229 was still present under schedulers-v2 and schedulers-payload.)

tiup ctl pd -u http://192.168.192.32:2379 scheduler remove evict-leader-scheduler 223229

Run this and post a screenshot of the result.

For the scaled-out TiKV node, you can keep observing for a while, or tune the scheduling limits to speed things up; you can search asktug for details.
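
For instance, the relevant knobs can be raised with pd-ctl (illustrative values only, not tuned recommendations; the defaults shown in the config above are 4/4):

# Allow more concurrent leader/region balance operators.
tiup ctl pd -u http://192.168.192.32:2379 config set leader-schedule-limit 8
tiup ctl pd -u http://192.168.192.32:2379 config set region-schedule-limit 8
# Raise the per-store scheduling rate for the new store (store-limit-mode
# is "manual" in this cluster, so this takes effect immediately).
tiup ctl pd -u http://192.168.192.32:2379 store limit 763660 30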

Please upgrade TiUP first:

tiup update --self && tiup update cluster

The evict-leader-scheduler name and the store ID should not be separated :joy:

[tidb@back-paas ~]$ tiup ctl pd   -u http://192.168.192.32:2379 scheduler remove evict-leader-scheduler-223229
Starting component `ctl`:  pd -u http://192.168.192.32:2379 scheduler remove evict-leader-scheduler-223229
Success!

The leaders are back now. Could you explain what remove evict-leader-scheduler-223229 actually does? I don't quite understand it.

Uh, sorry. Next time you can check the exact scheduler names with scheduler show... Removing evict-leader-scheduler-223229 deletes the evict-leader scheduler that was pinned to store 223229, so PD stops evicting leaders from that store and the balance-leader scheduler can transfer leaders back onto it.
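
For example (a sketch against the PD endpoint used in this thread):

# Print scheduler names exactly as PD registers them, so the name to pass
# to "scheduler remove" can be copied verbatim.
tiup ctl pd -u http://192.168.192.32:2379 scheduler show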

:upside_down_face: Thanks for your help. The scaled-out TiKV at 192.168.192.38 has been up for more than two hours now, but I see that "region_size" has barely changed, and Prometheus still shows no monitoring data for it at all. What should I do in this situation?

[tidb@back-paas ~]$ tiup ctl pd   -u http://192.168.192.32:2379 store 763660
Starting component `ctl`:  pd -u http://192.168.192.32:2379 store 763660
{
  "store": {
    "id": 763660,
    "address": "192.168.192.38:20160",
    "version": "4.0.0",
    "status_address": "192.168.192.38:20180",
    "git_hash": "198a2cea01734ce8f46d55a29708f123f9133944",
    "start_timestamp": 1596438961,
    "deploy_path": "/home/tidb/deploy/tikv-20160/bin",
    "last_heartbeat": 1596448132045454847,
    "state_name": "Up"
  },
  "status": {
    "capacity": "388GiB",
    "available": "385.6GiB",
    "used_size": "31.5MiB",
    "leader_count": 0,
    "leader_weight": 1,
    "leader_score": 0,
    "leader_size": 0,
    "region_count": 18,
    "region_weight": 1,
    "region_score": 871,
    "region_size": 871,
    "start_ts": "2020-08-03T15:16:01+08:00",
    "last_heartbeat_ts": "2020-08-03T17:48:52.045454847+08:00",
    "uptime": "2h32m51.045454847s"
  }
}

Was the earlier problem on the .27 node, and is this one another newly scaled-out node? You can check it following the same troubleshooting approach above. If there is still a problem, please post the store and config show all output. Thanks!

I have opened a new TUG topic with the latest information; please take a look at that thread.

OK.