TIKV扩容后数据没有均衡

按照官网进行tikv扩容,执行成功,但是发现数据没有均衡。具体信息如下:

store信息如下:
{
“count”: 7,
“stores”: [
{
“store”: {
“id”: 7,
“address”: “10.92.138.251:20160”,
“version”: “4.0.5”,
“status_address”: “10.92.138.251:20180”,
“git_hash”: “f39927a3529d40a6bb4e6c54854a94fdac996e92”,
“start_timestamp”: 1600331051,
“deploy_path”: “/tidb/tikv-20160/bin”,
“last_heartbeat”: 1600744859101722547,
“state_name”: “Up”
},
“status”: {
“capacity”: “40.23GiB”,
“available”: “10.49GiB”,
“used_size”: “26.57GiB”,
“leader_count”: 0,
“leader_weight”: 1,
“leader_score”: 0,
“leader_size”: 0,
“region_count”: 0,
“region_weight”: 1,
“region_score”: -6670707882.846615,
“region_size”: 0,
“start_ts”: “2020-09-17T16:24:11+08:00”,
“last_heartbeat_ts”: “2020-09-22T11:20:59.101722547+08:00”,
“uptime”: “114h56m48.101722547s”
}
},
{
“store”: {
“id”: 48,
“address”: “10.92.138.254:3930”,
“labels”: [
{
“key”: “engine”,
“value”: “tiflash”
}
],
“version”: “v4.0.5”,
“peer_address”: “10.92.138.254:20170”,
“status_address”: “10.92.138.254:20292”,
“git_hash”: “47883266d561b299eb900cd04e32a816fc621a69”,
“start_timestamp”: 1600331073,
“deploy_path”: “/tidb/tiflash-9000/bin/tiflash”,
“last_heartbeat”: 1600744860112974366,
“state_name”: “Up”
},
“status”: {
“capacity”: “20.54GiB”,
“available”: “17.38GiB”,
“used_size”: “29KiB”,
“leader_count”: 0,
“leader_weight”: 1,
“leader_score”: 0,
“leader_size”: 0,
“region_count”: 0,
“region_weight”: 1,
“region_score”: 0,
“region_size”: 0,
“start_ts”: “2020-09-17T16:24:33+08:00”,
“last_heartbeat_ts”: “2020-09-22T11:21:00.112974366+08:00”,
“uptime”: “114h56m27.112974366s”
}
},
{
“store”: {
“id”: 4033,
“address”: “10.92.139.0:20160”,
“version”: “4.0.5”,
“status_address”: “10.92.139.0:20180”,
“git_hash”: “f39927a3529d40a6bb4e6c54854a94fdac996e92”,
“start_timestamp”: 1600682930,
“deploy_path”: “/tidb/tikv-20160/bin”,
“last_heartbeat”: 1600744857869695991,
“state_name”: “Up”
},
“status”: {
“capacity”: “56.96GiB”,
“available”: “54.77GiB”,
“used_size”: “31.5MiB”,
“leader_count”: 0,
“leader_weight”: 1,
“leader_score”: 0,
“leader_size”: 0,
“region_count”: 0,
“region_weight”: 1,
“region_score”: 0,
“region_size”: 0,
“start_ts”: “2020-09-21T18:08:50+08:00”,
“last_heartbeat_ts”: “2020-09-22T11:20:57.869695991+08:00”,
“uptime”: “17h12m7.869695991s”
}
},
{
“store”: {
“id”: 4034,
“address”: “10.92.139.1:20160”,
“version”: “4.0.5”,
“status_address”: “10.92.139.1:20180”,
“git_hash”: “f39927a3529d40a6bb4e6c54854a94fdac996e92”,
“start_timestamp”: 1600682930,
“deploy_path”: “/tidb/tikv-20160/bin”,
“last_heartbeat”: 1600744858539908052,
“state_name”: “Up”
},
“status”: {
“capacity”: “56.96GiB”,
“available”: “54.77GiB”,
“used_size”: “31.5MiB”,
“leader_count”: 0,
“leader_weight”: 1,
“leader_score”: 0,
“leader_size”: 0,
“region_count”: 0,
“region_weight”: 1,
“region_score”: 0,
“region_size”: 0,
“start_ts”: “2020-09-21T18:08:50+08:00”,
“last_heartbeat_ts”: “2020-09-22T11:20:58.539908052+08:00”,
“uptime”: “17h12m8.539908052s”
}
},
{
“store”: {
“id”: 4035,
“address”: “10.92.137.114:20160”,
“version”: “4.0.5”,
“status_address”: “10.92.137.114:20180”,
“git_hash”: “f39927a3529d40a6bb4e6c54854a94fdac996e92”,
“start_timestamp”: 1600682930,
“deploy_path”: “/tidb/tikv-20160/bin”,
“last_heartbeat”: 1600744858538797739,
“state_name”: “Up”
},
“status”: {
“capacity”: “56.96GiB”,
“available”: “54.77GiB”,
“used_size”: “31.5MiB”,
“leader_count”: 0,
“leader_weight”: 1,
“leader_score”: 0,
“leader_size”: 0,
“region_count”: 0,
“region_weight”: 1,
“region_score”: 0,
“region_size”: 0,
“start_ts”: “2020-09-21T18:08:50+08:00”,
“last_heartbeat_ts”: “2020-09-22T11:20:58.538797739+08:00”,
“uptime”: “17h12m8.538797739s”
}
},
{
“store”: {
“id”: 1,
“address”: “10.92.138.252:20161”,
“version”: “4.0.5”,
“status_address”: “10.92.138.252:20181”,
“git_hash”: “f39927a3529d40a6bb4e6c54854a94fdac996e92”,
“start_timestamp”: 1600331066,
“deploy_path”: “/tidb/tikv-20161/bin”,
“last_heartbeat”: 1600744862895177749,
“state_name”: “Up”
},
“status”: {
“capacity”: “40.23GiB”,
“available”: “10.65GiB”,
“used_size”: “26.61GiB”,
“leader_count”: 0,
“leader_weight”: 1,
“leader_score”: 0,
“leader_size”: 0,
“region_count”: 0,
“region_weight”: 1,
“region_score”: -6724683021.379413,
“region_size”: 0,
“start_ts”: “2020-09-17T16:24:26+08:00”,
“last_heartbeat_ts”: “2020-09-22T11:21:02.895177749+08:00”,
“uptime”: “114h56m36.895177749s”
}
},
{
“store”: {
“id”: 2,
“address”: “10.92.138.253:20162”,
“version”: “4.0.5”,
“status_address”: “10.92.138.253:20182”,
“git_hash”: “f39927a3529d40a6bb4e6c54854a94fdac996e92”,
“start_timestamp”: 1600331053,
“deploy_path”: “/tidb/tikv-20162/bin”,
“last_heartbeat”: 1600744862550404243,
“state_name”: “Up”
},
“status”: {
“capacity”: “40.23GiB”,
“available”: “11.16GiB”,
“used_size”: “26.5GiB”,
“leader_count”: 0,
“leader_weight”: 1,
“leader_score”: 0,
“leader_size”: 0,
“region_count”: 0,
“region_weight”: 1,
“region_score”: -6830957559.181029,
“region_size”: 0,
“start_ts”: “2020-09-17T16:24:13+08:00”,
“last_heartbeat_ts”: “2020-09-22T11:21:02.550404243+08:00”,
“uptime”: “114h56m49.550404243s”
}
}
]
}

config 信息如下:
{
“replication”: {
“enable-placement-rules”: “true”,
“location-labels”: “”,
“max-replicas”: 3,
“strictly-match-label”: “false”
},
“schedule”: {
“enable-cross-table-merge”: “false”,
“enable-debug-metrics”: “false”,
“enable-location-replacement”: “true”,
“enable-make-up-replica”: “true”,
“enable-one-way-merge”: “false”,
“enable-remove-down-replica”: “true”,
“enable-remove-extra-replica”: “true”,
“enable-replace-offline-replica”: “true”,
“high-space-ratio”: 0.7,
“hot-region-cache-hits-threshold”: 3,
“hot-region-schedule-limit”: 4,
“leader-schedule-limit”: 4,
“leader-schedule-policy”: “count”,
“low-space-ratio”: 0.8,
“max-merge-region-keys”: 200000,
“max-merge-region-size”: 20,
“max-pending-peer-count”: 16,
“max-snapshot-count”: 3,
“max-store-down-time”: “30m0s”,
“merge-schedule-limit”: 8,
“patrol-region-interval”: “100ms”,
“region-schedule-limit”: 2048,
“replica-schedule-limit”: 64,
“scheduler-max-waiting-operator”: 5,
“split-merge-interval”: “1h0m0s”,
“store-limit-mode”: “manual”,
“tolerant-size-ratio”: 0
}
}

» scheduler show
[
“balance-hot-region-scheduler”,
“balance-leader-scheduler”,
“balance-region-scheduler”,
“label-scheduler”
]

通过这个 pd-ctl 执行 store 命令的结果看,所有的节点上的 region count 和 leader count 都为 0

这个麻烦导出一下 Overview 面板以及 PD 面板的监控看下
导出监控步骤:

  1. 打开 Overview 面板,监控时间选择最近 3 小时
  2. 打开 Grafana 监控面板(先按 d 再按 E 可将所有 Rows 的 Panels 打开,需等待一段时间待页面加载完成)
  3. https://metricstool.pingcap.com/ 使用工具导出 Grafana 数据为快照

具体可以参考文档:[FAQ] Grafana Metrics 页面的导出和导入

信息如附件,请查看。
tidb-test-Overview_2020-09-22T05 50 39.817Z.json (2.8 MB) tidb-test-PD_2020-09-22T05 52 27.597Z.json (2.7 MB)

PD 集群可以理解为 ETCD 集群,只有一个 PD 节点的话无法正常选出 PD leader 节点,会导致 PD 无法正常工作
建议扩容成 3 个 PD 节点,最少也需要 2 个 PD 节点。

hello,我已经把pd扩容为3个节点了,然后怎么做?

Starting component cluster: /home/tidb/.tiup/components/cluster/v1.1.2/tiup-cluster display tidb-test
tidb Cluster: tidb-test
tidb Version: v4.0.5
ID Role Host Ports OS/Arch Status Data Dir Deploy Dir


10.92.138.250:3000 grafana 10.92.138.250 3000 linux/x86_64 Up - /tidb/grafana-3000
10.92.138.249:2379 pd 10.92.138.249 2379/2380 linux/x86_64 Up /data1/pd-2379 /tidb/pd-2379
10.92.138.250:2379 pd 10.92.138.250 2379/2380 linux/x86_64 Up|L|UI /data1/pd-2379 /tidb/pd-2379
10.92.138.255:2379 pd 10.92.138.255 2379/2380 linux/x86_64 Up /data1/pd-2379 /tidb/pd-2379
10.92.138.250:9090 prometheus 10.92.138.250 9090 linux/x86_64 Up /data1/prometheus-9090 /tidb/prometheus-9090
10.92.138.249:4000 tidb 10.92.138.249 4000/10080 linux/x86_64 Up - /tidb/tidb-4000
10.92.138.254:9000 tiflash 10.92.138.254 9000/8123/3930/20170/20292/8234 linux/x86_64 Up /data1/tiflash-9000 /tidb/tiflash-9000
10.92.137.114:20160 tikv 10.92.137.114 20160/20180 linux/x86_64 Up /data1/tikv-20160 /tidb/tikv-20160
10.92.138.251:20160 tikv 10.92.138.251 20160/20180 linux/x86_64 Up /data1/tikv-20160 /tidb/tikv-20160
10.92.138.252:20161 tikv 10.92.138.252 20161/20181 linux/x86_64 Up /data1/tikv-20161 /tidb/tikv-20161
10.92.138.253:20162 tikv 10.92.138.253 20162/20182 linux/x86_64 Up /data1/tikv-20162 /tidb/tikv-20162
10.92.139.0:20160 tikv 10.92.139.0 20160/20180 linux/x86_64 Up /data1/tikv-20160 /tidb/tikv-20160
10.92.139.1:20160 tikv 10.92.139.1 20160/20180 linux/x86_64 Up /data1/tikv-20160 /tidb/tikv-20160

完成pd扩容后,store的信息也没有变化,具体如下:
“status”: {
“capacity”: “56.96GiB”,
“available”: “54.76GiB”,
“used_size”: “31.5MiB”,
“leader_count”: 0,
“leader_weight”: 1,
“leader_score”: 0,
“leader_size”: 0,
“region_count”: 0,
“region_weight”: 1,
“region_score”: 0,
“region_size”: 0,
“start_ts”: “2020-09-21T18:08:50+08:00”,
“last_heartbeat_ts”: “2020-09-23T09:17:47.600463979+08:00”,
“uptime”: “39h8m57.600463979s”

问题一开始是扩容后不均衡吗? 麻烦反馈pd-ctl 中 config show all 的完整信息,多谢

扩容前,那时还不会用pd store的命令没有注意到,但是通过df -h看三个tikv的容量都是比较接近的。 下面是config show all 的内容,请查看:
» config show all
{
“client-urls”: “http://0.0.0.0:2379”,
“peer-urls”: “http://10.92.138.250:2380”,
“advertise-client-urls”: “http://10.92.138.250:2379”,
“advertise-peer-urls”: “http://10.92.138.250:2380”,
“name”: “pd-10.92.138.250-2379”,
“data-dir”: “/data1/pd-2379”,
“force-new-cluster”: false,
“enable-grpc-gateway”: true,
“initial-cluster”: “pd-10.92.138.250-2379=http://10.92.138.250:2380”,
“initial-cluster-state”: “new”,
“join”: “”,
“lease”: 3,
“log”: {
“level”: “”,
“format”: “text”,
“disable-timestamp”: false,
“file”: {
“filename”: “/tidb/pd-2379/log/pd.log”,
“max-size”: 300,
“max-days”: 0,
“max-backups”: 0
},
“development”: false,
“disable-caller”: false,
“disable-stacktrace”: false,
“disable-error-verbose”: true,
“sampling”: null
},
“tso-save-interval”: “3s”,
“metric”: {
“job”: “pd-10.92.138.250-2379”,
“address”: “”,
“interval”: “15s”
},
“schedule”: {
“max-snapshot-count”: 3,
“max-pending-peer-count”: 16,
“max-merge-region-size”: 20,
“max-merge-region-keys”: 200000,
“split-merge-interval”: “1h0m0s”,
“enable-one-way-merge”: “false”,
“enable-cross-table-merge”: “false”,
“patrol-region-interval”: “100ms”,
“max-store-down-time”: “30m0s”,
“leader-schedule-limit”: 4,
“leader-schedule-policy”: “count”,
“region-schedule-limit”: 2048,
“replica-schedule-limit”: 64,
“merge-schedule-limit”: 8,
“hot-region-schedule-limit”: 4,
“hot-region-cache-hits-threshold”: 3,
“store-limit”: {
“1”: {
“add-peer”: 15,
“remove-peer”: 15
},
“2”: {
“add-peer”: 15,
“remove-peer”: 15
},
“7”: {
“add-peer”: 15,
“remove-peer”: 15
}
},
“tolerant-size-ratio”: 0,
“low-space-ratio”: 0.8,
“high-space-ratio”: 0.7,
“scheduler-max-waiting-operator”: 5,
“enable-remove-down-replica”: “true”,
“enable-replace-offline-replica”: “true”,
“enable-make-up-replica”: “true”,
“enable-remove-extra-replica”: “true”,
“enable-location-replacement”: “true”,
“enable-debug-metrics”: “false”,
“schedulers-v2”: [
{
“type”: “balance-region”,
“args”: null,
“disable”: false,
“args-payload”: “”
},
{
“type”: “balance-leader”,
“args”: null,
“disable”: false,
“args-payload”: “”
},
{
“type”: “hot-region”,
“args”: null,
“disable”: false,
“args-payload”: “”
},
{
“type”: “label”,
“args”: null,
“disable”: false,
“args-payload”: “”
}
],
“schedulers-payload”: {
“balance-hot-region-scheduler”: null,
“balance-leader-scheduler”: {
“name”: “balance-leader-scheduler”,
“ranges”: [
{
“end-key”: “”,
“start-key”: “”
}
]
},
“balance-region-scheduler”: {
“name”: “balance-region-scheduler”,
“ranges”: [
{
“end-key”: “”,
“start-key”: “”
}
]
},
“label-scheduler”: {
“name”: “label-scheduler”,
“ranges”: [
{
“end-key”: “”,
“start-key”: “”
}
]
}
},
“store-limit-mode”: “manual”
},
“replication”: {
“max-replicas”: 3,
“location-labels”: “”,
“strictly-match-label”: “false”,
“enable-placement-rules”: “true”
},
“pd-server”: {
“use-region-storage”: “true”,
“max-gap-reset-ts”: “24h0m0s”,
“key-type”: “table”,
“runtime-services”: “”,
“metric-storage”: “http://10.92.138.250:9090”,
“dashboard-address”: “http://10.92.138.250:2379”,
“trace-region-flow”: “false”
},
“cluster-version”: “4.0.5”,
“quota-backend-bytes”: “8GiB”,
“auto-compaction-mode”: “periodic”,
“auto-compaction-retention-v2”: “1h”,
“TickInterval”: “500ms”,
“ElectionInterval”: “3s”,
“PreVote”: true,
“security”: {
“cacert-path”: “”,
“cert-path”: “”,
“key-path”: “”,
“cert-allowed-cn”: null
},
“label-property”: {},
“WarningMsgs”: null,
“DisableStrictReconfigCheck”: false,
“HeartbeatStreamBindInterval”: “1m0s”,
“LeaderPriorityCheckInterval”: “1m0s”,
“dashboard”: {
“tidb-cacert-path”: “”,
“tidb-cert-path”: “”,
“tidb-key-path”: “”,
“public-path-prefix”: “”,
“internal-proxy”: false,
“enable-telemetry”: true
},
“replication-mode”: {
“replication-mode”: “majority”,
“dr-auto-sync”: {
“label-key”: “”,
“primary”: “”,
“dr”: “”,
“primary-replicas”: 0,
“dr-replicas”: 0,
“wait-store-timeout”: “1m0s”,
“wait-sync-timeout”: “1m0s”
}
}
}

»

所有store 的 region count 都是0,麻烦发一下
pdctl>> region
的信息

region信息请看附件。
region.log (215.4 KB)

上面有很多region,与之前提到 store 的信息有些不符合,建议:

  • 升级 TiDB 到 4.0.6
  • 测试的磁盘空间要进行清理,有些盘可以看到容量是 40GB, 被不知道什么东西用了 26GB. 比如:
    “capacity”: “40.23GiB”,
    “available”: “11.16GiB”,
    “used_size”: “26.5GiB”,
    “leader_count”: 0,
    “leader_weight”: 1,
    “leader_score”: 0,
    “leader_size”: 0,
    “region_count”: 0,

如果不能清理,可以按磁盘剩余空间限制 tikv 能使用的大小,比如修改 tikv 参数:
[raftstore]
capacity = "10GB"

1 个赞