PD集群启动失败

【 TiDB 使用环境】测试/ Poc
【 TiDB 版本】6.5.8
【复现路径】K8S 1.23 使用TiOperator部署
【遇到的问题】PD经常启动失败
【资源配置】
【附件:截图/日志/监控】
cluster-2:~ # kubectl logs -f -n namespace basic-pd-0
Server: 10.96.0.10
Address 1: 10.96.0.10 kube-dns.kube-system.svc.cluster.local

Name: basic-pd-0.basic-pd-peer.namespace.svc
Address 1: 100.66.32.25 basic-pd-0.basic-pd-peer.namespace.svc.cluster.local
nslookup domain basic-pd-0.basic-pd-peer.namespace.svc.svc success
starting pd-server …
/pd-server --data-dir=/var/lib/pd --name=basic-pd-0 --peer-urls=http://0.0.0.0:2380 --advertise-peer-urls=http://basic-pd-0.basic-pd-peer.namespace.svc:2380 --client-urls=http://0.0.0.0:2379 --advertise-client-urls=http://basic-pd-0.basic-pd-peer.namespace.svc:2379 --config=/etc/pd/pd.toml --join=http://basic-pd-2.basic-pd-peer.namespace.svc:2380,http://basic-pd-1.basic-pd-peer.namespace.svc:2380,http://basic-pd-0.basic-pd-peer.namespace.svc:2380/
[2024/09/03 10:14:42.281 +00:00] [INFO] [util.go:43] [“Welcome to Placement Driver (PD)”]
[2024/09/03 10:14:42.281 +00:00] [INFO] [util.go:44] [PD] [release-version=v6.5.8]
[2024/09/03 10:14:42.281 +00:00] [INFO] [util.go:45] [PD] [edition=Community]
[2024/09/03 10:14:42.281 +00:00] [INFO] [util.go:46] [PD] [git-hash=4506d63ba4fba7123ecc8277da7ef5f635efee90]
[2024/09/03 10:14:42.281 +00:00] [INFO] [util.go:47] [PD] [git-branch=heads/refs/tags/v6.5.8]
[2024/09/03 10:14:42.281 +00:00] [INFO] [util.go:48] [PD] [utc-build-time=“2024-01-25 10:03:20”]
[2024/09/03 10:14:42.281 +00:00] [INFO] [metricutil.go:83] [“disable Prometheus push client”]
[2024/09/03 10:14:42.281 +00:00] [INFO] [server.go:253] [“PD Config”] [config=“{"client-urls":"http://0.0.0.0:2379","peer-urls":"http://0.0.0.0:2380","advertise-client-urls":"http://basic-pd-0.basic-pd-peer.namespace.svc:2379","advertise-peer-urls":"http://basic-pd-0.basic-pd-peer.namespace.svc:2380","name":"basic-pd-0","data-dir":"/var/lib/pd","force-new-cluster":false,"enable-grpc-gateway":true,"initial-cluster":"basic-pd-2=http://basic-pd-2.basic-pd-peer.namespace.svc:2380,basic-pd-1=http://basic-pd-1.basic-pd-peer.namespace.svc:2380,basic-pd-0=http://basic-pd-0.basic-pd-peer.namespace.svc:2380/\“,\“initial-cluster-state\”:\“existing\”,\“initial-cluster-token\”:\“pd-cluster\”,\“join\”:\“http://basic-pd-2.basic-pd-peer.namespace.svc:2380,http://basic-pd-1.basic-pd-peer.namespace.svc:2380,http://basic-pd-0.basic-pd-peer.namespace.svc:2380/\”,\“lease\”:3,\“log\”:{\“level\”:\“info\”,\“format\”:\“text\”,\“disable-timestamp\”:false,\“file\”:{\“filename\”:\”\“,\“max-size\”:0,\“max-days\”:0,\“max-backups\”:0},\“development\”:false,\“disable-caller\”:false,\“disable-stacktrace\”:false,\“disable-error-verbose\”:true,\“sampling\”:null,\“error-output-path\”:\”\“},\“tso-save-interval\”:\“3s\”,\“tso-update-physical-interval\”:\“50ms\”,\“enable-local-tso\”:false,\“metric\”:{\“job\”:\“basic-pd-0\”,\“address\”:\”\“,\“interval\”:\“15s\”},\“schedule\”:{\“max-snapshot-count\”:64,\“max-pending-peer-count\”:64,\“max-merge-region-size\”:20,\“max-merge-region-keys\”:0,\“split-merge-interval\”:\“1h0m0s\”,\“swtich-witness-interval\”:\“1h0m0s\”,\“enable-one-way-merge\”:\“false\”,\“enable-cross-table-merge\”:\“true\”,\“patrol-region-interval\”:\“10ms\”,\“max-store-down-time\”:\“30m0s\”,\“max-store-preparing-time\”:\“48h0m0s\”,\“leader-schedule-limit\”:4,\“leader-schedule-policy\”:\“count\”,\“region-schedule-limit\”:2048,\“replica-schedule-limit\”:64,\“merge-schedule-limit\”:8,\“hot-region-schedule-limit\”:4,\“hot-region-cache-hits-threshold\”:3,\“store-limit\”:{},\“tolerant-size-ratio\”:0,\
“low-space-ratio\”:0.8,\“high-space-ratio\”:0.7,\“region-score-formula-version\”:\“v2\”,\“scheduler-max-waiting-operator\”:5,\“enable-remove-down-replica\”:\“true\”,\“enable-replace-offline-replica\”:\“true\”,\“enable-make-up-replica\”:\“true\”,\“enable-remove-extra-replica\”:\“true\”,\“enable-location-replacement\”:\“true\”,\“enable-debug-metrics\”:\“false\”,\“enable-joint-consensus\”:\“true\”,\“enable-tikv-split-region\”:\“true\”,\“schedulers-v2\”:[{\“type\”:\“balce-region\”,\“args\”:null,\“disable\”:false,\“args-payload\”:\”\“},{\“type\”:\“balce-leader\”,\“args\”:null,\“disable\”:false,\“args-payload\”:\”\“},{\“type\”:\“hot-region\”,\“args\”:null,\“disable\”:false,\“args-payload\”:\”\“},{\“type\”:\“split-bucket\”,\“args\”:null,\“disable\”:false,\“args-payload\”:\”\“}],\“schedulers-payload\”:null,\“store-limit-mode\”:\“manual\”,\“hot-regions-write-interval\”:\“10m0s\”,\“hot-regions-reserved-days\”:7,\“enable-diagnostic\”:\“false\”,\“enable-witness\”:\“false\”},\“replication\”:{\“max-replicas\”:3,\“location-labels\”:\”\“,\“strictly-match-label\”:\“false\”,\“enable-placement-rules\”:\“true\”,\“enable-placement-rules-cache\”:\“false\”,\“isolation-level\”:\”\“},\“pd-server\”:{\“use-region-storage\”:\“true\”,\“max-gap-reset-ts\”:\“24h0m0s\”,\“key-type\”:\“table\”,\“runtime-services\”:\”\“,\“metric-storage\”:\”\“,\“dashboard-address\”:\“auto\”,\“trace-region-flow\”:\“true\”,\“flow-round-by-digit\”:3,\“min-resolved-ts-persistence-interval\”:\“1s\”},\“cluster-version\”:\“0.0.0\”,\“labels\”:{},\“quota-backend-bytes\”:\“8GiB\”,\“auto-compaction-mode\”:\“periodic\”,\“auto-compaction-retention-v2\”:\“1h\”,\“TickInterval\”:\“500ms\”,\“ElectionInterval\”:\“3s\”,\“PreVote\”:true,\“max-request-bytes\”:157286400,\“security\”:{\“cacert-path\”:\”\“,\“cert-path\”:\”\“,\“key-path\”:\”\“,\“cert-allowed-cn\”:null,\“SSLCABytes\”:null,\“SSLCertBytes\”:null,\“SSLKEYBytes\”:null,\“redact-info-log\”:false,\“encryption\”:{\“data-encryption-method\”:\“plaintext\”,\“data-key-rotation-period\”:\
“168h0m0s\”,\“master-key\”:{\“type\”:\“plaintext\”,\“key-id\”:\”\“,\“region\”:\”\“,\“endpoint\”:\”\“,\“path\”:\”\“}}},\“label-property\”:null,\“WarningMsgs\”:null,\“DisableStrictReconfigCheck\”:false,\“HeartbeatStreamBindInterval\”:\“1m0s\”,\“LeaderPriorityCheckInterval\”:\“1m0s\”,\“dashboard\”:{\“tidb-cacert-path\”:\”\“,\“tidb-cert-path\”:\”\“,\“tidb-key-path\”:\”\“,\“public-path-prefix\”:\”\“,\“internal-proxy\”:false,\“enable-telemetry\”:false,\“enable-experimental\”:false},\“replication-mode\”:{\“replication-mode\”:\“majority\”,\“dr-auto-sync\”:{\“label-key\”:\”\“,\“primary\”:\”\“,\“dr\”:\”\“,\“primary-replicas\”:0,\“dr-replicas\”:0,\“wait-store-timeout\”:\“1m0s\”,\“pause-region-split\”:\“false\”}}}”]
[2024/09/03 10:14:42.289 +00:00] [INFO] [server.go:228] [“register REST path”] [path=/pd/api/v1]
[2024/09/03 10:14:42.289 +00:00] [INFO] [server.go:228] [“register REST path”] [path=/pd/api/v2/]
[2024/09/03 10:14:42.289 +00:00] [INFO] [server.go:228] [“register REST path”] [path=/autoscaling]
[2024/09/03 10:14:42.289 +00:00] [INFO] [distro.go:51] [“Using distribution strings”] [strings={}]
[2024/09/03 10:14:42.292 +00:00] [INFO] [server.go:228] [“register REST path”] [path=/dashboard/api/]
[2024/09/03 10:14:42.292 +00:00] [INFO] [server.go:228] [“register REST path”] [path=/dashboard/]
[2024/09/03 10:14:42.293 +00:00] [INFO] [etcd.go:117] [“configuring peer listeners”] [listen-peer-urls=”[http://0.0.0.0:2380]“]
[2024/09/03 10:14:42.293 +00:00] [INFO] [systimemon.go:30] [“start system time monitor”]
[2024/09/03 10:14:42.293 +00:00] [INFO] [etcd.go:127] [“configuring client listeners”] [listen-client-urls=”[http://0.0.0.0:2379]“]
[2024/09/03 10:14:42.293 +00:00] [INFO] [etcd.go:611] [“pprof is enabled”] [path=/debug/pprof]
[2024/09/03 10:14:42.294 +00:00] [INFO] [etcd.go:369] [“closing etcd server”] [name=basic-pd-0] [data-dir=/var/lib/pd] [advertise-peer-urls=”[http://basic-pd-0.basic-pd-peer.namespace.svc:2380]“] [advertise-client-urls=”[http://basic-pd-0.basic-pd-peer.namespace.svc:2379]“]
[2024/09/03 10:14:42.294 +00:00] [INFO] [etcd.go:373] [“closed etcd server”] [name=basic-pd-0] [data-dir=/var/lib/pd] [advertise-peer-urls=”[http://basic-pd-0.basic-pd-peer.namespace.svc:2380]“] [advertise-client-urls=”[http://basic-pd-0.basic-pd-peer.namespace.svc:2379]“]
[2024/09/03 10:14:42.294 +00:00] [FATAL] [main.go:120] [“run server failed”] [error=”[PD:etcd:ErrStartEtcd]error setting up initial cluster: URL must not contain a path: http://basic-pd-0.basic-pd-peer.namespace.svc:2380/: error setting up initial cluster: URL must not contain a path: http://basic-pd-0.basic-pd-peer.namespace.svc:2380/"] [stack=“main.main\n\t/home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/pd/cmd/pd-server/main.go:120\nruntime.main\n\t/usr/local/go/src/runtime/proc.go:250”]

有没有大神能解释一下最后这个报错的原因?我看了正常启动的时候,join 的成员也是 3 个啊,为啥说不能包含自己?

Answer

这个错误的原因是 PD 服务器的 join 配置中包含了一个无效的 URL。具体来说,问题出在以下配置行:

--join=http://basic-pd-2.basic-pd-peer.namespace.svc:2380,http://basic-pd-1.basic-pd-peer.namespace.svc:2380,http://basic-pd-0.basic-pd-peer.namespace.svc:2380/

错误信息指出 URL 不能包含路径,但最后一个 URL http://basic-pd-0.basic-pd-peer.namespace.svc:2380/ 末尾多了一个斜杠 /。这个多余的斜杠被解释为路径,导致了错误。

要解决这个问题,你需要移除最后一个 URL 末尾的斜杠。正确的配置应该是:

--join=http://basic-pd-2.basic-pd-peer.namespace.svc:2380,http://basic-pd-1.basic-pd-peer.namespace.svc:2380,http://basic-pd-0.basic-pd-peer.namespace.svc:2380

这个错误与 join 配置包含自身无关。PD 服务器在启动时可以包含自己的 URL 在 join 列表中,这是允许的。问题纯粹是由于 URL 格式不正确造成的。

要修复这个问题:

  1. 检查你的 PD 配置文件(通常是 pd.toml)。
  2. 找到 join 配置项。
  3. 确保所有的 URL 都不包含末尾的斜杠。
  4. 保存更改并重启 PD 服务器。

如果你使用的是 Kubernetes 部署,你可能需要更新你的 Kubernetes 配置文件(如 ConfigMap 或 StatefulSet 定义)来修复这个问题。

这个错误强调了在配置分布式系统时需要格外注意细节,即使是一个小小的斜杠也可能导致启动失败 [1][2]。

Footnotes

  1. PD Configuration File | PingCAP Docs
  2. TiDB Cluster Troubleshooting | PingCAP Docs

以上答案来自于 tidb.ai。可以试试有没有作用

下面的答案是我本来要说的:
既然你是tidb-operator部署的,拿出来你的tc看看,按理说你都不会动到pd的启动参数的。

1 个赞

看报错etcd无法启动。

配置文件看看

跟大佬们学习下。

etcd close?是不是etcd有问题

这个根本不是自己配的,这个是TiOperator自动配置的内容。

tc配置如下:
# TidbCluster manifest for the cluster "basic" in namespace "namespace".
# Declares three replicas each of PD, TiKV and TiDB at version v6.5.8.
# Every component is pinned to nodes carrying the
# node-role.kubernetes.io/master label and uses a required pod
# anti-affinity on kubernetes.io/hostname, so pods of the same component
# are never co-located on one node.
apiVersion: pingcap.com/v1alpha1
kind: TidbCluster
metadata:
  name: basic
  namespace: namespace
spec:
  version: v6.5.8
  preferIPv6: false
  timezone: UTC
  pvReclaimPolicy: Retain
  discovery: {}
  pd:
    podManagementPolicy: Parallel
    baseImage: pingcap/pd
    replicas: 3
    # if storageClassName is not set, the default Storage Class of the Kubernetes cluster will be used
    storageClassName: tidb-pd-storage-namespace
    requests:
      storage: "1Gi"
    # No PD configuration overrides.
    config: {}
    nodeSelector:
      node-role.kubernetes.io/master: ""
    affinity:
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
                - key: app.kubernetes.io/component
                  operator: "In"
                  values:
                    - "pd"
            topologyKey: "kubernetes.io/hostname"
  tikv:
    podManagementPolicy: Parallel
    baseImage: pingcap/tikv
    replicas: 3
    # if storageClassName is not set, the default Storage Class of the Kubernetes cluster will be used
    storageClassName: tidb-kv-storage-namespace
    requests:
      storage: "20Gi"
    # TiKV configuration overrides, nested as in the TiKV config file.
    config:
      raftdb:
        compaction-readahead-size: 2MiB
        defaultcf:
          max-write-buffer-number: 10
          target-file-size-base: 32MiB
      raftstore:
        sync-log: true
      readpool:
        coprocessor:
          use-unified-pool: true
        storage:
          use-unified-pool: true
        unified:
          max-thread-count: 10
      rocksdb:
        compaction-readahead-size: 2MiB
        defaultcf:
          max-write-buffer-number: 10
          target-file-size-base: 32MiB
      storage:
        block-cache:
          capacity: 2GiB
          shared: true
          strict-capacity-limit: true
        reserve-space: 0MB
    nodeSelector:
      node-role.kubernetes.io/master: ""
    affinity:
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
                - key: app.kubernetes.io/component
                  operator: "In"
                  values:
                    - "tikv"
            topologyKey: "kubernetes.io/hostname"
  tidb:
    podManagementPolicy: Parallel
    baseImage: pingcap/tidb
    replicas: 3
    limits:
      memory: 8Gi
    service:
      type: ClusterIP
    # TiDB configuration is passed as a literal TOML document.
    config: |
      [instance]
      tidb_slow_log_threshold = 10000
    nodeSelector:
      node-role.kubernetes.io/master: ""
    affinity:
      podAntiAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
                - key: app.kubernetes.io/component
                  operator: "In"
                  values:
                    - "tidb"
            topologyKey: "kubernetes.io/hostname"

etcd 是内嵌在 PD 里的,相关源码我也没找到,不知道为什么会莫名其妙地 close。

operator使用的是什么版本的

看你的tc没啥问题,那检查下operator的版本?

https://docs.pingcap.com/zh/tidb-in-kubernetes/stable/tidb-operator-overview

现在用的1.5.2版本

看报错是生成的启动命令行有问题。你要不试试升级下 tidb operator 到 1.6 版本,tidb 到 6.5.10

我准备回退到1.4.7

OK 了?

没有,回退到1.4.7发现这个版本不支持IPv6,难了! 现在升级到1.5.3再观察。

要不用更新的 Orz

在配置 PD 集群时,提供的 etcd 初始集群 URL 包含了一个路径,这是不被允许的。在 etcd 的 peer URL 中,路径部分(即 host:port 之后的部分)必须为空,因为 etcd 服务监听的是 IP 地址和端口,而不是特定的路径。把 http://basic-pd-0.basic-pd-peer.namespace.svc:2380/ 末尾的斜杠去掉试试。