K8S TiDB集群报错后无法重启

【 TiDB 使用环境】生产环境
【 TiDB 版本】v6.5.0
【复现路径】无
【遇到的问题:问题现象及影响】
TiDB集群中的TiDB节点内存和CPU升高,随后开始不断重启并重启失败,报错日志如下
看起来是在载入BindInfo的过程中使用了未初始化的指针导致了问题
删除Pod后无法恢复,目前所有TiDB Pod均已crash,导致集群处于不可用状态

 panic: runtime error: invalid memory address or nil pointer dereference [recovered]
     panic: runtime error: invalid memory address or nil pointer dereference
 [signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0x313b7ce]

 goroutine 1 [running]:
 github.com/pingcap/tidb/executor.(*Compiler).Compile.func1()
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/executor/compiler.go:72 +0x445
 panic({0x4318e40, 0x6ec6870})
     /usr/local/go/src/runtime/panic.go:884 +0x212
 github.com/pingcap/tidb/statistics/handle.(*Handle).GetPartitionStats(0xc00031d180?, 0x4f8da00?, 0x4f72898?, {0x0?, 0xc0046745a0?, 0x16?})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/statistics/handle/handle.go:997 +0x2e
 github.com/pingcap/tidb/statistics/handle.(*Handle).GetTableStats(...)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/statistics/handle/handle.go:992
 github.com/pingcap/tidb/planner/core.(*PlanBuilder).buildDataSource(0xc004a82d00, {0x4fafbb0, 0xc0046f7410}, 0xc0048c6ea0, 0xc0048b8c90)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/core/logical_plan_builder.go:4456 +0x9ce
 github.com/pingcap/tidb/planner/core.(*PlanBuilder).buildResultSetNode(0xc004a82d00, {0x4fafbb0?, 0xc0046f7410?}, {0x4fc96b0?, 0xc0048b8c40?}, 0x0?)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/core/logical_plan_builder.go:380 +0x19d
 github.com/pingcap/tidb/planner/core.(*PlanBuilder).buildJoin(0xc002b24f50?, {0x4fafbb0?, 0xc0046f7410?}, 0x16?)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/core/logical_plan_builder.go:720 +0x71d
 github.com/pingcap/tidb/planner/core.(*PlanBuilder).buildResultSetNode(0x0?, {0x4fafbb0?, 0xc0046f7410?}, {0x4fc8948?, 0xc001ea9950?}, 0x0?)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/core/logical_plan_builder.go:367 +0x271
 github.com/pingcap/tidb/planner/core.(*PlanBuilder).buildTableRefs(0xc004a82d00?, {0x4fafbb0?, 0xc0046f7410?}, 0x393d09e?)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/core/logical_plan_builder.go:359 +0x85
 github.com/pingcap/tidb/planner/core.(*PlanBuilder).buildSelect(0xc004a82d00, {0x4fafbb0, 0xc0046f7410}, 0xc0048ce000)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/core/logical_plan_builder.go:3916 +0x6c7
 github.com/pingcap/tidb/planner/core.(*PlanBuilder).Build(0xc004a82d00, {0x4fafbb0, 0xc0046f7410}, {0x4fc4080?, 0xc0048ce000?})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/core/planbuilder.go:804 +0x745
 github.com/pingcap/tidb/planner.buildLogicalPlan({0x4fafbb0, 0xc0046f7410}, {0x501e818?, 0xc00031d180}, {0x4fc4080, 0xc0048ce000}, 0xc004a82d00)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/optimize.go:461 +0x12f
 github.com/pingcap/tidb/planner.optimize({0x4fafbb0, 0xc0046f7410}, {0x501e818?, 0xc00031d180}, {0x4fc4080?, 0xc0048ce000?}, {0x4fe5b50, 0xc0046f7470})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/optimize.go:382 +0x473
 github.com/pingcap/tidb/planner.Optimize({0x4fafbb0, 0xc0046f7410}, {0x501e818, 0xc00031d180}, {0x4fc4080, 0xc0048ce000}, {0x4fe5b50, 0xc0046f7470})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/optimize.go:245 +0xf11
 github.com/pingcap/tidb/planner/core.(*PlanBuilder).buildExplain(0xc0047856c0, {0x4fafbb0, 0xc0046f7410}, 0xc0048b8cb0)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/core/planbuilder.go:4783 +0xd9
 github.com/pingcap/tidb/planner/core.(*PlanBuilder).Build(0xc0047856c0, {0x4fafbb0, 0xc0046f7410}, {0x4fc2c80?, 0xc0048b8cb0?})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/core/planbuilder.go:779 +0x432
 github.com/pingcap/tidb/planner.buildLogicalPlan({0x4fafbb0, 0xc0046f7410}, {0x501e818?, 0xc00031d180}, {0x4fc2c80, 0xc0048b8cb0}, 0xc0047856c0)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/optimize.go:461 +0x12f
 github.com/pingcap/tidb/planner.optimize({0x4fafbb0, 0xc0046f7410}, {0x501e818?, 0xc00031d180}, {0x4fc2c80?, 0xc0048b8cb0?}, {0x4fe5b50, 0xc0046f7470})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/optimize.go:382 +0x473
 github.com/pingcap/tidb/planner.Optimize({0x4fafbb0, 0xc0046f7410}, {0x501e818, 0xc00031d180}, {0x4fc2c80, 0xc0048b8cb0}, {0x4fe5b50, 0xc0046f7470})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/planner/optimize.go:245 +0xf11
 github.com/pingcap/tidb/executor.(*Compiler).Compile(0xc002b26fc8, {0x4fafbb0, 0xc0046f7410}, {0x4fc8580, 0xc0048b8cb0?})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/executor/compiler.go:116 +0x6f8
 github.com/pingcap/tidb/session.(*session).ExecuteStmt(0xc00031d180, {0x4fafbb0, 0xc0046f7410}, {0x4fc8580?, 0xc0048b8cb0})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/session/session.go:2171 +0x54e
 github.com/pingcap/tidb/session.(*session).ExecuteInternal(0xc00031d180, {0x4fafbb0, 0xc0046f7410}, {0xc0048be900, 0x11f}, {0x0, 0x0, 0x0})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/session/session.go:1674 +0x3f2
 github.com/pingcap/tidb/bindinfo.getHintsForSQL({0x501e818, 0xc00031d180}, {0xc00086e580, 0x109})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/bindinfo/handle.go:951 +0x177
 github.com/pingcap/tidb/bindinfo.(*BindRecord).prepareHints(0xc0046e94c0, {0x501e818, 0xc00031d180})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/bindinfo/bind_record.go:178 +0x1e7
 github.com/pingcap/tidb/bindinfo.(*BindHandle).newBindRecord(0xc00478ba80, {0xc0047fe2d0?, 0x1?})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/bindinfo/handle.go:723 +0xbcf
 github.com/pingcap/tidb/bindinfo.(*BindHandle).Update(0xc00478ba80, 0x1)
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/bindinfo/handle.go:173 +0x6c5
 github.com/pingcap/tidb/domain.(*Domain).LoadBindInfoLoop(0xc0016b8000, {0x501e818, 0xc00031d180}, {0x501e818, 0xc000742780})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/domain/domain.go:1444 +0xe5
 github.com/pingcap/tidb/session.BootstrapSession({0x4fd95f0, 0xc000d25900})
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/session/session.go:3301 +0x648
 main.createStoreAndDomain()
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/tidb-server/main.go:314 +0x1cb
 main.main()
     /home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/tidb-server/main.go:214 +0x2ca

无效的内存地址或者解引用一个空指针。容器挂载的磁盘有没有出问题?另外,CPU上升,是不是在执行一个DDL?是不是删除数据和DDL同时执行了?

监控集群的资源使用情况,确认是否有资源瓶颈

PV看起来是正常的,目前因为TiDB Pod全挂,已经拿不到日志了,不清楚当时发生了什么,但看监控CPU和内存在当时确实是急剧升高的

在集群崩溃前看CPU和Memory很高,但这个应该不至于引起这个空指针错误啊

这个崩溃堆栈,看是 tidb/statistics/handle/handle.go:997,感觉是和统计信息相关。 错误发生在 TiDB 处理 bindinfo 相关操作时 :thinking:

TiDB POD 扩容几个新的在不同物理机器上面,有同样的问题吗?

同样的,已经换了新的机器了,还是有这个问题

是的,我理解是初始化bindinfo时加载统计信息,此时就出现了统计信息(statsHandle)的未初始化指针

tidb Operator 用的不多,crash 状态不确定能否升级。我觉得可以试试升级到 6.5.10 。

因为在产线上现在不是很敢升级,想先把具体原因搞清楚然后看怎么处理

目前能排查到的是在GetPartitionStats中的statsCache := h.statsCache.Load().(statsCache)加载了一个空指针,导致TiDB无法启动
但我大致浏览了代码,看到statsHandle这个指针是在domain.go的LoadAndUpdateStatsLoop中才初始化的,而LoadAndUpdateStatsLoop在LoadBindInfoLoop后面,那按我的理解这个空指针错误好像是必然发生的。但其他环境是正常的,应该是中间漏了一些东西。有大佬知道这个statsHandle在正常情况下是怎么做到在LoadBindInfoLoop时就已经初始化的吗?

有没有可能这个集群加过 bind。

这个就不清楚了,因为现在集群中TiDB Pod全挂了,dashboard都看不到了

应该是命中了 bug https://github.com/pingcap/tidb/issues/40368 ,把镜像换成 v6.5.1 试下

感谢,就是这个问题,升级后解决了

此话题已在最后回复的 7 天后被自动关闭。不再允许新回复。