TiDB 组件频繁重启

TiDB 版本:v5.4.3
TiDB 组件规格:16 核 CPU、32G 内存、500G HDD 磁盘

TiDB 组件会时常因 OOM 重启,导致上游连接断开,报错如下:
fatal error: runtime: out of memory

runtime stack:
runtime.sysMap(0xc754000000, 0x4000000, 0x646a6b0)
/usr/local/go/src/runtime/malloc.go:729 +0x1e5
runtime.(*mheap).grow(0x644ebe0, 0x23, 0x0)
/usr/local/go/src/runtime/mheap.go:1346 +0x85
runtime.(*mheap).allocSpan(0x644ebe0, 0x23, 0x100, 0x7f6e6e9fed40)
/usr/local/go/src/runtime/mheap.go:1173 +0x609
runtime.(*mheap).alloc.func1()
/usr/local/go/src/runtime/mheap.go:910 +0x59
runtime.systemstack(0x0)
/usr/local/go/src/runtime/asm_amd64.s:379 +0x66
runtime.mstart()
/usr/local/go/src/runtime/proc.go:1246

goroutine 12431894 [running]:
runtime.systemstack_switch()
/usr/local/go/src/runtime/asm_amd64.s:339 fp=0xc001880b70 sp=0xc001880b68 pc=0x135c680
runtime.(*mheap).alloc(0x644ebe0, 0x23, 0xc753fa0001, 0xc001880c60)
/usr/local/go/src/runtime/mheap.go:904 +0x85 fp=0xc001880bc0 sp=0xc001880b70 pc=0x1312da5
runtime.(*mcache).allocLarge(0x7f6f22781f18, 0x4445a, 0x100, 0x7f6f22781f18)
/usr/local/go/src/runtime/mcache.go:224 +0x97 fp=0xc001880c18 sp=0xc001880bc0 pc=0x1303417
runtime.mallocgc(0x4445a, 0x0, 0x0, 0xc7176da7a0)
/usr/local/go/src/runtime/malloc.go:1078 +0x925 fp=0xc001880ca0 sp=0xc001880c18 pc=0x12f8e65
github.com/pingcap/tidb/statistics.(*baseCollector).FromProto(0xc716b49030, 0xc715366d00)
github.com/pingcap/tidb/executor.(*AnalyzeColumnsExec).subMergeWorker(0xc05197db80, 0xc0574e4ae0, 0xc0574e4b40, 0x7, 0x3db3701)
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:1371 +0x1 fp=0xc001880fc0 sp=0xc001880fb8 pc=0x135e4c1
created by github.com/pingcap/tidb/executor.(*AnalyzeColumnsExec).buildSamplingStats
/home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/executor/analyze.go:915 +0x565

goroutine 1 [chan receive, 60 minutes]:
github.com/pingcap/tidb/server.(*Server).Run(0xc00b40e4e0, 0xc00b40b650, 0xc00b414540)
/home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/server/server.go:368 +0x17e
main.main()
/home/jenkins/agent/workspace/build-common/go/src/github.com/pingcap/tidb/tidb-server/main.go:218 +0x385

goroutine 1249 [IO wait]:
internal/poll.runtime_pollWait(0x7f6ef1ee1920, 0x72, 0xffffffffffffffff)
/usr/local/go/src/runtime/netpoll.go:222 +0x55
internal/poll.(*pollDesc).wait(0xc00cbb1598, 0x72, 0x8000, 0x8000, 0xffffffffffffffff)
/usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0x45
internal/poll.(*pollDesc).waitRead(…)
/usr/local/go/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Read(0xc00cbb1580, 0xc007e12000, 0x8000, 0x8000, 0x0, 0x0, 0x0)
/usr/local/go/src/internal/poll/fd_unix.go:166 +0x1d5
net.(*netFD).Read(0xc00cbb1580, 0xc007e12000, 0x8000, 0x8000, 0x189f85e, 0x400000801, 0xc000000000)
/usr/local/go/src/net/fd_posix.go:55 +0x4f
net.(*conn).Read(0xc000682338, 0xc007e12000, 0x8000, 0x8000, 0x0, 0x0, 0x0)
/usr/local/go/src/net/net.go:183 +0x91
bufio.(*Reader).Read(0xc004ec5440, 0xc004c5e740, 0x9, 0x9, 0xc007e12038, 0xc00cdc1dd0, 0x12f889b)
/usr/local/go/src/bufio/bufio.go:227 +0x222
io.ReadAtLeast(0x44a7a40, 0xc004ec5440, 0xc004c5e740, 0x9, 0x9, 0x9, 0xc71a255d88, 0x7b84b2ef01, 0xc00008e800)
/usr/local/go/src/io/io.go:328 +0x87
io.ReadFull(…)
/usr/local/go/src/io/io.go:347
golang.org/x/net/http2.readFrameHeader(0xc004c5e740, 0x9, 0x9, 0x44a7a40, 0xc004ec5440, 0x0, 0xc700000000, 0xc00cdc1e50, 0x12f8ff8)
/go/pkg/mod/golang.org/x/net@v0.0.0-20211112202133-69e39bad7dc2/http2/frame.go:237 +0x89
golang.org/x/net/http2.(*Framer).ReadFrame(0xc004c5e700, 0xc71a5ca130, 0x6432da0, 0x0, 0x0)
/go/pkg/mod/golang.org/x/net@v0.0.0-20211112202133-69e39bad7dc2/http2/frame.go:498 +0xa5
google.golang.org/grpc/internal/transport.(*http2Client).reader(0xc0006ac380)
/go/pkg/mod/google.golang.org/grpc@v1.29.1/internal/transport/http2_client.go:1273 +0x185
created by google.golang.org/grpc/internal/transport.newHTTP2Client
/go/pkg/mod/google.golang.org/grpc@v1.29.1/internal/transport/http2_client.go:300 +0xd51

goroutine 10 [select]:
go.opencensus.io/stats/view.(*worker).start(0xc00043ac00)
/go/pkg/mod/go.opencensus.io@v0.23.0/stats/view/worker.go:276 +0xcd
created by go.opencensus.io/stats/view.init.0
/go/pkg/mod/go.opencensus.io@v0.23.0/stats/view/worker.go:34 +0x68

goroutine 196 [chan receive]:
go.etcd.io/etcd/pkg/logutil.(*MergeLogger).outputLoop(0xc0006c1608)
/go/pkg/mod/go.etcd.io/etcd@v0.5.0-alpha.5.0.20210512015243-d19fbe541bf9/pkg/logutil/merge_logger.go:173 +0x3ac
created by go.etcd.io/etcd/pkg/logutil.NewMergeLogger
/go/pkg/mod/go.etcd.io/etcd@v0.5.0-alpha.5.0.20210512015243-d19fbe541bf9/pkg/logutil/merge_logger.go:91 +0x85

goroutine 199 [chan receive]:

参考OOM排查思路:
https://docs.pingcap.com/zh/tidb/dev/troubleshoot-tidb-oom

1 个赞

建议优化慢 SQL:大概率是返回大结果集的 SQL 在 TiDB 节点上做了大量内存计算导致 OOM。另外从堆栈中的 `AnalyzeColumnsExec.subMergeWorker` 看,本次内存分配发生在 ANALYZE 统计信息收集的合并阶段,也可以排查是否有大表自动 ANALYZE 占用过多内存。