tikv异常重启

【TiDB 使用环境】生产环境
【操作系统】Anolis OS 8.8
【部署方式】阿里云 本地SSD
【集群数据量】2.5T
【集群节点数】15
【问题复现路径】暂时无法复现
【遇到的问题:问题现象及影响】某个tikv节点出现异常重启
【资源配置】进入到 TiDB Dashboard -集群信息 (Cluster Info) -主机(Hosts) 截图此页面


【复制黏贴 ERROR 报错的日志】
tikv日志:

[2025/04/25 10:28:53.000 +08:00] [FATAL] [lib.rs:512] ["region 3291678531 commit_ts: TimeStamp(457584969468084514), resolved_ts: TimeStamp(457584969638739998)"] [backtrace="   0: tikv_util::set_panic_hook::{{closure}}
             at /workspace/source/tikv/components/tikv_util/src/lib.rs:511:18
   1: <alloc::boxed::Box<F,A> as core::ops::function::Fn<Args>>::call
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/alloc/src/boxed.rs:2032:9
      std::panicking::rust_panic_with_hook
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panicking.rs:692:13
   2: std::panicking::begin_panic_handler::{{closure}}
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panicking.rs:579:13
   3: std::sys_common::backtrace::__rust_end_short_backtrace
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/sys_common/backtrace.rs:137:18
   4: rust_begin_unwind
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panicking.rs:575:5
   5: core::panicking::panic_fmt
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/panicking.rs:65:14
   6: cdc::delegate::Delegate::sink_txn_put
             at /workspace/source/tikv/components/cdc/src/delegate.rs:929:21
      cdc::delegate::Delegate::sink_put
             at /workspace/source/tikv/components/cdc/src/delegate.rs:889:13
      cdc::delegate::Delegate::sink_data
             at /workspace/source/tikv/components/cdc/src/delegate.rs:694:21
   7: cdc::delegate::Delegate::on_batch
             at /workspace/source/tikv/components/cdc/src/delegate.rs:561:17
   8: cdc::endpoint::Endpoint<T,E,S>::on_multi_batch
             at /workspace/source/tikv/components/cdc/src/endpoint.rs:889:33
      <cdc::endpoint::Endpoint<T,E,S> as tikv_util::worker::pool::Runnable>::run
             at /workspace/source/tikv/components/cdc/src/endpoint.rs:1283:18
   9: tikv_util::worker::pool::Worker::start_with_timer_impl::{{closure}}
             at /workspace/source/tikv/components/tikv_util/src/worker/pool.rs:506:25
      <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/future/mod.rs:91:19
      <tracker::tls::TrackedFuture<F> as core::future::future::Future>::poll::{{closure}}
             at /workspace/source/tikv/components/tracker/src/tls.rs:64:23
      std::thread::local::LocalKey<T>::try_with
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/thread/local.rs:446:16
      std::thread::local::LocalKey<T>::with
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/thread/local.rs:422:9
      <tracker::tls::TrackedFuture<F> as core::future::future::Future>::poll
             at /workspace/source/tikv/components/tracker/src/tls.rs:62:9
      <futures_util::future::future::map::Map<Fut,F> as core::future::future::Future>::poll
             at /workspace/.cargo/registry/src/mirrors.tuna.tsinghua.edu.cn-df7c3c540f42cdbd/futures-util-0.3.31/src/future/future/map.rs:55:37
      <futures_util::future::future::Map<Fut,F> as core::future::future::Future>::poll
             at /workspace/.cargo/registry/src/mirrors.tuna.tsinghua.edu.cn-df7c3c540f42cdbd/futures-util-0.3.31/src/lib.rs:86:13
      yatp::task::future::RawTask<F>::poll
             at /workspace/.cargo/git/checkouts/yatp-e704b73c3ee279b6/5572a78/src/task/future.rs:59:9
  10: yatp::task::future::TaskCell::poll
             at /workspace/.cargo/git/checkouts/yatp-e704b73c3ee279b6/5572a78/src/task/future.rs:103:9
      <yatp::task::future::Runner as yatp::pool::runner::Runner>::handle
             at /workspace/.cargo/git/checkouts/yatp-e704b73c3ee279b6/5572a78/src/task/future.rs:387:20
  11: <tikv_util::yatp_pool::YatpPoolRunner<T> as yatp::pool::runner::Runner>::handle
             at /workspace/source/tikv/components/tikv_util/src/yatp_pool/mod.rs:199:24
      yatp::pool::worker::WorkerThread<T,R>::run
             at /workspace/.cargo/git/checkouts/yatp-e704b73c3ee279b6/5572a78/src/pool/worker.rs:48:13
      yatp::pool::builder::LazyBuilder<T>::build::{{closure}}
             at /workspace/.cargo/git/checkouts/yatp-e704b73c3ee279b6/5572a78/src/pool/builder.rs:114:25
      std::sys_common::backtrace::__rust_begin_short_backtrace
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/sys_common/backtrace.rs:121:18
  12: std::thread::Builder::spawn_unchecked_::{{closure}}::{{closure}}
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/thread/mod.rs:551:17
      <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/panic/unwind_safe.rs:271:9
      std::panicking::try::do_call
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panicking.rs:483:40
      std::panicking::try
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panicking.rs:447:19
      std::panic::catch_unwind
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panic.rs:137:14
      std::thread::Builder::spawn_unchecked_::{{closure}}
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/thread/mod.rs:550:30
      core::ops::function::FnOnce::call_once{{vtable.shim}}
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ops/function.rs:513:5
  13: <alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/alloc/src/boxed.rs:2000:9
      <alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/alloc/src/boxed.rs:2000:9
      std::sys::unix::thread::Thread::new::thread_start
             at /root/.rustup/toolchains/nightly-2022-11-15-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/sys/unix/thread.rs:108:17
  14: start_thread
  15: clone
"] [location=components/cdc/src/delegate.rs:929] [thread_name=cdc-0] [thread_id=25]

pd日志:
[2025/04/25 10:27:18.616 +08:00] [INFO] [region.go:740] ["region Version changed"] [region-id=4986928619] [detail="StartKey Changed:{7480000000000002FF4D5F698000000000FF0000040380000000FF10481D30038005EFFF7CAE82F33E038000FF0000000000010380FF0000000000031B03FF80000019C3CF668AFF038000000251EF54FF6400000000000000F8} -> {7480000000000002FF4D5F698000000000FF0000040380000000FF10481F7F038005EFFF7CAE831D30038000FF0000000000020380FF00000000044C7703FF80000019C42E88D7FF0380000002573AFCFF8400000000000000F8}, EndKey:{7480000000000002FF4D5F698000000000FF0000040380000000FF104890DB038005EFFF7CAE831ABE038000FF0000000000020380FF0000000000006C03FF80000019C3D22097FF03800000025536D4FF9800000000000000F8}"] [old-version=13642] [new-version=13643]
[2025/04/25 10:27:18.616 +08:00] [INFO] [cluster_worker.go:237] ["region batch split, generate new regions"] [region-id=4986928619] [origin="id:4986937007 start_key:\"7480000000000002FF4D5F698000000000FF0000040380000000FF10481D30038005EFFF7CAE82F33E038000FF0000000000010380FF0000000000031B03FF80000019C3CF668AFF038000000251EF54FF6400000000000000F8\" end_key:\"7480000000000002FF4D5F698000000000FF0000040380000000FF10481F7F038005EFFF7CAE831D30038000FF0000000000020380FF00000000044C7703FF80000019C42E88D7FF0380000002573AFCFF8400000000000000F8\" region_epoch:<conf_ver:11419 version:13643 > peers:<id:4986937008 store_id:1013675188 > peers:<id:4986937009 store_id:4569495272 > peers:<id:4986937010 store_id:3294463243 >"] [total=1]
[2025/04/25 10:28:12.439 +08:00] [INFO] [grpc_service.go:1948] ["update service GC safe point"] [service-id=ticdc-default-14951988490289429887] [expire-at=1745555292] [safepoint=457584958576001081]
【其他附件:截图/日志/监控】

看上去就是这个bug。

问题是这个bug的修复没有合并到7.5这个分支上,而是直接合并到了master。

7.5 版本目前还没有包含这个修复的子版本;在 2024 年 6 月 17 日之后发布的主版本,以及在此之后发布的 8.1 子版本,都已包含这个修复。

目前来说,只有 8.1.1 及之后的 8.1 子版本,以及 8.5.0 及之后的版本才包含这个修复。

2 个赞

还有个类似的 issue:https://github.com/tikv/tikv/issues/16776 ,不知道是不是同一个问题。