How StarRocks TabletChecker Guarantees Tablet Health and Scheduling
The article explains the purpose, configuration, and core implementation of StarRocks' TabletChecker component, detailing how it periodically scans OlapTable tablets, evaluates their health through multiple checks, and hands unhealthy tablets to the TabletScheduler for repair.
TabletChecker is a component in StarRocks FE whose sole responsibility is to scan all tablets and identify those that are in an unhealthy state; tablet repair, balancing, and other scheduling tasks are not its duties.
Related configuration items
// Interval, in seconds, between two check rounds (each round is one runAfterCatalogReady() call).
public static int tablet_sched_checker_interval_seconds = 20;
// Upper bound on tablets being scheduled: a check round is skipped when either the scheduler's
// pending count or its running count is greater than this value (default 10,000).
@ConfField(mutable = true, aliases = {"max_scheduling_tablets"})
public static int tablet_sched_max_scheduling_tablets = 10000;
/**
 * After checking tablet_checker_partition_batch_num partitions, the DB read lock is released
 * and immediately re-acquired, giving other threads a chance to take the lock in between.
 */
@ConfField(mutable = true)
public static int tablet_checker_partition_batch_num = 500;
In each cycle, TabletChecker examines all tablets of every OlapTable, creates TabletInfo for unhealthy tablets, and forwards them to TabletScheduler for repair.
/**
 * Runs one check round.
 *
 * <p>The round is a no-op in shared-data mode, and is skipped (with an info log) when the
 * scheduler's pending or running tablet count is above
 * {@code Config.tablet_sched_max_scheduling_tablets}. Otherwise all tablets are checked,
 * invalid urgent-table entries are cleaned, the round counter is bumped and the incremental
 * statistics are logged.
 */
@Override
protected void runAfterCatalogReady() {
    if (RunMode.isSharedDataMode()) {
        return;
    }

    final int pending = tabletScheduler.getPendingNum();
    final int running = tabletScheduler.getRunningNum();

    // Back off while the scheduler still has more work queued or in flight than the limit allows.
    final boolean schedulerSaturated = pending > Config.tablet_sched_max_scheduling_tablets
            || running > Config.tablet_sched_max_scheduling_tablets;
    if (schedulerSaturated) {
        LOG.info("too many tablets are being scheduled. pending: {}, running: {}, limit: {}. skip check",
                pending, running, Config.tablet_sched_max_scheduling_tablets);
        return;
    }

    checkAllTablets();
    cleanInvalidUrgentTable();

    stat.counterTabletCheckRound.incrementAndGet();
    LOG.info(stat.incrementalBrief());
}
The checkAllTablets method first checks urgent tablets and then non‑urgent tablets; both paths eventually call doCheck with the appropriate isUrgent flag.
Inside doCheck, the algorithm iterates over all databases, tables, partitions, and tablets, collecting statistics, handling lock‑release batching, evaluating tablet health, and adding unhealthy tablets to the scheduler with proper priority.
/**
 * Walks every database, table, partition and tablet (recycle bin included), evaluates each
 * tablet's health and submits unhealthy tablets to the tablet scheduler.
 *
 * <p>The database read lock is held while a database is scanned. Every
 * {@code Config.tablet_checker_partition_batch_num} partitions the lock is released and
 * re-acquired so other threads can make progress; afterwards the database, table and partition
 * are looked up again because they may have been dropped in between.
 *
 * @param isUrgent when {@code true}, only tables/partitions registered as urgent are checked;
 *                 when {@code false}, only non-urgent partitions are checked
 */
private void doCheck(boolean isUrgent) {
    // Per-round statistics.
    long start = System.nanoTime();
    long totalTabletNum = 0;
    long unhealthyTabletNum = 0;
    long addToSchedulerTabletNum = 0;
    long tabletInScheduler = 0;
    long tabletNotReady = 0;
    // Time spent holding the db read lock (ns), and the wait time reported by
    // blockingAddTabletCtxToScheduler.
    long lockTotalTime = 0;
    long waitTotalTime = 0;
    long lockStart;
    List<Long> dbIds = GlobalStateMgr.getCurrentState().getDbIdsIncludeRecycleBin();
    DATABASE:
    for (Long dbId : dbIds) {
        Database db = GlobalStateMgr.getCurrentState().getDbIncludeRecycleBin(dbId);
        if (db == null) {
            continue;
        }
        if (db.isSystemDatabase()) {
            continue;
        }
        // Snapshot the mutable config so the batch size stays constant for this database.
        int partitionBatchNum = Config.tablet_checker_partition_batch_num;
        int partitionChecked = 0;
        db.readLock();
        lockStart = System.nanoTime();
        try {
            List<Long> aliveBeIdsInCluster = GlobalStateMgr.getCurrentSystemInfo().getBackendIds(true);
            // Label is required: "continue TABLE" below jumps here when the table vanished
            // while the lock was released.
            TABLE:
            for (Table table : GlobalStateMgr.getCurrentState().getTablesIncludeRecycleBin(db)) {
                if (!table.needSchedule(false)) {
                    continue;
                }
                if (table.isCloudNativeTableOrMaterializedView()) {
                    continue;
                }
                if (isUrgent && !isUrgentTable(dbId, table.getId())) {
                    continue;
                }
                OlapTable olapTbl = (OlapTable) table;
                for (Partition partition : GlobalStateMgr.getCurrentState().getAllPartitionsIncludeRecycleBin(olapTbl)) {
                    partitionChecked++;
                    boolean isPartitionUrgent = isPartitionUrgent(dbId, table.getId(), partition.getId());
                    // Reset for every partition; cleared as soon as one unhealthy tablet is
                    // found in an urgent partition.
                    boolean isUrgentPartitionHealthy = true;
                    // The urgent pass handles only urgent partitions, the normal pass only the rest.
                    if ((isUrgent && !isPartitionUrgent) || (!isUrgent && isPartitionUrgent)) {
                        continue;
                    }
                    if (partitionChecked % partitionBatchNum == 0) {
                        LOG.debug("partition checked reached batch value, release lock");
                        lockTotalTime += System.nanoTime() - lockStart;
                        // Release and immediately re-acquire so other threads get a chance at the lock.
                        db.readUnlock();
                        db.readLock();
                        LOG.debug("checker get lock again");
                        lockStart = System.nanoTime();
                        // Anything may have been dropped while the lock was released: re-validate.
                        if (GlobalStateMgr.getCurrentState().getDbIncludeRecycleBin(dbId) == null) {
                            continue DATABASE;
                        }
                        if (GlobalStateMgr.getCurrentState().getTableIncludeRecycleBin(db, olapTbl.getId()) == null) {
                            continue TABLE;
                        }
                        if (GlobalStateMgr.getCurrentState().getPartitionIncludeRecycleBin(olapTbl, partition.getId()) == null) {
                            continue;
                        }
                    }
                    if (partition.getState() != PartitionState.NORMAL) {
                        continue;
                    }
                    short replicaNum = GlobalStateMgr.getCurrentState()
                            .getReplicationNumIncludeRecycleBin(olapTbl.getPartitionInfo(), partition.getId());
                    // -1 is treated as "replication num unavailable": skip the partition.
                    if (replicaNum == (short) -1) {
                        continue;
                    }
                    for (MaterializedIndex idx : partition.getMaterializedIndices(IndexExtState.VISIBLE)) {
                        for (Tablet tablet : idx.getTablets()) {
                            LocalTablet localTablet = (LocalTablet) tablet;
                            totalTabletNum++;
                            // Already queued or running in the scheduler: nothing to add.
                            if (tabletScheduler.containsTablet(tablet.getId())) {
                                tabletInScheduler++;
                                continue;
                            }
                            Pair<TabletStatus, TabletSchedCtx.Priority> statusWithPrio =
                                    localTablet.getHealthStatusWithPriority(
                                            GlobalStateMgr.getCurrentSystemInfo(),
                                            partition.getVisibleVersion(),
                                            replicaNum,
                                            aliveBeIdsInCluster);
                            if (statusWithPrio.first == TabletStatus.HEALTHY) {
                                localTablet.setLastStatusCheckTime(System.currentTimeMillis());
                                continue;
                            } else if (isPartitionUrgent) {
                                // Unhealthy tablets of urgent partitions are always VERY_HIGH.
                                statusWithPrio.second = TabletSchedCtx.Priority.VERY_HIGH;
                                isUrgentPartitionHealthy = false;
                            }
                            unhealthyTabletNum++;
                            if (!localTablet.readyToBeRepaired(statusWithPrio.first, statusWithPrio.second)) {
                                tabletNotReady++;
                                continue;
                            }
                            TabletSchedCtx tabletCtx = new TabletSchedCtx(
                                    TabletSchedCtx.Type.REPAIR,
                                    db.getId(), olapTbl.getId(),
                                    partition.getId(), idx.getId(), tablet.getId(),
                                    System.currentTimeMillis());
                            tabletCtx.setTabletStatus(statusWithPrio.first);
                            tabletCtx.setOrigPriority(statusWithPrio.second);
                            tabletCtx.setTablet(localTablet);
                            if (!tryChooseSrcBeforeSchedule(tabletCtx)) {
                                continue;
                            }
                            Pair<Boolean, Long> result = tabletScheduler.blockingAddTabletCtxToScheduler(
                                    db, tabletCtx, isPartitionUrgent);
                            waitTotalTime += result.second;
                            if (result.first) {
                                addToSchedulerTabletNum++;
                            }
                        }
                    }
                    // An urgent partition with no unhealthy tablet left no longer needs urgent treatment.
                    if (isUrgentPartitionHealthy && isPartitionUrgent) {
                        LOG.debug("partition is healthy, remove from urgent table: {}-{}-{}",
                                db.getId(), olapTbl.getId(), partition.getId());
                        removeFromUrgentTable(new RepairTabletInfo(db.getId(),
                                olapTbl.getId(), Lists.newArrayList(partition.getId())));
                    }
                }
            }
        } finally {
            lockTotalTime += System.nanoTime() - lockStart;
            db.readUnlock();
        }
    }
    // Record statistics and log a summary; nanosecond totals are converted to milliseconds.
    long cost = (System.nanoTime() - start) / 1000000;
    lockTotalTime = lockTotalTime / 1000000;
    stat.counterTabletCheckCostMs.addAndGet(cost);
    stat.counterTabletChecked.addAndGet(totalTabletNum);
    stat.counterUnhealthyTabletNum.addAndGet(unhealthyTabletNum);
    stat.counterTabletAddToBeScheduled.addAndGet(addToSchedulerTabletNum);
    // NOTE(review): waitTotalTime is subtracted from a millisecond value, so it is presumably
    // reported in milliseconds by blockingAddTabletCtxToScheduler - confirm.
    LOG.info("finished to check tablets. isUrgent: {}, unhealthy/total/added/in_sched/not_ready: {}/{}/{}/{}/{}, " +
                    "cost: {} ms, in lock time: {} ms, wait time: {}ms",
            isUrgent, unhealthyTabletNum, totalTabletNum, addToSchedulerTabletNum,
            tabletInScheduler, tabletNotReady, cost, lockTotalTime - waitTotalTime, waitTotalTime);
}
The health‑checking logic can be summarized as follows:
Alive check: count replicas that are alive.
Replica count check: ensure enough alive replicas exist.
Version completeness check: verify replicas have the latest visible version.
Backend stability check: skip replicas on decommissioned or dead backends.
Disk stability check: skip replicas on decommissioned disks.
Redundancy check: detect and handle excess replicas.
Overall health: if all checks pass, the tablet is considered healthy.
The method getHealthStatusWithPriorityUnlocked returns a Pair whose first generic type is TabletStatus (an enum such as HEALTHY, REPLICA_MISSING, VERSION_INCOMPLETE, REDUNDANT, NEED_FURTHER_REPAIR, etc.) and whose second generic type is a priority enum (VERY_HIGH, HIGH, NORMAL, LOW).
/**
 * Classifies this tablet's health and the priority with which it should be repaired.
 *
 * <p>Replicas are first tallied into four nested counters (alive, alive with a complete
 * version, on a non-decommissioned backend, on a non-decommissioned disk). The counters are then
 * compared against {@code replicationNum} in a fixed order; the first failing stage decides the
 * returned status.
 *
 * @param systemInfoService   used to resolve backends and the available backend ids
 * @param visibleVersion      version a replica must have reached to count as version-complete
 * @param replicationNum      expected number of replicas
 * @param aliveBeIdsInCluster ids of the alive backends in the cluster
 * @return the tablet status paired with its scheduling priority
 */
private Pair<TabletStatus, TabletSchedCtx.Priority> getHealthStatusWithPriorityUnlocked(
        SystemInfoService systemInfoService,
        long visibleVersion,
        int replicationNum,
        List<Long> aliveBeIdsInCluster) {
    int aliveCount = 0;
    int versionCompleteCount = 0;
    int stableBackendCount = 0;
    int stableDiskCount = 0;
    Replica furtherRepairCandidate = null;
    Set<String> hosts = Sets.newHashSet();

    // Tally replicas; each counter is a subset of the previous one.
    for (Replica replica : replicas) {
        Backend backend = systemInfoService.getBackend(replica.getBackendId());
        boolean unusable = isReplicaBackendDropped(backend)
                || isReplicaBackendDead(backend)
                || isReplicaStateAbnormal(replica, backend, hosts);
        if (!unusable) {
            aliveCount++;
            // Remember only the first alive replica that asks for further repair.
            if (replica.needFurtherRepair() && furtherRepairCandidate == null) {
                furtherRepairCandidate = replica;
            }
            boolean versionComplete =
                    replica.getLastFailedVersion() <= 0 && replica.getVersion() >= visibleVersion;
            if (versionComplete) {
                versionCompleteCount++;
                if (!backend.isDecommissioned()) {
                    stableBackendCount++;
                    if (!backend.isDiskDecommissioned(replica.getPathHash())) {
                        stableDiskCount++;
                    }
                }
            }
        }
    }

    // Majority threshold used by several stages below.
    final int quorum = (replicationNum / 2) + 1;

    // Stage 1: not enough alive replicas.
    int aliveBackendsNum = aliveBeIdsInCluster.size();
    if (needRecoverWithEmptyTablet(systemInfoService)) {
        LOG.info("need to forcefully recover with empty tablet for {}, replica info:{}", id, getReplicaInfos());
        return createRedundantSchedCtx(TabletStatus.FORCE_REDUNDANT, Priority.VERY_HIGH, furtherRepairCandidate);
    }
    // Example: 3 replicas on 3 BEs with one bad replica - a replica has to be force-deleted
    // first to free space for a new one.
    boolean mustDropReplicaFirst = aliveCount < replicationNum
            && replicas.size() >= aliveBackendsNum
            && aliveBackendsNum >= replicationNum
            && replicationNum > 1;
    if (mustDropReplicaFirst) {
        return createRedundantSchedCtx(TabletStatus.FORCE_REDUNDANT, Priority.VERY_HIGH, furtherRepairCandidate);
    }
    List<Long> availableBEs = systemInfoService.getAvailableBackendIds();
    if (availableBEs.size() > aliveCount) {
        if (aliveCount < quorum) {
            return Pair.create(TabletStatus.REPLICA_MISSING, Priority.HIGH);
        }
        if (aliveCount < replicationNum) {
            return Pair.create(TabletStatus.REPLICA_MISSING, Priority.NORMAL);
        }
    }

    // Stage 2: not enough (or too many) version-complete replicas.
    if (versionCompleteCount < quorum) {
        return Pair.create(TabletStatus.VERSION_INCOMPLETE, Priority.HIGH);
    }
    if (versionCompleteCount < replicationNum) {
        return Pair.create(TabletStatus.VERSION_INCOMPLETE, Priority.NORMAL);
    }
    if (versionCompleteCount > replicationNum) {
        return createRedundantSchedCtx(TabletStatus.REDUNDANT, Priority.VERY_HIGH, furtherRepairCandidate);
    }

    // Stage 3: replicas sitting on decommissioned backends must be relocated.
    if (stableBackendCount < replicationNum) {
        TabletSchedCtx.Priority relocatePriority =
                stableBackendCount < quorum ? Priority.NORMAL : Priority.LOW;
        List<Long> replicaBeIds = replicas.stream()
                .map(Replica::getBackendId).collect(Collectors.toList());
        List<Long> availableBeIds = aliveBeIdsInCluster.stream()
                .filter(systemInfoService::checkBackendAvailable)
                .collect(Collectors.toList());
        // Every available backend already hosts a replica: one has to be dropped first.
        boolean allAvailableBesOccupied = replicaBeIds.containsAll(availableBeIds)
                && availableBeIds.size() >= replicationNum
                && replicationNum > 1;
        if (allAvailableBesOccupied) {
            return createRedundantSchedCtx(TabletStatus.FORCE_REDUNDANT, relocatePriority,
                    furtherRepairCandidate);
        }
        return Pair.create(TabletStatus.REPLICA_RELOCATING, relocatePriority);
    }

    // Stage 4: replicas sitting on decommissioned disks must be migrated.
    if (stableDiskCount < replicationNum) {
        return Pair.create(TabletStatus.DISK_MIGRATION, Priority.NORMAL);
    }

    // Stage 5: more replicas than required.
    if (replicas.size() > replicationNum) {
        return createRedundantSchedCtx(TabletStatus.REDUNDANT, Priority.VERY_HIGH, furtherRepairCandidate);
    }

    // Stage 6: every check passed.
    return Pair.create(TabletStatus.HEALTHY, Priority.NORMAL);
}
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Big Data Technology Tribe
Focused on computer science and cutting‑edge tech, we distill complex knowledge into clear, actionable insights. We track tech evolution, share industry trends and deep analysis, helping you keep learning, boost your technical edge, and ride the digital wave forward.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
