[Enhancement] Add more failure reasons for tablet clone (backport #62293) (#62380)

Signed-off-by: wyb <wybb86@gmail.com>
Co-authored-by: wyb <wybb86@gmail.com>
This commit is contained in:
mergify[bot] 2025-08-27 09:09:53 +00:00 committed by GitHub
parent d4d0774dda
commit 56c73c2443
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 169 additions and 9 deletions

View File

@ -403,6 +403,10 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
this.errMsg = errMsg;
}
/**
 * Returns the error message currently stored on this scheduling context.
 *
 * @return the stored error message; may be {@code null} when the recorded
 *         message was itself {@code null} (for example an exception without a message)
 */
public String getErrMsg() {
    return this.errMsg;
}
/**
 * Returns the current value of the {@code copySize} field of this context.
 *
 * @return the stored copy size (unit not visible here; presumably bytes, confirm against the field's writer)
 */
public long getCopySize() {
    return this.copySize;
}
@ -602,7 +606,8 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
*/
List<Replica> candidates = getHealthyReplicas();
if (candidates.isEmpty()) {
throw new SchedException(Status.UNRECOVERABLE, "unable to find source replica");
throw new SchedException(Status.UNRECOVERABLE,
"unable to find source replica. replicas: " + tablet.getReplicaInfos());
}
// Shuffle the candidate list first so that we won't always choose the same replica with
@ -702,7 +707,8 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
}
if (chosenReplica == null) {
throw new SchedException(Status.UNRECOVERABLE, "unable to choose dest replica(maybe no incomplete replica");
throw new SchedException(Status.UNRECOVERABLE,
"unable to choose dest replica(maybe no incomplete replica). replicas: " + tablet.getReplicaInfos());
}
// check if the dest replica has available slot

View File

@ -548,7 +548,7 @@ public class TabletScheduler extends FrontendDaemon {
LOG.debug("pending tablets current count: {}\n{}", pendingTablets.size(), sb);
}
private boolean checkIfTabletExpired(TabletSchedCtx ctx) {
/**
 * Convenience overload that delegates to the three-argument
 * {@code checkIfTabletExpired}, supplying the recycle bin obtained from
 * {@code GlobalStateMgr.getCurrentState()} and the current value of
 * {@code System.currentTimeMillis()}.
 *
 * NOTE(review): the non-private visibility looks intended to let tests
 * replace this method (e.g. via a mock-up) -- confirm before narrowing it.
 *
 * @param ctx the tablet scheduling context to check
 * @return whatever the three-argument overload returns for {@code ctx}
 */
protected boolean checkIfTabletExpired(TabletSchedCtx ctx) {
return checkIfTabletExpired(ctx, GlobalStateMgr.getCurrentState().getRecycleBin(), System.currentTimeMillis());
}
@ -647,7 +647,9 @@ public class TabletScheduler extends FrontendDaemon {
LOG.warn("got unexpected exception, discard this schedule. tablet: {}",
tabletCtx.getTabletId(), e);
stat.counterTabletScheduledFailed.incrementAndGet();
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, e.getMessage());
String errMsg = e.getMessage();
tabletCtx.setErrMsg(errMsg);
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, errMsg);
continue;
}
@ -1109,7 +1111,8 @@ public class TabletScheduler extends FrontendDaemon {
} finally {
locker.unLockDatabase(db.getId(), LockType.WRITE);
}
throw new SchedException(Status.UNRECOVERABLE, "unable to delete any redundant replicas");
throw new SchedException(Status.UNRECOVERABLE, "unable to delete any redundant replicas. replicas: " +
tabletCtx.getTablet().getReplicaInfos());
}
private boolean deleteBackendDropped(TabletSchedCtx tabletCtx, boolean force) throws SchedException {
@ -1343,7 +1346,8 @@ public class TabletScheduler extends FrontendDaemon {
deleteReplicaInternal(tabletCtx, replica, "colocate redundant", forceDropBad);
throw new SchedException(Status.FINISHED, "colocate redundant replica is deleted");
}
throw new SchedException(Status.UNRECOVERABLE, "unable to delete any colocate redundant replicas");
throw new SchedException(Status.UNRECOVERABLE, "unable to delete any colocate redundant replicas. replicas: " +
tabletCtx.getTablet().getReplicaInfos() + ", backend set: " + backendSet);
} finally {
locker.unLockDatabase(db.getId(), LockType.WRITE);
}
@ -1853,7 +1857,9 @@ public class TabletScheduler extends FrontendDaemon {
LOG.warn("got unexpected exception when finish clone task. tablet: {}",
tabletCtx.getTabletId(), e);
stat.counterTabletScheduledDiscard.incrementAndGet();
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, e.getMessage());
String errMsg = e.getMessage();
tabletCtx.setErrMsg(errMsg);
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, errMsg);
return;
}

View File

@ -173,7 +173,8 @@ public class TabletSchedCtxTest {
Config.recover_with_empty_tablet = false;
SchedException schedException = Assertions.assertThrows(SchedException.class, () -> tabletScheduler
.handleTabletByTypeAndStatus(LocalTablet.TabletHealthStatus.REPLICA_MISSING, ctx, agentBatchTask));
Assertions.assertEquals("unable to find source replica", schedException.getMessage());
Assertions.assertEquals("unable to find source replica. replicas: 10001:-1/-1/-1/0:NORMAL:NIL,",
schedException.getMessage());
Config.recover_with_empty_tablet = true;
tabletScheduler.handleTabletByTypeAndStatus(LocalTablet.TabletHealthStatus.REPLICA_MISSING, ctx, agentBatchTask);

View File

@ -14,7 +14,9 @@
package com.starrocks.clone;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.starrocks.catalog.CatalogRecycleBin;
import com.starrocks.catalog.ColocateTableIndex;
import com.starrocks.catalog.Column;
@ -22,7 +24,11 @@ import com.starrocks.catalog.DataProperty;
import com.starrocks.catalog.Database;
import com.starrocks.catalog.FakeEditLog;
import com.starrocks.catalog.LocalTablet;
import com.starrocks.catalog.MaterializedIndex;
import com.starrocks.catalog.OlapTable;
import com.starrocks.catalog.Partition;
import com.starrocks.catalog.PartitionInfo;
import com.starrocks.catalog.PhysicalPartition;
import com.starrocks.catalog.RecyclePartitionInfo;
import com.starrocks.catalog.RecycleRangePartitionInfo;
import com.starrocks.catalog.Replica;
@ -32,6 +38,7 @@ import com.starrocks.catalog.TabletInvertedIndex;
import com.starrocks.catalog.TabletMeta;
import com.starrocks.catalog.Type;
import com.starrocks.common.Config;
import com.starrocks.common.ExceptionChecker;
import com.starrocks.common.Pair;
import com.starrocks.common.jmockit.Deencapsulation;
import com.starrocks.common.util.concurrent.lock.LockManager;
@ -44,7 +51,9 @@ import com.starrocks.server.GlobalStateMgr;
import com.starrocks.server.NodeMgr;
import com.starrocks.system.Backend;
import com.starrocks.system.SystemInfoService;
import com.starrocks.task.CloneTask;
import com.starrocks.task.CreateReplicaTask;
import com.starrocks.thrift.TBackend;
import com.starrocks.thrift.TCompressionType;
import com.starrocks.thrift.TDisk;
import com.starrocks.thrift.TFinishTaskRequest;
@ -56,9 +65,10 @@ import com.starrocks.thrift.TTabletSchema;
import com.starrocks.thrift.TTabletType;
import com.starrocks.transaction.GtidGenerator;
import mockit.Expectations;
import mockit.Mock;
import mockit.MockUp;
import mockit.Mocked;
import org.apache.commons.lang3.tuple.Triple;
import org.assertj.core.util.Lists;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -477,4 +487,141 @@ public class TabletSchedulerTest {
tabletScheduler.finishCreateReplicaTask(createReplicaTask, request);
Assertions.assertEquals(Replica.ReplicaState.NORMAL, replica.getState());
}
/**
 * Verifies that an unexpected exception raised while scheduling a pending tablet
 * leaves the context in {@code UNEXPECTED} state and that the exception's message
 * is what ends up stored on the context.
 *
 * The materialized index is created without any tablet, so the tablet lookup
 * during scheduling yields {@code null} and a null-check fails. That exception
 * carries no message, hence the stored error message is expected to be {@code null}.
 */
@Test
public void testScheduleTabletException() {
    long dbId = 10002L;
    long tblId = 10003L;
    long partitionId = 10004L;
    long physicalPartitionId = 10004L;
    long indexId = 10005L;
    long tabletId = 10006L;

    Database db = new Database(dbId, "db");
    OlapTable table = new OlapTable(tblId, "table", null, null, null, null);
    MaterializedIndex index = new MaterializedIndex(indexId);
    PhysicalPartition physicalPartition =
            new PhysicalPartition(physicalPartitionId, "physical_part", partitionId, index);
    Partition partition = new Partition(partitionId, physicalPartitionId, "partition", index, null);
    ColocateTableIndex colocateTableIndex = new ColocateTableIndex();

    // Recording order matters: each minTimes/result pair applies to the call recorded just before it.
    new Expectations() {
        {
            globalStateMgr.getColocateTableIndex();
            minTimes = 0;
            result = colocateTableIndex;

            globalStateMgr.getLocalMetastore().getDbIncludeRecycleBin(dbId);
            minTimes = 0;
            result = db;

            globalStateMgr.getLocalMetastore().getTableIncludeRecycleBin(db, tblId);
            minTimes = 0;
            result = table;

            globalStateMgr.getLocalMetastore().getPhysicalPartitionIncludeRecycleBin(table, physicalPartitionId);
            minTimes = 0;
            result = physicalPartition;

            globalStateMgr.getLocalMetastore().getPartitionIncludeRecycleBin(table, partitionId);
            minTimes = 0;
            result = partition;

            globalStateMgr.getLocalMetastore().getReplicationNumIncludeRecycleBin((PartitionInfo) any, partitionId);
            minTimes = 0;
            result = 3;

            globalStateMgr.getLocalMetastore().getDataPropertyIncludeRecycleBin((PartitionInfo) any, partitionId);
            minTimes = 0;
            result = new DataProperty(TStorageMedium.HDD);
        }
    };

    // Force the expiry check to report "not expired" so scheduling proceeds to the failing step.
    new MockUp<TabletScheduler>() {
        @Mock
        private boolean checkIfTabletExpired(TabletSchedCtx ctx) {
            return false;
        }
    };

    TabletSchedCtx ctx = new TabletSchedCtx(TabletSchedCtx.Type.REPAIR, dbId, tblId, partitionId, indexId, tabletId,
            System.currentTimeMillis());
    LocalTablet tablet = new LocalTablet(tabletId);
    ctx.setTablet(tablet);

    TabletScheduler tabletScheduler = new TabletScheduler(new TabletSchedulerStat());
    Deencapsulation.invoke(tabletScheduler, "addToPendingTablets", ctx);
    Deencapsulation.invoke(tabletScheduler, "schedulePendingTablets");

    Assertions.assertEquals(TabletSchedCtx.State.UNEXPECTED, ctx.getState());
    // index.getTablet returns null
    // failed at Preconditions.checkNotNull(tablet);
    Assertions.assertNull(ctx.getErrMsg());
}
/**
 * Verifies that an unexpected exception raised while finishing a clone task
 * leaves the context in {@code UNEXPECTED} state and that the exception's message
 * is what ends up stored on the context.
 *
 * The clone task is built with task version 1, so the version precondition in
 * the finish path fails. That failure carries no message, hence the stored error
 * message is expected to be {@code null}.
 */
@Test
public void testFinishCloneTaskException() {
    long beId = 10001L;
    long dbId = 10002L;
    long tblId = 10003L;
    long partitionId = 10004L;
    long indexId = 10005L;
    long tabletId = 10006L;

    TabletSchedCtx ctx = new TabletSchedCtx(TabletSchedCtx.Type.REPAIR, dbId, tblId, partitionId, indexId, tabletId,
            System.currentTimeMillis());
    LocalTablet tablet = new LocalTablet(tabletId);
    ctx.setTablet(tablet);
    ctx.setState(TabletSchedCtx.State.RUNNING);

    // taskVersion is VERSION_1
    CloneTask task = new CloneTask(beId, "127.0.0.1", dbId, tblId, partitionId, indexId, tabletId, 0,
            Arrays.asList(new TBackend("host1", 8290, 8390)), TStorageMedium.HDD, 2L, 3600);

    TabletScheduler tabletScheduler = new TabletScheduler(new TabletSchedulerStat());
    // The context must be registered as running so the finish call can find it.
    Deencapsulation.invoke(tabletScheduler, "addToRunningTablets", ctx);
    tabletScheduler.finishCloneTask(task, new TFinishTaskRequest());

    Assertions.assertEquals(TabletSchedCtx.State.UNEXPECTED, ctx.getState());
    // failed at Preconditions.checkArgument(cloneTask.getTaskVersion() == CloneTask.VERSION_2);
    Assertions.assertNull(ctx.getErrMsg());
}
/**
 * Checks the failure message produced by {@code handleColocateRedundant} when it
 * cannot delete any replica: the exception text must include both the tablet's
 * replica infos and the colocate backend set.
 *
 * The tablet has a single NORMAL replica on backend 10001 and the colocate group
 * backend set is exactly {10001}; presumably that leaves nothing redundant to drop.
 */
@Test
public void testHandleColocateRedundantNoRedundantReplicas() {
long beId = 10001L;
long dbId = 10002L;
long tblId = 10003L;
long partitionId = 10004L;
long physicalPartitionId = 10004L;
long indexId = 10005L;
long tabletId = 10006L;
long replicaId = 10007L;
Database db = new Database(dbId, "db");
OlapTable table = new OlapTable(tblId, "table", null, null, null, null);
// One NORMAL replica hosted on beId is the tablet's only replica.
Replica replica = new Replica(replicaId, beId, 0, Replica.ReplicaState.NORMAL);
LocalTablet tablet = new LocalTablet(tabletId, Lists.newArrayList(replica));
MaterializedIndex index = new MaterializedIndex(indexId);
index.addTablet(tablet, new TabletMeta(dbId, tblId, physicalPartitionId, indexId, TStorageMedium.HDD));
PhysicalPartition physicalPartition = new PhysicalPartition(physicalPartitionId, "physical_part", partitionId, index);
// Recording order matters: each minTimes/result pair applies to the call recorded just before it.
new Expectations() {
{
globalStateMgr.getLocalMetastore().getDbIncludeRecycleBin(dbId);
minTimes = 0;
result = db;
globalStateMgr.getLocalMetastore().getTableIncludeRecycleBin(db, tblId);
minTimes = 0;
result = table;
globalStateMgr.getLocalMetastore().getPhysicalPartitionIncludeRecycleBin(table, physicalPartitionId);
minTimes = 0;
result = physicalPartition;
}
};
TabletSchedCtx ctx = new TabletSchedCtx(TabletSchedCtx.Type.REPAIR, dbId, tblId, partitionId, indexId, tabletId,
System.currentTimeMillis());
ctx.setTablet(tablet);
ctx.setTabletStatus(LocalTablet.TabletHealthStatus.COLOCATE_REDUNDANT);
// The colocate backend set contains only the backend that already hosts the replica.
ctx.setColocateGroupBackendIds(Sets.newHashSet(beId));
TabletScheduler tabletScheduler = new TabletScheduler(new TabletSchedulerStat());
// handleColocateRedundant is invoked reflectively; the full message text is asserted exactly.
ExceptionChecker.expectThrowsWithMsg(SchedException.class,
"unable to delete any colocate redundant replicas. replicas: 10001:-1/-1/-1/0:NORMAL:NIL,, backend set: [10001]",
() -> Deencapsulation.invoke(tabletScheduler, "handleColocateRedundant", ctx));
}
}