diff --git a/raft/src/main/java/org/apache/kafka/raft/KafkaRaftClient.java b/raft/src/main/java/org/apache/kafka/raft/KafkaRaftClient.java index f7091cac1ecd7..11b4da910f134 100644 --- a/raft/src/main/java/org/apache/kafka/raft/KafkaRaftClient.java +++ b/raft/src/main/java/org/apache/kafka/raft/KafkaRaftClient.java @@ -2708,10 +2708,12 @@ private void maybeTransition( } } else if ( leaderId.isPresent() && - (!quorum.hasLeader() || leaderEndpoints.size() > quorum.leaderEndpoints().size()) + (!quorum.hasLeader() || leaderEndpoints.size() > quorum.leaderEndpoints().size() || + (quorum.isUnattached() && !leaderEndpoints.isEmpty())) ) { // The request or response indicates the leader of the current epoch - // which are currently unknown or the replica has discovered more endpoints + // which are currently unknown, the replica has discovered more endpoints, + // or the replica is unattached but the has discovered endpoints for the leader. transitionToFollower(epoch, leaderId.getAsInt(), leaderEndpoints, currentTimeMs); } } @@ -3394,6 +3396,9 @@ private long pollFollowerAsObserver(FollowerState state, long currentTimeMs) { state.resetUpdateVoterSetPeriod(currentTimeMs); } return sendResult.timeToWaitMs(); + } else if (state.hasFetchTimeoutExpired(currentTimeMs)) { + transitionToUnattached(state.epoch(), OptionalInt.of(state.leaderId())); + return 0L; } else { return maybeSendFetchToBestNode(state, currentTimeMs); } diff --git a/raft/src/main/java/org/apache/kafka/raft/QuorumState.java b/raft/src/main/java/org/apache/kafka/raft/QuorumState.java index 24c929f135fa4..b6fbc792d51cb 100644 --- a/raft/src/main/java/org/apache/kafka/raft/QuorumState.java +++ b/raft/src/main/java/org/apache/kafka/raft/QuorumState.java @@ -74,10 +74,12 @@ * * Unattached transitions to: * Unattached: After learning of a new election with a higher epoch - * Follower: After discovering a leader with an equal or larger epoch + * Follower: After discovering a leader or new leader endpoints + * with an equal or larger epoch * * Follower transitions to: - * Unattached: After learning of a new election with a higher epoch + * Unattached: After learning of a new election with a higher epoch, or after + * expiration of the fetch timeout * Follower: After discovering a leader with a larger epoch * */ @@ -379,7 +381,7 @@ public void transitionToResigned(List preferredSuccessors) { */ public void transitionToUnattached(int epoch, OptionalInt leaderId) { int currentEpoch = state.epoch(); - if (epoch < currentEpoch || (epoch == currentEpoch && !isProspective())) { + if (epoch < currentEpoch || (epoch == currentEpoch && !isProspective() && !isObserver())) { throw new IllegalStateException( String.format( "Cannot transition to Unattached with epoch %d from current state %s", diff --git a/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientAutoJoinTest.java b/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientAutoJoinTest.java index 40869505659a3..c8afc28a9b468 100644 --- a/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientAutoJoinTest.java +++ b/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientAutoJoinTest.java @@ -49,14 +49,16 @@ public void testAutoRemoveOldVoter() throws Exception { .withAutoJoin(true) .withCanBecomeVoter(true) .build(); + var initialSleepMs = context.fetchTimeoutMs - 1; - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), initialSleepMs, true); + initialSleepMs -= 1; // the next request should be a remove voter request pollAndDeliverRemoveVoter(context, oldFollower); // after sending a remove voter the next request should be a fetch - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), initialSleepMs, true); // the replica should send remove voter again because the fetch did not update the voter set pollAndDeliverRemoveVoter(context, oldFollower); @@ -80,14 +82,16 @@ public void testAutoAddNewVoter() throws Exception { .withAutoJoin(true) .withCanBecomeVoter(true) .build(); + var initialSleepMs = context.fetchTimeoutMs - 1; - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), initialSleepMs, true); + initialSleepMs -= 1; // the next request should be an add voter request pollAndSendAddVoter(context, newVoter); // expire the add voter request, the next request should be a fetch - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), initialSleepMs, true); // the replica should send add voter again because the completed fetch // did not update the voter set, and its timer has expired @@ -128,7 +132,7 @@ public void testObserverRemovesOldVoterAndAutoJoins() throws Exception { .build(); // advance time and complete a fetch to trigger the remove voter request - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), context.fetchTimeoutMs - 1, true); // the next request should be a remove voter request pollAndDeliverRemoveVoter(context, oldFollower); @@ -142,7 +146,7 @@ public void testObserverRemovesOldVoterAndAutoJoins() throws Exception { ); // advance time and complete a fetch to trigger the add voter request - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), context.fetchTimeoutMs - 1, true); // the next request should be an add voter request final var addVoterRequest = pollAndSendAddVoter(context, newFollowerKey); @@ -163,7 +167,7 @@ public void testObserverRemovesOldVoterAndAutoJoins() throws Exception { // advance time and complete a fetch and expire the update voter set timer // the next request should be a fetch because the log voter configuration is up-to-date - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), context.fetchTimeoutMs - 1, true); context.pollUntilRequest(); context.assertSentFetchRequest(); } @@ -188,7 +192,7 @@ public void testObserversDoNotAutoJoin() throws Exception { .withCanBecomeVoter(false) .build(); - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), context.fetchTimeoutMs - 1, true); context.time.sleep(context.fetchTimeoutMs - 1); context.pollUntilRequest(); @@ -217,7 +221,7 @@ public void testObserverDoesNotAddItselfWhenAutoJoinDisabled() throws Exception .withCanBecomeVoter(true) .build(); - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), context.fetchTimeoutMs - 1, true); context.time.sleep(context.fetchTimeoutMs - 1); context.pollUntilRequest(); @@ -246,7 +250,7 @@ public void testObserverDoesNotAutoJoinWithKRaftVersion0() throws Exception { .withCanBecomeVoter(true) .build(); - context.advanceTimeAndCompleteFetch(epoch, leader.id(), true); + context.advanceTimeAndCompleteFetch(epoch, leader.id(), context.fetchTimeoutMs - 1, true); context.time.sleep(context.fetchTimeoutMs - 1); context.pollUntilRequest(); diff --git a/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientFetchTest.java b/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientFetchTest.java index cb228820869dd..66dc5839efb9c 100644 --- a/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientFetchTest.java +++ b/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientFetchTest.java @@ -38,6 +38,7 @@ import java.nio.ByteBuffer; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.OptionalInt; import java.util.OptionalLong; @@ -765,4 +766,122 @@ void testUpdatedHighWatermarkCompleted() throws Exception { assertEquals(localLogEndOffset, partitionResponse.highWatermark()); } } + + @Test + void testObserverFetchesBetweenLeaderAndBootstrapServers() throws Exception { + final var epoch = 2; + final var local = KafkaRaftClientTest.replicaKey( + KafkaRaftClientTest.randomReplicaId(), + true + ); + final var leader = KafkaRaftClientTest.replicaKey(local.id() + 1, true); + final var bootstrapVoter = KafkaRaftClientTest.replicaKey(local.id() + 2, true); + final var voters = VoterSet.fromMap( + Map.of( + leader.id(), VoterSetTest.voterNode(leader), + bootstrapVoter.id(), VoterSetTest.voterNode(bootstrapVoter) + ) + ); + + final var context = new RaftClientTestContext.Builder( + local.id(), + local.directoryId().get() + ) + .withStartingVoters(voters, KRaftVersion.KRAFT_VERSION_1) + // configure the bootstrap servers to only include the bootstrap voter + // to reliably check the destination of the observer's fetch requests + // alternates between the leader and the bootstrap voter + .withBootstrapServers( + Optional.of(List.of(RaftClientTestContext.mockAddress(bootstrapVoter.id()))) + ) + .withRaftProtocol(RaftClientTestContext.RaftProtocol.KIP_1186_PROTOCOL) + .build(); + + // The observer initially fetches from the bootstrap servers, + // where it will discover the leader's endpoints. + final var bootstrapFetch = pollAndCheckObserverFetchRequest( + context, + true, + bootstrapVoter.id() + ); + context.deliverResponse( + bootstrapFetch.correlationId(), + bootstrapFetch.destination(), + context.fetchResponse( + epoch, + leader.id(), + MemoryRecords.EMPTY, + 0L, + Errors.NOT_LEADER_OR_FOLLOWER + ) + ); + + // Subsequent fetch from the observer is sent to the leader + // Return a BROKER_NOT_AVAILABLE error, handle that response, and then + // advance time past the fetch timeout. + // This is to simulate the leader endpoints being unreachable, which will + // cause the observer to fetch from the bootstrap servers after the fetch timeout expires. + final var leaderFetch = pollAndCheckObserverFetchRequest( + context, + false, + leader.id() + ); + context.deliverResponse( + leaderFetch.correlationId(), + leaderFetch.destination(), + RaftUtil.errorResponse( + ApiKeys.FETCH, + Errors.BROKER_NOT_AVAILABLE + ) + ); + context.client.poll(); + + // The fetch timeout is much greater than the request manager's configured backoff, so the + // current unreachable connection will no longer be backing off when the next fetch is sent. + // Expire the fetch timeout and check that the next fetch is sent to the bootstrap server again. + context.time.sleep(context.fetchTimeoutMs + 1); + final var nextBootstrapFetch = pollAndCheckObserverFetchRequest( + context, + true, + bootstrapVoter.id() + ); + context.deliverResponse( + nextBootstrapFetch.correlationId(), + nextBootstrapFetch.destination(), + context.fetchResponse( + epoch, + leader.id(), + MemoryRecords.EMPTY, + 0L, + Errors.NOT_LEADER_OR_FOLLOWER + ) + ); + + // Discovering the leader from a bootstrap fetch means the observer resumes fetching from the leader + pollAndCheckObserverFetchRequest( + context, + false, + leader.id() + ); + } + + private RaftRequest.Outbound pollAndCheckObserverFetchRequest( + RaftClientTestContext context, + boolean isBootstrapFetch, + int expectedDestinationId + ) throws Exception { + context.pollUntilRequest(); + RaftRequest.Outbound fetchRequest = context.assertSentFetchRequest(); + if (isBootstrapFetch) { + assertTrue(fetchRequest.destination().id() < -1); + } else { + assertEquals(expectedDestinationId, fetchRequest.destination().id()); + } + // only need to check port since the host is always "localhost" for the mock addresses + assertEquals( + RaftClientTestContext.mockAddress(expectedDestinationId).getPort(), + fetchRequest.destination().port() + ); + return fetchRequest; + } } diff --git a/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientReconfigTest.java b/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientReconfigTest.java index 08f3d63833b68..ece7a6ce76102 100644 --- a/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientReconfigTest.java +++ b/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientReconfigTest.java @@ -2231,7 +2231,7 @@ void testFollowerSendsUpdateVoter() throws Exception { .build(); // waiting for FETCH requests until the UpdateRaftVoter request is sent - context.advanceTimeAndCompleteFetch(epoch, voter1.id(), true); + context.advanceTimeAndCompleteFetch(epoch, voter1.id(), context.fetchTimeoutMs - 1, true); context.pollUntilRequest(); RaftRequest.Outbound updateRequest = context.assertSentUpdateVoterRequest( @@ -2284,7 +2284,7 @@ void testFollowerSendsUpdateVoterWithKraftVersion0(Errors updateVoterError) thro .build(); // waiting for FETCH request until the UpdateRaftVoter request is set - context.advanceTimeAndCompleteFetch(epoch, voter1.id(), true); + context.advanceTimeAndCompleteFetch(epoch, voter1.id(), context.fetchTimeoutMs - 1, true); context.pollUntilRequest(); RaftRequest.Outbound updateRequest = context.assertSentUpdateVoterRequest( @@ -2355,7 +2355,7 @@ void testFollowerSendsUpdateVoterAfterElectionWithKraftVersion0(Errors updateVot .build(); // waiting for FETCH request until the UpdateRaftVoter request is set - context.advanceTimeAndCompleteFetch(epoch, voter1.id(), true); + context.advanceTimeAndCompleteFetch(epoch, voter1.id(), context.fetchTimeoutMs - 1, true); context.pollUntilRequest(); RaftRequest.Outbound updateRequest = context.assertSentUpdateVoterRequest( @@ -2383,7 +2383,7 @@ void testFollowerSendsUpdateVoterAfterElectionWithKraftVersion0(Errors updateVot context.pollUntilResponse(); // waiting for FETCH request until the UpdateRaftVoter request is set - context.advanceTimeAndCompleteFetch(newEpoch, voter1.id(), true); + context.advanceTimeAndCompleteFetch(newEpoch, voter1.id(), context.fetchTimeoutMs - 1, true); context.pollUntilRequest(); updateRequest = context.assertSentUpdateVoterRequest( @@ -2657,7 +2657,7 @@ void testFollowerSendsUpdateVoterWhenDifferent() throws Exception { .build(); // waiting for FETCH request until the UpdateRaftVoter request is set - context.advanceTimeAndCompleteFetch(epoch, voter1.id(), true); + context.advanceTimeAndCompleteFetch(epoch, voter1.id(), context.fetchTimeoutMs - 1, true); // update voter should not be sent because the local listener is not different from the voter set context.pollUntilRequest(); @@ -2698,7 +2698,7 @@ void testFollowerSendsUpdateVoterIfPendingFetchDuringTimeout() throws Exception .build(); // waiting up to the last FETCH request before the UpdateRaftVoter request is set - context.advanceTimeAndCompleteFetch(epoch, voter1.id(), false); + context.advanceTimeAndCompleteFetch(epoch, voter1.id(), context.fetchTimeoutMs - 1, false); // expect one last FETCH request context.pollUntilRequest(); @@ -2759,7 +2759,7 @@ void testUpdateVoterResponseCausesEpochChange() throws Exception { .build(); // waiting for FETCH request until the UpdateRaftVoter request is set - context.advanceTimeAndCompleteFetch(epoch, voter1.id(), true); + context.advanceTimeAndCompleteFetch(epoch, voter1.id(), context.fetchTimeoutMs - 1, true); context.pollUntilRequest(); RaftRequest.Outbound updateRequest = context.assertSentUpdateVoterRequest( diff --git a/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientTest.java b/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientTest.java index 4a191784ac694..61e05fe081469 100644 --- a/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientTest.java +++ b/raft/src/test/java/org/apache/kafka/raft/KafkaRaftClientTest.java @@ -2010,7 +2010,8 @@ public void testFollowerAsObserverDoesNotBecomeProspectiveAfterFetchTimeout(bool context.time.sleep(context.fetchTimeoutMs); context.pollUntilRequest(); - assertTrue(context.client.quorum().isFollower()); + assertFalse(context.client.quorum().isProspective()); + assertTrue(context.client.quorum().isUnattached()); // transitions to unattached context.deliverRequest(context.voteRequest(epoch + 1, replicaKey(otherNodeId, withKip853Rpc), epoch, 1)); diff --git a/raft/src/testFixtures/java/org/apache/kafka/raft/RaftClientTestContext.java b/raft/src/testFixtures/java/org/apache/kafka/raft/RaftClientTestContext.java index ad47c8d958102..3aa75bb57fc1f 100644 --- a/raft/src/testFixtures/java/org/apache/kafka/raft/RaftClientTestContext.java +++ b/raft/src/testFixtures/java/org/apache/kafka/raft/RaftClientTestContext.java @@ -1013,17 +1013,25 @@ void deliverResponse(int correlationId, Node source, ApiMessage response) { * This is used to expire the update voter set timer without also expiring the fetch timer, * which is needed for add, remove, and update voter tests. * For voters and observers, polling after exiting this method expires the update voter set timer. + * + * For subsequent calls of this method that intend to also expire the update voter set timer, + * the initial sleep time must be less than what was previously used. + * This avoids expiring the fetch timer, but will expire the update voter set timer. + * * @param epoch - the current epoch * @param leaderId - the leader id + * @param initialSleepMs - the initial sleep time before the first fetch, which should + * be less than the fetch timeout to avoid expiring the fetch timer * @param expireUpdateVoterSetTimer - if true, advance time again to expire this timer */ void advanceTimeAndCompleteFetch( int epoch, int leaderId, + int initialSleepMs, boolean expireUpdateVoterSetTimer ) throws Exception { for (int i = 0; i < NUMBER_FETCH_TIMEOUTS_IN_UPDATE_VOTER_SET_PERIOD; i++) { - time.sleep(fetchTimeoutMs - 1); + time.sleep(initialSleepMs); pollUntilRequest(); final var fetchRequest = assertSentFetchRequest(); assertFetchRequestData( @@ -1049,7 +1057,7 @@ void advanceTimeAndCompleteFetch( client.poll(); } if (expireUpdateVoterSetTimer) { - time.sleep(fetchTimeoutMs - 1); + time.sleep(fetchTimeoutMs - initialSleepMs); } }