From c731873aef43e81224caafdf02a1ba2f3b3a99e7 Mon Sep 17 00:00:00 2001 From: Timothy Potter Date: Thu, 20 May 2021 12:35:44 -0600 Subject: [PATCH] SOLR-15399: IndexFetcher should not issue a local commit for PULL replicas (#133) --- solr/CHANGES.txt | 2 + .../solr/cloud/ReplicateFromLeader.java | 18 ++++++- .../apache/solr/cloud/TestPullReplica.java | 47 +++++++++++++++---- .../solr/cloud/TestPullReplicaWithAuth.java | 3 +- 4 files changed, 60 insertions(+), 10 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 55cf9e642a79..457f64010dd0 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -399,6 +399,8 @@ Bug Fixes * SOLR-15424: Solr replication UI wraps ETA time on top of next line (janhoy) +* SOLR-15399: IndexFetcher should not issue a local commit for PULL replicas when leader's version is zero (Timothy Potter) + Other Changes --------------------- * SOLR-15118: Deprecate CollectionAdminRequest.getV2Request(). (Jason Gerlowski) diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java index 7b18ffd773d5..06f671602147 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java @@ -21,6 +21,7 @@ import org.apache.lucene.index.IndexCommit; import org.apache.solr.common.SolrException; +import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.CoreContainer; @@ -87,7 +88,22 @@ public void startReplication(boolean switchTransactionLog) { NamedList followerConfig = new NamedList<>(); followerConfig.add("fetchFromLeader", Boolean.TRUE); - followerConfig.add(ReplicationHandler.SKIP_COMMIT_ON_LEADER_VERSION_ZERO, switchTransactionLog); + + // don't commit on leader version zero for PULL replicas as PULL should only get its index state from leader + boolean skipCommitOnLeaderVersionZero = switchTransactionLog; + if (!skipCommitOnLeaderVersionZero) { + CloudDescriptor cloudDescriptor = core.getCoreDescriptor().getCloudDescriptor(); + if (cloudDescriptor != null) { + Replica replica = + cc.getZkController().getZkStateReader().getCollection(cloudDescriptor.getCollectionName()) + .getSlice(cloudDescriptor.getShardId()).getReplica(cloudDescriptor.getCoreNodeName()); + if (replica != null && replica.getType() == Replica.Type.PULL) { + skipCommitOnLeaderVersionZero = true; // only set this to true if we're a PULL replica, otherwise use value of switchTransactionLog + } + } + } + followerConfig.add(ReplicationHandler.SKIP_COMMIT_ON_LEADER_VERSION_ZERO, skipCommitOnLeaderVersionZero); + followerConfig.add("pollInterval", pollIntervalStr); NamedList replicationConfig = new NamedList<>(); replicationConfig.add("follower", followerConfig); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java index 779bae5e38f8..caf6aa9f8759 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java @@ -66,7 +66,7 @@ import org.slf4j.LoggerFactory; @Slow -@LogLevel("org.apache.solr.handler.ReplicationHandler=DEBUG,org.apache.solr.handler.IndexFetcher=DEBUG") +@LogLevel("org.apache.solr.handler.ReplicationHandler=DEBUG;org.apache.solr.handler.IndexFetcher=DEBUG") public class TestPullReplica extends SolrCloudTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -80,7 +80,6 @@ private String suggestedCollectionName() { @BeforeClass public static void createTestCluster() throws Exception { - // cloudSolrClientMaxStaleRetries System.setProperty("cloudSolrClientMaxStaleRetries", "1"); System.setProperty("zkReaderGetLeaderRetryTimeoutMs", "1000"); @@ -122,7 +121,6 @@ public void tearDown() throws Exception { } @Repeat(iterations=2) // 2 times to make sure cleanup is complete and we can create the same collection - // commented out on: 17-Feb-2019 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 21-May-2018 public void testCreateDelete() throws Exception { try { switch (random().nextInt(3)) { @@ -209,15 +207,18 @@ static void assertUlogPresence(DocCollection collection) { } @SuppressWarnings("unchecked") - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-15399") public void testAddDocs() throws Exception { int numPullReplicas = 1 + random().nextInt(3); CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, numPullReplicas) .process(cluster.getSolrClient()); - waitForState("Expected collection to be created with 1 shard and " + (numPullReplicas + 1) + " replicas", collectionName, clusterShape(1, numPullReplicas + 1)); + waitForState("Expected collection to be created with 1 shard and " + (numPullReplicas + 1) + " replicas", + collectionName, clusterShape(1, numPullReplicas + 1)); DocCollection docCollection = assertNumberOfReplicas(1, 0, numPullReplicas, false, true); assertEquals(1, docCollection.getSlices().size()); + // ugly but needed to ensure a full PULL replication cycle (every sec) has occurred on the replicas before adding docs + Thread.sleep(1500); + boolean reloaded = false; int numDocs = 0; while (true) { @@ -232,7 +233,8 @@ public void testAddDocs() throws Exception { } log.info("Found {} docs in leader, verifying updates make it to {} pull replicas", numDocs, numPullReplicas); - List pullReplicas = s.getReplicas(EnumSet.of(Replica.Type.PULL)); + List pullReplicas = + (numDocs == 1) ? restartPullReplica(docCollection, numPullReplicas) : s.getReplicas(EnumSet.of(Replica.Type.PULL)); waitForNumDocsInAllReplicas(numDocs, pullReplicas); for (Replica r : pullReplicas) { @@ -258,6 +260,37 @@ public void testAddDocs() throws Exception { assertUlogPresence(docCollection); } + private List restartPullReplica(DocCollection docCollection, int numPullReplicas) throws Exception { + Slice s = docCollection.getSlices().iterator().next(); + List pullReplicas = s.getReplicas(EnumSet.of(Replica.Type.PULL)); + + // find a node with a PULL replica that's not hosting the leader + JettySolrRunner leaderJetty = cluster.getReplicaJetty(s.getLeader()); + JettySolrRunner replicaJetty = null; + for (Replica r : pullReplicas) { + JettySolrRunner jetty = cluster.getReplicaJetty(r); + if (!jetty.getNodeName().equals(leaderJetty.getNodeName())) { + replicaJetty = jetty; + break; + } + } + + // stop / start the node with a pull replica + if (replicaJetty != null) { + replicaJetty.stop(); + cluster.waitForJettyToStop(replicaJetty); + waitForState("Expected to see a downed PULL replica", collectionName, clusterStateReflectsActiveAndDownReplicas()); + replicaJetty.start(); + waitForState("Expected collection to have recovered with 1 shard and " + (numPullReplicas + 1) + " replicas after restarting " + replicaJetty.getNodeName(), + collectionName, clusterShape(1, numPullReplicas + 1)); + docCollection = assertNumberOfReplicas(1, 0, numPullReplicas, false, true); + s = docCollection.getSlices().iterator().next(); + pullReplicas = s.getReplicas(EnumSet.of(Replica.Type.PULL)); + } // else it's ok if all replicas ended up on the same node, we're not testing replica placement here, but skip this part of the test + + return pullReplicas; + } + public void testAddRemovePullReplica() throws Exception { CollectionAdminRequest.createCollection(collectionName, "conf", 2, 1, 0, 0) .process(cluster.getSolrClient()); @@ -282,13 +315,11 @@ public void testAddRemovePullReplica() throws Exception { } @Test - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-15399") public void testRemoveAllWriterReplicas() throws Exception { doTestNoLeader(true); } @Test - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-15399") public void testKillLeader() throws Exception { doTestNoLeader(false); } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaWithAuth.java b/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaWithAuth.java index d80b9efec0f0..cca619911524 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaWithAuth.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaWithAuth.java @@ -41,6 +41,7 @@ import org.apache.solr.security.BasicAuthPlugin; import org.apache.solr.security.RuleBasedAuthorizationPlugin; import org.junit.BeforeClass; +import org.junit.Test; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -84,7 +85,7 @@ private QueryResponse queryWithBasicAuth(HttpSolrClient client, SolrQuery q) thr return withBasicAuth(new QueryRequest(q)).process(client); } - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-15399") + @Test public void testPKIAuthWorksForPullReplication() throws Exception { int numPullReplicas = 2; withBasicAuth(CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, numPullReplicas))