elastic
diff --git a/‎server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/AllocationFailuresResetIT.java‎
Lines changed: 49 additions & 1 deletion b/‎server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/AllocationFailuresResetIT.java‎
Lines changed: 49 additions & 1 deletion
diff --git a/‎server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java‎
Lines changed: 11 additions & 0 deletions b/‎server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java‎
Lines changed: 37 additions & 1 deletion b/‎server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java‎
Lines changed: 37 additions & 1 deletion
@@ -10,8 +10,10 @@
 package org.elasticsearch.cluster.routing.allocation;
 
 import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
 import org.elasticsearch.index.shard.IndexEventListener;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.test.ESIntegTestCase;
@@ -20,6 +22,12 @@
 import org.elasticsearch.test.MockIndexEventListener;
 
 import java.util.List;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_ROUTING_EXCLUDE_GROUP_PREFIX;
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.not;
+import static org.hamcrest.CoreMatchers.notNullValue;
 
 @ClusterScope(scope = Scope.TEST, numDataNodes = 0)
 public class AllocationFailuresResetIT extends ESIntegTestCase {
@@ -72,7 +80,7 @@ private void awaitShardAllocSucceed() throws Exception {
  });
  }
 
- public void testResetFailuresOnNodeJoin() throws Exception {
+ public void testResetAllocationFailuresOnNodeJoin() throws Exception {
  var node1 = internalCluster().startNode();
  injectAllocationFailures(node1);
  prepareCreate(INDEX, indexSettings(1, 0)).execute();
@@ -82,4 +90,44 @@ public void testResetFailuresOnNodeJoin() throws Exception {
  awaitShardAllocSucceed();
  }
 
+ public void testResetRelocationFailuresOnNodeJoin() throws Exception { // a node join must reset exhausted relocation retries
+ String node1 = internalCluster().startNode();
+ createIndex(INDEX, 1, 0);
+ ensureGreen(INDEX);
+ final var failRelocation = new AtomicBoolean(true); // toggles the failure injected on node2 below
+ String node2 = internalCluster().startNode();
+ internalCluster().getInstance(MockIndexEventListener.TestEventListener.class, node2).setNewDelegate(new IndexEventListener() {
+ @Override
+ public void beforeIndexCreated(Index index, Settings indexSettings) {
+ if (failRelocation.get()) { // fail index creation on node2 for as long as the flag is set
+ throw new RuntimeException("FAIL");
+ }
+ }
+ });
+ updateIndexSettings(Settings.builder().put(INDEX_ROUTING_EXCLUDE_GROUP_PREFIX + "._name", node1), INDEX); // exclude node1 by name
+ ensureGreen(INDEX);
+ // wait until all relocation attempts are exhausted (the default max-retry value is read from empty settings)
+ var maxAttempts = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(Settings.EMPTY);
+ assertBusy(() -> {
+ var state = clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT).get().getState();
+ var shard = state.routingTable().index(INDEX).shard(SHARD).primaryShard();
+ assertThat(shard, notNullValue());
+ assertThat(shard.relocationFailureInfo().failedRelocations(), equalTo(maxAttempts));
+ });
+ // ensure the shard remains started on node1 after the failed relocation attempts
+ var state = clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT).get().getState();
+ var shard = state.routingTable().index(INDEX).shard(SHARD).primaryShard();
+ assertThat(shard, notNullValue());
+ assertThat(shard.state(), equalTo(ShardRoutingState.STARTED));
+ assertThat(state.nodes().get(shard.currentNodeId()).getName(), equalTo(node1));
+ failRelocation.set(false); // stop injecting failures so that a retried relocation can succeed
+ // A new node joining should reset the counter and allow more relocation retries
+ internalCluster().startNode();
+ assertBusy(() -> {
+ var stateAfterNodeJoin = internalCluster().clusterService().state();
+ var relocatedShard = stateAfterNodeJoin.routingTable().index(INDEX).shard(SHARD).primaryShard();
+ assertThat(relocatedShard, notNullValue());
+ assertThat(stateAfterNodeJoin.nodes().get(relocatedShard.currentNodeId()).getName(), not(equalTo(node1))); // moved off node1
+ });
+ }
 }
@@ -1298,6 +1298,17 @@ public boolean hasAllocationFailures() {
  }));
  }
 
+ /**
+ * Checks whether any assigned shard copy has recorded at least one failed relocation attempt.
+ *
+ * @return {@code true} if some assigned shard carries a non-null relocation failure info with a positive
+ * failed-relocations count, {@code false} otherwise
+ */
+ public boolean hasRelocationFailures() {
+ for (var shardCopies : assignedShards.values()) {
+ for (var shardCopy : shardCopies) {
+ final var failureInfo = shardCopy.relocationFailureInfo();
+ // Skip copies without failure info or without any recorded failed relocation.
+ if (failureInfo == null || failureInfo.failedRelocations() <= 0) {
+ continue;
+ }
+ return true;
+ }
+ }
+ return false;
+ }
+
  public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
  final var unassignedIterator = unassigned().iterator();
  while (unassignedIterator.hasNext()) {
 
@@ -12,13 +12,15 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.cluster.ClusterChangedEvent;
 import org.elasticsearch.cluster.ClusterInfoService;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.RestoreInProgress;
 import org.elasticsearch.cluster.health.ClusterHealthStatus;
 import org.elasticsearch.cluster.metadata.AutoExpandReplicas;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.metadata.Metadata;
+import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata;
 import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata.Type;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.routing.IndexRoutingTable;
@@ -51,6 +53,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -80,6 +83,8 @@ public class AllocationService {
  private final ClusterInfoService clusterInfoService;
  private final SnapshotsInfoService snapshotsInfoService;
  private final ShardRoutingRoleStrategy shardRoutingRoleStrategy;
+ // Tracks node IDs whose shutdown metadata has already been considered for resetting allocation/relocation failures
+ private final Set<String> processedNodeShutdowns = new HashSet<>();
 
  // only for tests that use the GatewayAllocator as the unique ExistingShardsAllocator
  @SuppressWarnings("this-escape")
@@ -573,12 +578,43 @@ public void addAllocFailuresResetListenerTo(ClusterService clusterService) {
  });
 
  clusterService.addListener((changeEvent) -> {
- if (changeEvent.nodesAdded() && changeEvent.state().getRoutingNodes().hasAllocationFailures()) {
+ if (shouldResetAllocationFailures(changeEvent)) {
  taskQueue.submitTask("reset-allocation-failures", (e) -> { assert MasterService.isPublishFailureException(e); }, null);
  }
  });
  }
 
+ /**
+ * Decides whether a cluster state change should trigger a reset of the allocation/relocation failure counters.
+ *
+ * We should reset allocation/relocation failure count to allow further retries when there are recorded failures and:
+ *
+ * 1. A new node joins the cluster.
+ * 2. A node shutdown metadata is added that could lead to a node being removed or replaced in the cluster.
+ *
+ * Note that removing a non-RESTART shutdown metadata from a node that is still in the cluster is treated similarly and
+ * will cause resetting the allocation/relocation failures.
+ *
+ * This method updates {@code processedNodeShutdowns} on every invocation, regardless of the returned value, so that each
+ * shutdown marker is only considered once.
+ *
+ * @param changeEvent the cluster change event to inspect
+ * @return {@code true} if a node was added or a relevant shutdown marker changed, and some shard has allocation or
+ * relocation failures
+ */
+ private boolean shouldResetAllocationFailures(ClusterChangedEvent changeEvent) {
+ final var clusterState = changeEvent.state();
+ final var nodes = clusterState.nodes();
+ final var nodeShutdowns = clusterState.metadata().nodeShutdowns();
+ // If we remove a shutdown marker from a node, but it is still in the cluster, we could re-attempt failed relocations/allocations.
+ boolean shutdownEventAffectsAllocation = processedNodeShutdowns.stream()
+ .anyMatch(nodeId -> nodeShutdowns.contains(nodeId) == false && nodes.get(nodeId) != null);
+ // Clean up processed shutdowns that are removed from the cluster metadata
+ processedNodeShutdowns.removeIf(nodeId -> nodeShutdowns.contains(nodeId) == false);
+ for (var shutdown : nodeShutdowns.getAll().entrySet()) {
+ // A RESTART doesn't necessarily move around shards, so no need to consider it for a reset.
+ // Furthermore, once the node rejoins after restarting, there will be a reset if necessary.
+ if (shutdown.getValue().getType() != SingleNodeShutdownMetadata.Type.RESTART) {
+ shutdownEventAffectsAllocation |= processedNodeShutdowns.add(shutdown.getKey());
+ }
+ }
+ if (changeEvent.nodesAdded() == false && shutdownEventAffectsAllocation == false) {
+ // Nothing happened that could unblock failed shards; skip scanning the routing nodes for failures.
+ return false;
+ }
+ // Only scan the routing nodes once a trigger is present; the second scan is skipped if the first already found failures.
+ final var routingNodes = clusterState.getRoutingNodes();
+ return routingNodes.hasAllocationFailures() || routingNodes.hasRelocationFailures();
+ }
+
  private ClusterState rerouteWithResetFailedCounter(ClusterState clusterState) {
  RoutingAllocation allocation = createRoutingAllocation(clusterState, currentNanoTime());
  allocation.routingNodes().resetFailedCounter(allocation.changes());