|
12 | 12 | import org.apache.logging.log4j.LogManager;
|
13 | 13 | import org.apache.logging.log4j.Logger;
|
14 | 14 | import org.elasticsearch.action.ActionListener;
|
| 15 | +import org.elasticsearch.cluster.ClusterChangedEvent; |
15 | 16 | import org.elasticsearch.cluster.ClusterInfoService;
|
16 | 17 | import org.elasticsearch.cluster.ClusterState;
|
17 | 18 | import org.elasticsearch.cluster.RestoreInProgress;
|
18 | 19 | import org.elasticsearch.cluster.health.ClusterHealthStatus;
|
19 | 20 | import org.elasticsearch.cluster.metadata.AutoExpandReplicas;
|
20 | 21 | import org.elasticsearch.cluster.metadata.IndexMetadata;
|
21 | 22 | import org.elasticsearch.cluster.metadata.Metadata;
|
| 23 | +import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata; |
22 | 24 | import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata.Type;
|
23 | 25 | import org.elasticsearch.cluster.node.DiscoveryNode;
|
24 | 26 | import org.elasticsearch.cluster.routing.IndexRoutingTable;
|
|
51 | 53 | import java.util.ArrayList;
|
52 | 54 | import java.util.Collections;
|
53 | 55 | import java.util.Comparator;
|
| 56 | +import java.util.HashSet; |
54 | 57 | import java.util.Iterator;
|
55 | 58 | import java.util.List;
|
56 | 59 | import java.util.Map;
|
@@ -80,6 +83,8 @@ public class AllocationService {
|
80 | 83 | private final ClusterInfoService clusterInfoService;
|
81 | 84 | private final SnapshotsInfoService snapshotsInfoService;
|
82 | 85 | private final ShardRoutingRoleStrategy shardRoutingRoleStrategy;
|
| 86 | + // Tracks node IDs whose shutdown metadata has already been considered for resetting allocation/relocation failures |
| 87 | + private final Set<String> processedNodeShutdowns = new HashSet<>(); |
83 | 88 |
|
84 | 89 | // only for tests that use the GatewayAllocator as the unique ExistingShardsAllocator
|
85 | 90 | @SuppressWarnings("this-escape")
|
@@ -573,12 +578,43 @@ public void addAllocFailuresResetListenerTo(ClusterService clusterService) {
|
573 | 578 | });
|
574 | 579 |
|
575 | 580 | clusterService.addListener((changeEvent) -> {
|
576 |
| - if (changeEvent.nodesAdded() && changeEvent.state().getRoutingNodes().hasAllocationFailures()) { |
| 581 | + if (shouldResetAllocationFailures(changeEvent)) { |
577 | 582 | taskQueue.submitTask("reset-allocation-failures", (e) -> { assert MasterService.isPublishFailureException(e); }, null);
|
578 | 583 | }
|
579 | 584 | });
|
580 | 585 | }
|
581 | 586 |
|
| 587 | + /** |
| 588 | + * We should reset allocation/relocation failure count to allow further retries when: |
| 589 | + * |
| 590 | + * 1. A new node joins the cluster. |
| 591 | + * 2. A node shutdown metadata is added that could lead to a node being removed or replaced in the cluster. |
| 592 | + * |
| 593 | + * Note that removing a non-RESTART shutdown metadata from a node that is still in the cluster is treated similarly and |
| 594 | + * will cause resetting the allocation/relocation failures. |
| 595 | + */ |
| 596 | + private boolean shouldResetAllocationFailures(ClusterChangedEvent changeEvent) { |
| 597 | + final var clusterState = changeEvent.state(); |
| 598 | + boolean hasAllocationFailures = clusterState.getRoutingNodes().hasAllocationFailures(); |
| 599 | + boolean hasRelocationFailures = clusterState.getRoutingNodes().hasRelocationFailures(); |
| 600 | + var shutdownEventAffectsAllocation = false; |
| 601 | + final var nodes = clusterState.nodes(); |
| 602 | + final var nodeShutdowns = clusterState.metadata().nodeShutdowns(); |
| 603 | + // If we remove a shutdown marker from a node, but it is still in the cluster, we could re-attempt failed relocations/allocations. |
| 604 | + shutdownEventAffectsAllocation = processedNodeShutdowns.stream() |
| 605 | + .anyMatch(nodeId -> nodeShutdowns.contains(nodeId) == false && nodes.get(nodeId) != null); |
| 606 | + // Clean up processed shutdowns that are removed from the cluster metadata |
| 607 | + processedNodeShutdowns.removeIf(nodeId -> nodeShutdowns.contains(nodeId) == false); |
| 608 | + for (var shutdown : nodeShutdowns.getAll().entrySet()) { |
| 609 | + // A RESTART doesn't necessarily move around shards, so no need to consider it for a reset. |
| 610 | + // Furthermore, once the node rejoins after restarting, there will be a reset if necessary. |
| 611 | + if (shutdown.getValue().getType() != SingleNodeShutdownMetadata.Type.RESTART) { |
| 612 | + shutdownEventAffectsAllocation |= processedNodeShutdowns.add(shutdown.getKey()); |
| 613 | + } |
| 614 | + } |
| 615 | + return (changeEvent.nodesAdded() || shutdownEventAffectsAllocation) && (hasAllocationFailures || hasRelocationFailures); |
| 616 | + } |
| 617 | + |
582 | 618 | private ClusterState rerouteWithResetFailedCounter(ClusterState clusterState) {
|
583 | 619 | RoutingAllocation allocation = createRoutingAllocation(clusterState, currentNanoTime());
|
584 | 620 | allocation.routingNodes().resetFailedCounter(allocation.changes());
|
|
0 commit comments