Skip to content

Commit f6918de

Browse files
committed
add logging
1 parent d4140e5 commit f6918de

File tree

4 files changed

+102
-14
lines changed

4 files changed

+102
-14
lines changed

server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/AllocationFailuresResetIT.java

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,12 @@
99

1010
package org.elasticsearch.cluster.routing.allocation;
1111

12+
import org.apache.logging.log4j.Level;
13+
import org.elasticsearch.cluster.routing.RoutingNodes;
1214
import org.elasticsearch.cluster.routing.ShardRouting;
1315
import org.elasticsearch.cluster.routing.ShardRoutingState;
1416
import org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider;
17+
import org.elasticsearch.common.Strings;
1518
import org.elasticsearch.common.settings.Settings;
1619
import org.elasticsearch.index.Index;
1720
import org.elasticsearch.index.shard.IndexEventListener;
@@ -20,6 +23,7 @@
2023
import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
2124
import org.elasticsearch.test.ESIntegTestCase.Scope;
2225
import org.elasticsearch.test.MockIndexEventListener;
26+
import org.elasticsearch.test.MockLog;
2327

2428
import java.util.List;
2529
import java.util.concurrent.atomic.AtomicBoolean;
@@ -86,8 +90,20 @@ public void testResetAllocationFailuresOnNodeJoin() throws Exception {
8690
prepareCreate(INDEX, indexSettings(1, 0)).execute();
8791
awaitShardAllocMaxRetries();
8892
removeAllocationFailuresInjection(node1);
89-
internalCluster().startNode();
90-
awaitShardAllocSucceed();
93+
try (var mockLog = MockLog.capture(RoutingNodes.class)) {
94+
var shardId = internalCluster().clusterService().state().routingTable().index(INDEX).shard(SHARD).shardId();
95+
mockLog.addExpectation(
96+
new MockLog.SeenEventExpectation(
97+
"log resetting failed allocations",
98+
RoutingNodes.class.getName(),
99+
Level.INFO,
100+
Strings.format(RoutingNodes.RESET_FAILED_ALLOCATION_COUNTER_LOG_MSG, 1, List.of(shardId))
101+
)
102+
);
103+
internalCluster().startNode();
104+
awaitShardAllocSucceed();
105+
mockLog.assertAllExpectationsMatched();
106+
}
91107
}
92108

93109
public void testResetRelocationFailuresOnNodeJoin() throws Exception {
@@ -122,12 +138,23 @@ public void beforeIndexCreated(Index index, Settings indexSettings) {
122138
assertThat(state.nodes().get(shard.currentNodeId()).getName(), equalTo(node1));
123139
failRelocation.set(false);
124140
// A new node joining should reset the counter and allow more relocation retries
125-
internalCluster().startNode();
126-
assertBusy(() -> {
127-
var stateAfterNodeJoin = internalCluster().clusterService().state();
128-
var relocatedShard = stateAfterNodeJoin.routingTable().index(INDEX).shard(SHARD).primaryShard();
129-
assertThat(relocatedShard, notNullValue());
130-
assertThat(stateAfterNodeJoin.nodes().get(relocatedShard.currentNodeId()).getName(), not(equalTo(node1)));
131-
});
141+
try (var mockLog = MockLog.capture(RoutingNodes.class)) {
142+
mockLog.addExpectation(
143+
new MockLog.SeenEventExpectation(
144+
"log resetting failed relocations",
145+
RoutingNodes.class.getName(),
146+
Level.INFO,
147+
Strings.format(RoutingNodes.RESET_FAILED_RELOCATION_COUNTER_LOG_MSG, 1, List.of(shard.shardId()))
148+
)
149+
);
150+
internalCluster().startNode();
151+
assertBusy(() -> {
152+
var stateAfterNodeJoin = internalCluster().clusterService().state();
153+
var relocatedShard = stateAfterNodeJoin.routingTable().index(INDEX).shard(SHARD).primaryShard();
154+
assertThat(relocatedShard, notNullValue());
155+
assertThat(stateAfterNodeJoin.nodes().get(relocatedShard.currentNodeId()).getName(), not(equalTo(node1)));
156+
});
157+
mockLog.assertAllExpectationsMatched();
158+
}
132159
}
133160
}

server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,19 @@
1616
import org.elasticsearch.cluster.node.DiscoveryNodes;
1717
import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus;
1818
import org.elasticsearch.cluster.routing.allocation.ExistingShardsAllocator;
19+
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
1920
import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceMetrics;
21+
import org.elasticsearch.common.Strings;
2022
import org.elasticsearch.common.collect.Iterators;
2123
import org.elasticsearch.common.util.Maps;
2224
import org.elasticsearch.core.Assertions;
2325
import org.elasticsearch.core.Nullable;
2426
import org.elasticsearch.core.Tuple;
2527
import org.elasticsearch.index.Index;
28+
import org.elasticsearch.index.IndexNotFoundException;
2629
import org.elasticsearch.index.shard.ShardId;
30+
import org.elasticsearch.logging.LogManager;
31+
import org.elasticsearch.logging.Logger;
2732

2833
import java.util.ArrayDeque;
2934
import java.util.ArrayList;
@@ -44,6 +49,8 @@
4449
import java.util.stream.Stream;
4550
import java.util.stream.StreamSupport;
4651

52+
import static org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY;
53+
4754
/**
4855
* {@link RoutingNodes} represents a copy the routing information contained in the {@link ClusterState cluster state}.
4956
* It can be either initialized as mutable or immutable allowing or disallowing changes to its elements.
@@ -60,6 +67,13 @@
6067
*/
6168
public class RoutingNodes implements Iterable<RoutingNode> {
6269

70+
private static final Logger logger = LogManager.getLogger(RoutingNodes.class);
71+
public static final String RESET_FAILED_ALLOCATION_COUNTER_LOG_MSG =
72+
"Resetting failure counter for %d shard(s) that have reached their max allocation retires (%s)";
73+
public static final String RESET_FAILED_RELOCATION_COUNTER_LOG_MSG =
74+
"Resetting failure counter for %d shard(s) that have reached their max relocation retries (%s)";
75+
private static final int MAX_SHARDS_IN_LOG_MSG = 20;
76+
6377
private final Map<String, RoutingNode> nodesToShards;
6478

6579
private final UnassignedShards unassignedShards;
@@ -1309,14 +1323,36 @@ public boolean hasRelocationFailures() {
13091323
return false;
13101324
}
13111325

1312-
public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
1326+
public void resetFailedCounter(RoutingAllocation allocation) {
1327+
final var observer = allocation.changes();
1328+
int shardsWithMaxFailedAllocations = 0;
1329+
int shardsWithMaxFailedRelocations = 0;
1330+
List<ShardId> topShardIdsWithFailedAllocations = new ArrayList<>();
1331+
List<ShardId> topShardIdsWithFailedRelocations = new ArrayList<>();
1332+
13131333
final var unassignedIterator = unassigned().iterator();
13141334
while (unassignedIterator.hasNext()) {
13151335
ShardRouting shardRouting = unassignedIterator.next();
13161336
UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
1337+
int failedAllocations = unassignedInfo.failedAllocations();
1338+
if (failedAllocations > 0) {
1339+
try {
1340+
final var maxRetry = SETTING_ALLOCATION_MAX_RETRY.get(
1341+
allocation.metadata().getIndexSafe(shardRouting.index()).getSettings()
1342+
);
1343+
if (failedAllocations >= maxRetry) {
1344+
shardsWithMaxFailedAllocations++;
1345+
if (topShardIdsWithFailedAllocations.size() <= MAX_SHARDS_IN_LOG_MSG) {
1346+
topShardIdsWithFailedAllocations.add(shardRouting.shardId());
1347+
}
1348+
}
1349+
} catch (IndexNotFoundException e) {
1350+
// ignore
1351+
}
1352+
}
13171353
unassignedIterator.updateUnassigned(
13181354
new UnassignedInfo(
1319-
unassignedInfo.failedAllocations() > 0 ? UnassignedInfo.Reason.MANUAL_ALLOCATION : unassignedInfo.reason(),
1355+
failedAllocations > 0 ? UnassignedInfo.Reason.MANUAL_ALLOCATION : unassignedInfo.reason(),
13201356
unassignedInfo.message(),
13211357
unassignedInfo.failure(),
13221358
0,
@@ -1328,7 +1364,7 @@ public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
13281364
unassignedInfo.lastAllocatedNodeId()
13291365
),
13301366
shardRouting.recoverySource(),
1331-
routingChangesObserver
1367+
observer
13321368
);
13331369
}
13341370

@@ -1337,6 +1373,20 @@ public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
13371373
for (ShardRouting shardRouting : routingNode) {
13381374
if (shardRouting.relocationFailureInfo() != null && shardRouting.relocationFailureInfo().failedRelocations() > 0) {
13391375
shardsWithRelocationFailures.add(shardRouting);
1376+
try {
1377+
int failedRelocations = shardRouting.relocationFailureInfo().failedRelocations();
1378+
final var maxRetry = SETTING_ALLOCATION_MAX_RETRY.get(
1379+
allocation.metadata().getIndexSafe(shardRouting.index()).getSettings()
1380+
);
1381+
if (failedRelocations >= maxRetry) {
1382+
shardsWithMaxFailedRelocations++;
1383+
if (topShardIdsWithFailedRelocations.size() <= MAX_SHARDS_IN_LOG_MSG) {
1384+
topShardIdsWithFailedRelocations.add(shardRouting.shardId());
1385+
}
1386+
}
1387+
} catch (IndexNotFoundException e) {
1388+
// ignore
1389+
}
13401390
}
13411391
}
13421392

@@ -1347,6 +1397,17 @@ public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
13471397
assignedShardsAdd(updated);
13481398
}
13491399
}
1400+
1401+
if (shardsWithMaxFailedAllocations > 0) {
1402+
logger.info(
1403+
Strings.format(RESET_FAILED_ALLOCATION_COUNTER_LOG_MSG, shardsWithMaxFailedAllocations, topShardIdsWithFailedAllocations)
1404+
);
1405+
}
1406+
if (shardsWithMaxFailedRelocations > 0) {
1407+
logger.info(
1408+
Strings.format(RESET_FAILED_RELOCATION_COUNTER_LOG_MSG, shardsWithMaxFailedRelocations, topShardIdsWithFailedRelocations)
1409+
);
1410+
}
13501411
}
13511412

13521413
/**

server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -639,7 +639,7 @@ private boolean shouldResetAllocationFailures(ClusterChangedEvent changeEvent) {
639639

640640
private ClusterState rerouteWithResetFailedCounter(ClusterState clusterState) {
641641
RoutingAllocation allocation = createRoutingAllocation(clusterState, currentNanoTime());
642-
allocation.routingNodes().resetFailedCounter(allocation.changes());
642+
allocation.routingNodes().resetFailedCounter(allocation);
643643
reroute(allocation, routingAllocation -> shardsAllocator.allocate(routingAllocation, ActionListener.noop()));
644644
return buildResultAndLogHealthChange(clusterState, allocation, "reroute with reset failed counter");
645645
}

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardsAllocator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ default RoutingExplanations execute(RoutingAllocation allocation, AllocationComm
6262

6363
try {
6464
if (retryFailed) {
65-
allocation.routingNodes().resetFailedCounter(allocation.changes());
65+
allocation.routingNodes().resetFailedCounter(allocation);
6666
}
6767
return commands.execute(allocation, explain);
6868
} finally {

0 commit comments

Comments
 (0)