16
16
import org .elasticsearch .cluster .node .DiscoveryNodes ;
17
17
import org .elasticsearch .cluster .routing .UnassignedInfo .AllocationStatus ;
18
18
import org .elasticsearch .cluster .routing .allocation .ExistingShardsAllocator ;
19
+ import org .elasticsearch .cluster .routing .allocation .RoutingAllocation ;
19
20
import org .elasticsearch .cluster .routing .allocation .allocator .DesiredBalanceMetrics ;
21
+ import org .elasticsearch .common .Strings ;
20
22
import org .elasticsearch .common .collect .Iterators ;
21
23
import org .elasticsearch .common .util .Maps ;
22
24
import org .elasticsearch .core .Assertions ;
23
25
import org .elasticsearch .core .Nullable ;
24
26
import org .elasticsearch .core .Tuple ;
25
27
import org .elasticsearch .index .Index ;
28
+ import org .elasticsearch .index .IndexNotFoundException ;
26
29
import org .elasticsearch .index .shard .ShardId ;
30
+ import org .elasticsearch .logging .LogManager ;
31
+ import org .elasticsearch .logging .Logger ;
27
32
28
33
import java .util .ArrayDeque ;
29
34
import java .util .ArrayList ;
44
49
import java .util .stream .Stream ;
45
50
import java .util .stream .StreamSupport ;
46
51
52
+ import static org .elasticsearch .cluster .routing .allocation .decider .MaxRetryAllocationDecider .SETTING_ALLOCATION_MAX_RETRY ;
53
+
47
54
/**
48
55
* {@link RoutingNodes} represents a copy the routing information contained in the {@link ClusterState cluster state}.
49
56
* It can be either initialized as mutable or immutable allowing or disallowing changes to its elements.
60
67
*/
61
68
public class RoutingNodes implements Iterable <RoutingNode > {
62
69
70
+ private static final Logger logger = LogManager .getLogger (RoutingNodes .class );
71
+ public static final String RESET_FAILED_ALLOCATION_COUNTER_LOG_MSG =
72
+ "Resetting failure counter for %d shard(s) that have reached their max allocation retires (%s)" ;
73
+ public static final String RESET_FAILED_RELOCATION_COUNTER_LOG_MSG =
74
+ "Resetting failure counter for %d shard(s) that have reached their max relocation retries (%s)" ;
75
+ private static final int MAX_SHARDS_IN_LOG_MSG = 20 ;
76
+
63
77
private final Map <String , RoutingNode > nodesToShards ;
64
78
65
79
private final UnassignedShards unassignedShards ;
@@ -1309,14 +1323,36 @@ public boolean hasRelocationFailures() {
1309
1323
return false ;
1310
1324
}
1311
1325
1312
- public void resetFailedCounter (RoutingChangesObserver routingChangesObserver ) {
1326
+ public void resetFailedCounter (RoutingAllocation allocation ) {
1327
+ final var observer = allocation .changes ();
1328
+ int shardsWithMaxFailedAllocations = 0 ;
1329
+ int shardsWithMaxFailedRelocations = 0 ;
1330
+ List <ShardId > topShardIdsWithFailedAllocations = new ArrayList <>();
1331
+ List <ShardId > topShardIdsWithFailedRelocations = new ArrayList <>();
1332
+
1313
1333
final var unassignedIterator = unassigned ().iterator ();
1314
1334
while (unassignedIterator .hasNext ()) {
1315
1335
ShardRouting shardRouting = unassignedIterator .next ();
1316
1336
UnassignedInfo unassignedInfo = shardRouting .unassignedInfo ();
1337
+ int failedAllocations = unassignedInfo .failedAllocations ();
1338
+ if (failedAllocations > 0 ) {
1339
+ try {
1340
+ final var maxRetry = SETTING_ALLOCATION_MAX_RETRY .get (
1341
+ allocation .metadata ().getIndexSafe (shardRouting .index ()).getSettings ()
1342
+ );
1343
+ if (failedAllocations >= maxRetry ) {
1344
+ shardsWithMaxFailedAllocations ++;
1345
+ if (topShardIdsWithFailedAllocations .size () <= MAX_SHARDS_IN_LOG_MSG ) {
1346
+ topShardIdsWithFailedAllocations .add (shardRouting .shardId ());
1347
+ }
1348
+ }
1349
+ } catch (IndexNotFoundException e ) {
1350
+ // ignore
1351
+ }
1352
+ }
1317
1353
unassignedIterator .updateUnassigned (
1318
1354
new UnassignedInfo (
1319
- unassignedInfo . failedAllocations () > 0 ? UnassignedInfo .Reason .MANUAL_ALLOCATION : unassignedInfo .reason (),
1355
+ failedAllocations > 0 ? UnassignedInfo .Reason .MANUAL_ALLOCATION : unassignedInfo .reason (),
1320
1356
unassignedInfo .message (),
1321
1357
unassignedInfo .failure (),
1322
1358
0 ,
@@ -1328,7 +1364,7 @@ public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
1328
1364
unassignedInfo .lastAllocatedNodeId ()
1329
1365
),
1330
1366
shardRouting .recoverySource (),
1331
- routingChangesObserver
1367
+ observer
1332
1368
);
1333
1369
}
1334
1370
@@ -1337,6 +1373,20 @@ public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
1337
1373
for (ShardRouting shardRouting : routingNode ) {
1338
1374
if (shardRouting .relocationFailureInfo () != null && shardRouting .relocationFailureInfo ().failedRelocations () > 0 ) {
1339
1375
shardsWithRelocationFailures .add (shardRouting );
1376
+ try {
1377
+ int failedRelocations = shardRouting .relocationFailureInfo ().failedRelocations ();
1378
+ final var maxRetry = SETTING_ALLOCATION_MAX_RETRY .get (
1379
+ allocation .metadata ().getIndexSafe (shardRouting .index ()).getSettings ()
1380
+ );
1381
+ if (failedRelocations >= maxRetry ) {
1382
+ shardsWithMaxFailedRelocations ++;
1383
+ if (topShardIdsWithFailedRelocations .size () <= MAX_SHARDS_IN_LOG_MSG ) {
1384
+ topShardIdsWithFailedRelocations .add (shardRouting .shardId ());
1385
+ }
1386
+ }
1387
+ } catch (IndexNotFoundException e ) {
1388
+ // ignore
1389
+ }
1340
1390
}
1341
1391
}
1342
1392
@@ -1347,6 +1397,17 @@ public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
1347
1397
assignedShardsAdd (updated );
1348
1398
}
1349
1399
}
1400
+
1401
+ if (shardsWithMaxFailedAllocations > 0 ) {
1402
+ logger .info (
1403
+ Strings .format (RESET_FAILED_ALLOCATION_COUNTER_LOG_MSG , shardsWithMaxFailedAllocations , topShardIdsWithFailedAllocations )
1404
+ );
1405
+ }
1406
+ if (shardsWithMaxFailedRelocations > 0 ) {
1407
+ logger .info (
1408
+ Strings .format (RESET_FAILED_RELOCATION_COUNTER_LOG_MSG , shardsWithMaxFailedRelocations , topShardIdsWithFailedRelocations )
1409
+ );
1410
+ }
1350
1411
}
1351
1412
1352
1413
/**
0 commit comments