Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
0807692
basic infra to evaluate rules
May 10, 2023
de75d4b
return an indicator in the case an index is stuck
May 10, 2023
915dab7
dummy conditions
May 10, 2023
ed478b9
separate impacts of ILM stopped and index stuck
May 10, 2023
ee9fbd5
summarize details of stuck indices
May 11, 2023
8076937
convert to TimeValue
May 11, 2023
e329450
dummy actions
May 11, 2023
84fce73
add diagnoses per ILM-action
May 11, 2023
36479a6
lift up to a class the logic to find stuck indices
May 11, 2023
0baabaa
add test
May 12, 2023
932361b
tests for StuckIndicesFinder
May 12, 2023
d8b7d2d
missing test
May 12, 2023
f244f85
spotless
May 12, 2023
0ab528d
create Rules using the records
May 12, 2023
e697f0b
fix message
May 15, 2023
b98bc25
Update docs/changelog/96092.yaml
HiDAl May 15, 2023
a5259bd
fix test after message change
May 15, 2023
8972075
add missing test for mapped fields
May 15, 2023
4c106bc
fix wording using PR comments
May 16, 2023
5aaeba9
use only clusterService's metadata to get the indices
May 16, 2023
d6b378c
Merge remote-tracking branch 'elastic/main' into ilm-indicator
May 16, 2023
7a07ad3
reword stuck to stagnating
May 16, 2023
f99de49
fix tests
May 16, 2023
01219ef
group diagnostic only by action
May 16, 2023
0477bba
fix condition order
May 16, 2023
cb583ef
spotless over and over again
May 16, 2023
b571205
fix test
May 16, 2023
36a0176
Merge branch 'main' into ilm-indicator
elasticmachine May 16, 2023
e25ee88
Merge remote-tracking branch 'elastic/main' into ilm-indicator
May 22, 2023
957c661
Merge remote-tracking branch 'elastic/main' into ilm-indicator
May 24, 2023
706284d
sort the affected resources
May 24, 2023
d37d03c
change to use a treeset
May 24, 2023
85ad50d
tackle some PR comments
May 29, 2023
ee4dab4
get rid of IndexIlmState record
May 29, 2023
ba066f2
Merge remote-tracking branch 'elastic/main' into ilm-indicator
May 29, 2023
a5aae7c
fix stagnatingIndicesFinder tests
May 29, 2023
21e204f
fix IlmHealthIndicatorServiceTests
May 29, 2023
eb0240b
change the rules arch
May 29, 2023
dc15d7e
fix tests
May 29, 2023
99153ef
improve API surface for RuleConfigs
May 31, 2023
f333627
removes unneeded indirection layetr
Jun 1, 2023
2bac0dc
add tests for RuleConfig
Jun 1, 2023
a633c7b
add test for RuleConfigBuilder
Jun 5, 2023
96763cf
stepRule receives the maxTimeOn
Jun 5, 2023
aecf96f
transform back to a class `StagnatingIndicesFinder`
Jun 5, 2023
90de8f6
Merge branch 'main' into ilm-indicator
elasticmachine Jun 7, 2023
22ae060
Merge remote-tracking branch 'elastic/main' into ilm-indicator
Jun 8, 2023
1539f90
add javadoc to Rule DSL builder
Jun 8, 2023
3655292
DELETE action does not check maxTimeOn
Jun 8, 2023
f2f1cd7
fix rules according PR comments
Jun 8, 2023
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/96092.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 96092
summary: Enhance ILM Health Indicator
area: ILM+SLM
type: feature
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ public enum Type {
INDEX("indices"),
NODE("nodes"),
SLM_POLICY("slm_policies"),
ILM_POLICY("ilm_policies"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're adding a new resource type we'll have to coordinate with the UI work needed to support this (if we release it without UI support we could end up with the health page signaling RED health without any affected resource being displayed)

Copy link
Author

@HiDAl HiDAl May 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I created an issue for the team in charge

FEATURE_STATE("feature_states"),
SNAPSHOT_REPOSITORY("snapshot_repositories");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@
/**
* Wait Step for index based on color. Optionally derives the index name using the provided prefix (if any).
*/
class WaitForIndexColorStep extends ClusterStateWaitStep {
public class WaitForIndexColorStep extends ClusterStateWaitStep {

static final String NAME = "wait-for-index-color";
public static final String NAME = "wait-for-index-color";

private static final Logger logger = LogManager.getLogger(WaitForIndexColorStep.class);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public class WaitForNoFollowersStep extends AsyncWaitStep {

private static final Logger logger = LogManager.getLogger(WaitForNoFollowersStep.class);

static final String NAME = "wait-for-shard-history-leases";
public static final String NAME = "wait-for-shard-history-leases";
static final String CCR_LEASE_KEY = "ccr";
private static final String WAIT_MESSAGE = "this index is a leader index; waiting for all following indices to cease "
+ "following before proceeding";
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,16 @@ public Collection<Object> createComponents(
);
snapshotRetentionService.get().init(clusterService);
components.addAll(Arrays.asList(snapshotLifecycleService.get(), snapshotHistoryStore.get(), snapshotRetentionService.get()));
ilmHealthIndicatorService.set(new IlmHealthIndicatorService(clusterService));
ilmHealthIndicatorService.set(
new IlmHealthIndicatorService(
clusterService,
new IlmHealthIndicatorService.StagnatingIndicesFinder(
clusterService,
IlmHealthIndicatorService.ILM_RULE_EVALUATOR,
System::currentTimeMillis
)
)
);
slmHealthIndicatorService.set(new SlmHealthIndicatorService(clusterService));
reservedLifecycleAction.set(new ReservedLifecycleAction(xContentRegistry, client, XPackPlugin.getSharedLicenseState()));

Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.ilm;

import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.test.ESTestCase;

import java.util.Map;

import static org.elasticsearch.cluster.metadata.LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY;
import static org.mockito.Mockito.mock;

public class RuleConfigTests extends ESTestCase {
public void testActionRuleConfig() {
var actionName = randomAlphaOfLength(30);
assertTrue(
new IlmHealthIndicatorService.ActionRule(actionName, null).test(
randomNonNegativeLong(),
metadataAction(actionName, randomNonNegativeLong())
)
);
assertFalse(
new IlmHealthIndicatorService.ActionRule(actionName, null).test(
randomNonNegativeLong(),
metadataAction(randomAlphaOfLength(30), randomNonNegativeLong())
)
);

var maxTimeOn = TimeValue.parseTimeValue(randomTimeValue(), "");
var rule = new IlmHealthIndicatorService.ActionRule(actionName, maxTimeOn);
var now = System.currentTimeMillis();

assertFalse(rule.test(now, metadataAction(randomAlphaOfLength(30), now)));
assertFalse(rule.test(now, metadataAction(actionName, now)));
assertTrue(rule.test(now, metadataAction(actionName, now - maxTimeOn.millis() - randomIntBetween(1000, 100000))));
}

public void testStepRuleConfig() {
var stepName = randomAlphaOfLength(30);
var maxTimeOn = TimeValue.parseTimeValue(randomTimeValue(), "");
var maxRetries = randomIntBetween(11, 100);
var rule = new IlmHealthIndicatorService.StepRule(stepName, maxTimeOn, maxRetries);
var now = System.currentTimeMillis();

// rule is not for this step
assertFalse(rule.test(now, metadataStep(randomAlphaOfLength(30), now, maxRetries + 1)));

// step still has time to run && can continue retrying
assertFalse(rule.test(now, metadataStep(stepName, now, maxRetries - randomIntBetween(0, 10))));

// step still has run longer than expected
assertTrue(
rule.test(
now,
metadataStep(stepName, now - maxTimeOn.millis() - randomIntBetween(1000, 100000), maxRetries - randomIntBetween(0, 10))
)
);

// step still has time to run but have retried more than expected
assertTrue(rule.test(now, metadataStep(stepName, now, maxRetries + randomIntBetween(1, 10))));
}

public void testRuleChaining() {
var mockedMd = mock(IndexMetadata.class);
var someLong = randomLong();

assertTrue(ruleAlwaysReturn(true).test(someLong, mockedMd));
assertFalse(ruleAlwaysReturn(false).test(someLong, mockedMd));

// and
assertTrue(ruleAlwaysReturn(true).and(ruleAlwaysReturn(true)).test(someLong, mockedMd));
assertFalse(ruleAlwaysReturn(true).and(ruleAlwaysReturn(false)).test(someLong, mockedMd));
assertFalse(ruleAlwaysReturn(false).and(ruleAlwaysReturn(false)).test(someLong, mockedMd));
assertFalse(ruleAlwaysReturn(false).and(ruleAlwaysReturn(true)).test(someLong, mockedMd));

// or
assertTrue(ruleAlwaysReturn(true).or(ruleAlwaysReturn(true)).test(someLong, mockedMd));
assertTrue(ruleAlwaysReturn(true).or(ruleAlwaysReturn(false)).test(someLong, mockedMd));
assertFalse(ruleAlwaysReturn(false).or(ruleAlwaysReturn(false)).test(someLong, mockedMd));
assertTrue(ruleAlwaysReturn(false).or(ruleAlwaysReturn(true)).test(someLong, mockedMd));
}

public void testGetElapsedTime() {
var a = randomLongBetween(1000, 2000);
var b = a - randomLongBetween(0, 500);
assertEquals(IlmHealthIndicatorService.RuleConfig.getElapsedTime(a, b), TimeValue.timeValueMillis(a - b));
assertEquals(IlmHealthIndicatorService.RuleConfig.getElapsedTime(a, null), TimeValue.ZERO);
}

public void testRuleConfigBuilder() {
var now = System.currentTimeMillis();
var lastExecutionTime = System.currentTimeMillis() - TimeValue.timeValueDays(2).millis();
var maxTimeOnStep = TimeValue.timeValueDays(1);
var expectedAction = "some-action";
var rules = IlmHealthIndicatorService.RuleConfig.Builder.actionRule(expectedAction)
.maxTimeOnAction(TimeValue.timeValueDays(1))
.stepRules(
IlmHealthIndicatorService.StepRule.stepRule("step-1", maxTimeOnStep),
IlmHealthIndicatorService.StepRule.stepRule("step-2", maxTimeOnStep),
IlmHealthIndicatorService.StepRule.stepRule("step-3", maxTimeOnStep)
);

// An unknown action should not satisfy the conditions
assertFalse(rules.test(now, metadataAlwaysOutOfRule(lastExecutionTime, randomAlphaOfLength(30), "step-1")));
// The following 3 steps should satisfy the conditions
assertTrue(rules.test(now, metadataAlwaysOutOfRule(lastExecutionTime, expectedAction, "step-1")));
assertTrue(rules.test(now, metadataAlwaysOutOfRule(lastExecutionTime, expectedAction, "step-2")));
assertTrue(rules.test(now, metadataAlwaysOutOfRule(lastExecutionTime, expectedAction, "step-3")));
// An unknown step should not satisfy the conditions
assertFalse(rules.test(now, metadataAlwaysOutOfRule(lastExecutionTime, expectedAction, randomAlphaOfLength(30))));

rules = IlmHealthIndicatorService.RuleConfig.Builder.actionRule(expectedAction)
.maxTimeOnAction(TimeValue.timeValueDays(1))
.noStepRules();

assertTrue(rules.test(now, metadataAlwaysOutOfRule(lastExecutionTime, expectedAction, randomAlphaOfLength(30))));
// An unknown action and step should satisfy the conditions
assertFalse(rules.test(now, metadataAlwaysOutOfRule(lastExecutionTime, randomAlphaOfLength(30), randomAlphaOfLength(30))));
}

private IlmHealthIndicatorService.RuleConfig ruleAlwaysReturn(boolean shouldReturn) {
return (now, indexMetadata) -> shouldReturn;
}

private IndexMetadata metadataAlwaysOutOfRule(long lastExecutionTime, String action, String step) {
return baseBuilder().putCustom(
ILM_CUSTOM_METADATA_KEY,
Map.of(
"action",
action,
"action_time",
String.valueOf(lastExecutionTime),
"step",
step,
"step_time",
String.valueOf(lastExecutionTime),
"failed_step_retry_count",
"200"
)
).build();
}

private IndexMetadata metadataStep(String currentStep, long stepTime, long stepRetries) {
return baseBuilder().putCustom(
ILM_CUSTOM_METADATA_KEY,
Map.of("step", currentStep, "step_time", String.valueOf(stepTime), "failed_step_retry_count", String.valueOf(stepRetries))
).build();
}

private IndexMetadata metadataAction(String currentAction, long actionTime) {
return baseBuilder().putCustom(ILM_CUSTOM_METADATA_KEY, Map.of("action", currentAction, "action_time", String.valueOf(actionTime)))
.build();
}

private IndexMetadata.Builder baseBuilder() {
return IndexMetadata.builder("some-index")
.settings(settings(Version.CURRENT))
.numberOfShards(randomIntBetween(1, 5))
.numberOfReplicas(randomIntBetween(0, 5));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.ilm;

import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.LifecycleExecutionState;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.core.ilm.LifecycleSettings;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.LongSupplier;
import java.util.stream.IntStream;

import static org.elasticsearch.cluster.metadata.LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY;
import static org.elasticsearch.xpack.ilm.IlmHealthIndicatorService.isStagnated;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.hasSize;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

public class StagnatingIndicesFinderTests extends ESTestCase {

public void testStagnatingIndicesFinder() {
var idxMd1 = randomIndexMetadata();
var idxMd2 = randomIndexMetadata();
var idxMd3 = randomIndexMetadata();
var stagnatingIndices = List.of(idxMd1.indexName, idxMd3.indexName);
var mockedTimeSupplier = mock(LongSupplier.class);
var instant = (long) randomIntBetween(100000, 200000);
var ruleEvaluator = List.<IlmHealthIndicatorService.RuleConfig>of(
(now, indexMetadata) -> now == instant && stagnatingIndices.contains(indexMetadata.getIndex().getName())
);
// Per the evaluator, the timeSupplier _must_ be called only twice
when(mockedTimeSupplier.getAsLong()).thenReturn(instant, instant);

var stagnatedIdx1 = indexMetadataFrom(idxMd1);
var stagnatedIdx3 = indexMetadataFrom(idxMd3);
var finder = createStagnatingIndicesFinder(
ruleEvaluator,
mockedTimeSupplier,
indexMetadataUnmanaged(randomAlphaOfLength(10)), // non-managed by ILM
stagnatedIdx1, // should be stagnated
indexMetadataFrom(idxMd2), // won't be stagnated
stagnatedIdx3, // should be stagnated
indexMetadataUnmanaged(randomAlphaOfLength(10)) // non-managed by ILM
);

var foundIndices = finder.find();

assertThat(foundIndices, hasSize(2));
assertThat(foundIndices, containsInAnyOrder(stagnatedIdx1, stagnatedIdx3));
}

public void testStagnatingIndicesEvaluator() {
var idxMd1 = randomIndexMetadata();
var indexMetadata = indexMetadataFrom(idxMd1);
Long moment = 111333111222L;
{
// no rule matches
var executions = randomIntBetween(3, 200);
var calls = new AtomicInteger(0);
var rules = IntStream.range(0, executions).mapToObj(i -> (IlmHealthIndicatorService.RuleConfig) (now, idxMd) -> {
assertEquals(now, moment);
assertSame(idxMd, indexMetadata);
calls.incrementAndGet();
return false;
}).toList();
assertFalse(isStagnated(rules, moment, indexMetadata));
assertEquals(calls.get(), executions);
}
{
var calls = new AtomicReference<>(new ArrayList<Integer>());
var rules = List.<IlmHealthIndicatorService.RuleConfig>of((now, idxMd) -> { // will be called
assertEquals(now, moment);
assertSame(idxMd, indexMetadata);
calls.get().add(1);
return false;
}, (now, idxMd) -> { // will be called and cut the execution
assertEquals(now, moment);
assertSame(idxMd, indexMetadata);
calls.get().add(2);
return true;
}, (now, idxMd) -> { // won't be called
assertEquals(now, moment);
assertSame(idxMd, indexMetadata);
calls.get().add(3);
return true;
}, (now, idxMd) -> { // won't be called
assertEquals(now, moment);
assertSame(idxMd, indexMetadata);
calls.get().add(4);
return false;
});

assertTrue(isStagnated(rules, moment, indexMetadata));
assertEquals(calls.get(), List.of(1, 2));
}
}

private static IndexMetadata indexMetadataUnmanaged(String indexName) {
return indexMetadataFrom(new IndexMetadataTestCase(indexName, null, null));
}

private static IndexMetadata indexMetadataFrom(IndexMetadataTestCase indexMetadataTestCase) {
var settings = settings(Version.CURRENT);
var indexMetadataBuilder = IndexMetadata.builder(indexMetadataTestCase.indexName);

if (indexMetadataTestCase.ilmState != null) {
settings.put(LifecycleSettings.LIFECYCLE_NAME, indexMetadataTestCase.policyName);
indexMetadataBuilder.putCustom(ILM_CUSTOM_METADATA_KEY, indexMetadataTestCase.ilmState.asMap());
}

return indexMetadataBuilder.settings(settings)
.numberOfShards(randomIntBetween(1, 5))
.numberOfReplicas(randomIntBetween(0, 5))
.build();
}

private IlmHealthIndicatorService.StagnatingIndicesFinder createStagnatingIndicesFinder(
Collection<IlmHealthIndicatorService.RuleConfig> evaluator,
LongSupplier timeSupplier,
IndexMetadata... indicesMetadata
) {
var clusterService = mock(ClusterService.class);
var state = mock(ClusterState.class);
var metadataBuilder = Metadata.builder();

Arrays.stream(indicesMetadata).forEach(im -> metadataBuilder.put(im, false));
when(state.metadata()).thenReturn(metadataBuilder.build());

when(clusterService.state()).thenReturn(state);

return new IlmHealthIndicatorService.StagnatingIndicesFinder(clusterService, evaluator, timeSupplier);
}

static IndexMetadataTestCase randomIndexMetadata() {
return new IndexMetadataTestCase(
randomAlphaOfLength(10),
randomAlphaOfLength(10),
LifecycleExecutionState.builder()
.setPhase(randomAlphaOfLength(5))
.setAction(randomAlphaOfLength(10))
.setActionTime((long) randomIntBetween(0, 10000))
.setStep(randomAlphaOfLength(20))
.setStepTime((long) randomIntBetween(0, 10000))
.setFailedStepRetryCount(randomIntBetween(0, 1000))
.build()
);
}

record IndexMetadataTestCase(String indexName, String policyName, LifecycleExecutionState ilmState) {}
}