Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/120458.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 120458
summary: Do not recommend increasing `max_shards_per_node`
area: Health
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ public enum ReferenceDocs {
ALLOCATION_EXPLAIN_NO_COPIES,
ALLOCATION_EXPLAIN_MAX_RETRY,
SECURE_SETTINGS,
CLUSTER_SHARD_LIMIT,
// this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.ReferenceDocs;
import org.elasticsearch.common.TriFunction;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
Expand Down Expand Up @@ -54,21 +54,18 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
"The cluster is running low on room to add new shards. Adding data to new indices is at risk";
private static final String INDEX_CREATION_RISK =
"The cluster is running low on room to add new shards. Adding data to new indices might soon fail.";
private static final String HELP_GUIDE = "https://ela.st/fix-shards-capacity";
private static final TriFunction<String, Setting<?>, String, Diagnosis> SHARD_MAX_CAPACITY_REACHED_FN = (
id,
setting,
indexType) -> new Diagnosis(
new Diagnosis.Definition(
NAME,
id,
"Elasticsearch is about to reach the maximum number of shards it can host, based on your current settings.",
"Increase the value of ["
+ setting.getKey()
+ "] cluster setting or remove "
"Elasticsearch is about to reach the maximum number of shards it can host as set by [" + setting.getKey() + "].",
"Increase the number of nodes in your cluster or remove some "
+ indexType
+ " indices to clear up resources.",
HELP_GUIDE
+ " indices to reduce the number of shards in the cluster.",
ReferenceDocs.CLUSTER_SHARD_LIMIT.toString()
),
null
);
Expand All @@ -82,22 +79,20 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
new HealthIndicatorImpact(NAME, "creation_of_new_indices_at_risk", 2, INDEX_CREATION_RISK, List.of(ImpactArea.INGEST))
);
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_DATA_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
"increase_max_shards_per_node",
"decrease_shards_per_non_frozen_node",
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE,
"data"
"non-frozen"
);
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
"increase_max_shards_per_node_frozen",
"decrease_shards_per_frozen_node",
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN,
Comment on lines 81 to 88
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bad "increase the limit" advice was baked into the actual diagnosis IDs - fixed here, and see also https://github.com/elastic/telemetry/pull/4362 for the corresponding change to the telemetry cluster

"frozen"
);

private final ClusterService clusterService;
private final FeatureService featureService;

public ShardsCapacityHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
public ShardsCapacityHealthIndicatorService(ClusterService clusterService) {
this.clusterService = clusterService;
this.featureService = featureService;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1352,7 +1352,7 @@ private Module loadDiagnosticServices(
new StableMasterHealthIndicatorService(coordinationDiagnosticsService, clusterService),
new RepositoryIntegrityHealthIndicatorService(clusterService),
new DiskHealthIndicatorService(clusterService, featureService),
new ShardsCapacityHealthIndicatorService(clusterService, featureService),
new ShardsCapacityHealthIndicatorService(clusterService),
fileSettingsHealthIndicatorService
);
var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ CIRCUIT_BREAKER_ERRORS circuit-breaker-
ALLOCATION_EXPLAIN_NO_COPIES cluster-allocation-explain.html#no-valid-shard-copy
ALLOCATION_EXPLAIN_MAX_RETRY cluster-allocation-explain.html#maximum-number-of-retries-exceeded
SECURE_SETTINGS secure-settings.html
CLUSTER_SHARD_LIMIT misc-cluster-settings.html#cluster-shard-limit
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthStatus;
import org.elasticsearch.health.metadata.HealthMetadata;
Expand All @@ -39,7 +38,6 @@
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.mockito.Mockito;

import java.io.IOException;
import java.util.List;
Expand All @@ -61,10 +59,13 @@
import static org.elasticsearch.indices.ShardLimitValidator.FROZEN_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.INDEX_SETTING_SHARD_LIMIT_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.NORMAL_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN;
import static org.hamcrest.Matchers.allOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.mockito.ArgumentMatchers.any;

public class ShardsCapacityHealthIndicatorServiceTests extends ESTestCase {

Expand All @@ -73,7 +74,6 @@ public class ShardsCapacityHealthIndicatorServiceTests extends ESTestCase {
private static ThreadPool threadPool;

private ClusterService clusterService;
private FeatureService featureService;
private DiscoveryNode dataNode;
private DiscoveryNode frozenNode;

Expand All @@ -92,9 +92,6 @@ public void setUp() throws Exception {
.build();

clusterService = ClusterServiceUtils.createClusterService(threadPool);

featureService = Mockito.mock(FeatureService.class);
Mockito.when(featureService.clusterHasFeature(any(), any())).thenReturn(true);
}

@After
Expand Down Expand Up @@ -122,7 +119,7 @@ public void testNoShardsCapacityMetadata() throws IOException {
createIndexInDataNode(100)
)
);
var target = new ShardsCapacityHealthIndicatorService(clusterService, featureService);
var target = new ShardsCapacityHealthIndicatorService(clusterService);
var indicatorResult = target.calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), HealthStatus.UNKNOWN);
Expand All @@ -136,10 +133,7 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
int maxShardsPerNode = randomValidMaxShards();
int maxShardsPerNodeFrozen = randomValidMaxShards();
var clusterService = createClusterService(maxShardsPerNode, maxShardsPerNodeFrozen, createIndexInDataNode(maxShardsPerNode / 4));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), HealthStatus.GREEN);
assertTrue(indicatorResult.impacts().isEmpty());
Expand All @@ -158,15 +152,36 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
);
}

public void testDiagnoses() {
assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().indicatorName());
assertEquals("decrease_shards_per_non_frozen_node", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().id());
assertThat(
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().cause(),
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey()))
);
assertThat(
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().action(),
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some non-frozen indices"))
);

assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().indicatorName());
assertEquals("decrease_shards_per_frozen_node", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().id());
assertThat(
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().cause(),
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey()))
);
assertThat(
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().action(),
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some frozen indices"))
);
}

public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOException {
{
// Only data_nodes does not have enough space
int maxShardsPerNodeFrozen = randomValidMaxShards();
var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(4));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), YELLOW);
assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
Expand All @@ -189,10 +204,7 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep
// Only frozen_nodes does not have enough space
int maxShardsPerNode = randomValidMaxShards();
var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(4));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), YELLOW);
assertEquals(
Expand All @@ -217,10 +229,7 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep
{
// Both data and frozen nodes does not have enough space
var clusterService = createClusterService(25, 25, createIndexInDataNode(4), createIndexInFrozenNode(4));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), YELLOW);
assertEquals(
Expand Down Expand Up @@ -251,10 +260,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
// Only data_nodes does not have enough space
int maxShardsPerNodeFrozen = randomValidMaxShards();
var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), RED);
assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
Expand All @@ -277,10 +283,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
// Only frozen_nodes does not have enough space
int maxShardsPerNode = randomValidMaxShards();
var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(11));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), RED);
assertEquals(
Expand All @@ -305,10 +308,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
{
// Both data and frozen nodes does not have enough space
var clusterService = createClusterService(25, 25, createIndexInDataNode(11), createIndexInFrozenNode(11));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), RED);
assertEquals(
Expand Down Expand Up @@ -377,22 +377,19 @@ public void testCalculateMethods() {
public void testMappedFieldsForTelemetry() {
assertEquals(ShardsCapacityHealthIndicatorService.NAME, "shards_capacity");
assertEquals(
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node",
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_non_frozen_node",
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().getUniqueId()
);
assertEquals(
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node_frozen",
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_frozen_node",
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().getUniqueId()
);
}

public void testSkippingFieldsWhenVerboseIsFalse() {
int maxShardsPerNodeFrozen = randomValidMaxShards();
var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
false,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(false, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), RED);
assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
Expand Down Expand Up @@ -441,7 +438,7 @@ private ClusterState createClusterState(
var metadata = Metadata.builder()
.persistentSettings(
Settings.builder()
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey(), maxShardsPerNodeFrozen)
.build()
);
Expand Down