Skip to content

Commit d83d19a

Browse files
authored
Fix SingleNodeDiscoveryStabilisesEvenWhenDisrupted (elastic#91325)
This change increases the election duration when message delays are high, to avoid a potentially endless repetition of voting rounds. Fixes elastic#89867
1 parent d37cae2 commit d83d19a

File tree

3 files changed

+24
-19
lines changed

3 files changed

+24
-19
lines changed

server/src/main/java/org/elasticsearch/cluster/coordination/ElectionSchedulerFactory.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,15 +77,15 @@ public class ElectionSchedulerFactory {
7777
ELECTION_MAX_TIMEOUT_SETTING_KEY,
7878
TimeValue.timeValueSeconds(10),
7979
TimeValue.timeValueMillis(200),
80-
TimeValue.timeValueSeconds(300),
80+
TimeValue.timeValueSeconds(600),
8181
Property.NodeScope
8282
);
8383

8484
public static final Setting<TimeValue> ELECTION_DURATION_SETTING = Setting.timeSetting(
8585
ELECTION_DURATION_SETTING_KEY,
8686
TimeValue.timeValueMillis(500),
8787
TimeValue.timeValueMillis(1),
88-
TimeValue.timeValueSeconds(300),
88+
TimeValue.timeValueSeconds(600),
8989
Property.NodeScope
9090
);
9191

server/src/test/java/org/elasticsearch/cluster/coordination/CoordinatorTests.java

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1904,23 +1904,28 @@ public void testSingleNodeDiscoveryWithQuorum() {
19041904
}
19051905
}
19061906

1907-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/89867")
19081907
public void testSingleNodeDiscoveryStabilisesEvenWhenDisrupted() {
1909-
try (
1910-
Cluster cluster = new Cluster(
1911-
1,
1912-
randomBoolean(),
1913-
Settings.builder().put(DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey(), DiscoveryModule.SINGLE_NODE_DISCOVERY_TYPE).build()
1914-
)
1915-
) {
1916-
1917-
// A cluster using single-node discovery should not apply any timeouts to joining or cluster state publication. There are no
1918-
// other options, so there's no point in failing and retrying from scratch no matter how badly disrupted we are and we may as
1919-
// well just wait.
1908+
// A cluster using single-node discovery should not apply any timeouts to joining or cluster state publication. There are no
1909+
// other options, so there's no point in failing and retrying from scratch no matter how badly disrupted we are and we may as
1910+
// well just wait.
1911+
1912+
// larger variability is good for checking that we don't time out, but smaller variability also tightens up the time bound
1913+
// within which we expect to converge, so use a mix of both
1914+
final long delayVariabilityMillis = randomLongBetween(DEFAULT_DELAY_VARIABILITY, TimeValue.timeValueMinutes(10).millis());
1915+
1916+
Settings.Builder settings = Settings.builder()
1917+
.put(DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey(), DiscoveryModule.SINGLE_NODE_DISCOVERY_TYPE);
1918+
1919+
// If the delay variability is high, set election duration accordingly, to avoid the possible endless repetition of voting rounds.
1920+
// Note that elections could take even longer than the delay variability, but this seems to be long enough to avoid bad collisions.
1921+
if (ElectionSchedulerFactory.ELECTION_DURATION_SETTING.getDefault(Settings.EMPTY).getMillis() < delayVariabilityMillis) {
1922+
settings = settings.put(
1923+
ElectionSchedulerFactory.ELECTION_DURATION_SETTING.getKey(),
1924+
TimeValue.timeValueMillis(delayVariabilityMillis)
1925+
);
1926+
}
19201927

1921-
// larger variability is good for checking that we don't time out, but smaller variability also tightens up the time bound
1922-
// within which we expect to converge, so use a mix of both
1923-
final long delayVariabilityMillis = randomLongBetween(DEFAULT_DELAY_VARIABILITY, TimeValue.timeValueMinutes(10).millis());
1928+
try (Cluster cluster = new Cluster(1, randomBoolean(), settings.build())) {
19241929
if (randomBoolean()) {
19251930
cluster.runRandomly(true, false, delayVariabilityMillis);
19261931
}

server/src/test/java/org/elasticsearch/cluster/coordination/ElectionSchedulerFactoryTests.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,9 @@ public void testSettingsValidation() {
192192
}
193193

194194
{
195-
final Settings settings = Settings.builder().put(ELECTION_MAX_TIMEOUT_SETTING.getKey(), "301s").build();
195+
final Settings settings = Settings.builder().put(ELECTION_MAX_TIMEOUT_SETTING.getKey(), "601s").build();
196196
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> ELECTION_MAX_TIMEOUT_SETTING.get(settings));
197-
assertThat(e.getMessage(), is("failed to parse value [301s] for setting [cluster.election.max_timeout], must be <= [300s]"));
197+
assertThat(e.getMessage(), is("failed to parse value [601s] for setting [cluster.election.max_timeout], must be <= [600s]"));
198198
}
199199

200200
{

0 commit comments

Comments
 (0)