- Notifications
You must be signed in to change notification settings - Fork 25.8k
Don't run async actions when ILM is stopped #133683
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
3b77520 930155f 6aadbb3 26b3dc4 370d966 0ed022c ebe7f23 80f5e81 02e978b File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -184,11 +184,14 @@ void onMaster(ClusterState clusterState) { | |
| maybeScheduleJob(); | ||
| | ||
| for (var projectId : clusterState.metadata().projects().keySet()) { | ||
| onMaster(clusterState.projectState(projectId)); | ||
| maybeRunAsyncActions(clusterState.projectState(projectId)); | ||
| } | ||
| } | ||
| | ||
| void onMaster(ProjectState state) { | ||
| /** | ||
| * Kicks off any async actions that may not have been run due to either master failover or ILM being manually stopped. | ||
| */ | ||
| private void maybeRunAsyncActions(ProjectState state) { | ||
| final ProjectMetadata projectMetadata = state.metadata(); | ||
| final IndexLifecycleMetadata currentMetadata = projectMetadata.custom(IndexLifecycleMetadata.TYPE); | ||
| if (currentMetadata != null) { | ||
| | @@ -198,67 +201,51 @@ void onMaster(ProjectState state) { | |
| } | ||
| | ||
| boolean safeToStop = true; // true until proven false by a run policy | ||
| | ||
| Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The changes in this method aren't stricly necessary, I just took this opportunity to clean this method up a bit, as it was becoming hard to read. If there are concerns with this refactor, I can revert these stylistic changes and stick to the bug fix. | ||
| // If we just became master, we need to kick off any async actions that | ||
| // may have not been run due to master rollover | ||
| for (IndexMetadata idxMeta : projectMetadata.indices().values()) { | ||
| if (projectMetadata.isIndexManagedByILM(idxMeta)) { | ||
| String policyName = idxMeta.getLifecyclePolicyName(); | ||
| final LifecycleExecutionState lifecycleState = idxMeta.getLifecycleExecutionState(); | ||
| StepKey stepKey = Step.getCurrentStepKey(lifecycleState); | ||
| | ||
| try { | ||
| if (OperationMode.STOPPING == currentMode) { | ||
| if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED.contains(stepKey.name())) { | ||
| logger.info( | ||
| "waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]", | ||
| idxMeta.getIndex().getName(), | ||
| policyName, | ||
| stepKey.name() | ||
| ); | ||
| lifecycleRunner.maybeRunAsyncAction(state, idxMeta, policyName, stepKey); | ||
| // ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop | ||
| safeToStop = false; | ||
| } else { | ||
| logger.info( | ||
| "skipping policy execution of step [{}] for index [{}] with policy [{}]" + " because ILM is stopping", | ||
| stepKey == null ? "n/a" : stepKey.name(), | ||
| idxMeta.getIndex().getName(), | ||
| policyName | ||
| ); | ||
| } | ||
| } else { | ||
| lifecycleRunner.maybeRunAsyncAction(state, idxMeta, policyName, stepKey); | ||
| } | ||
| } catch (Exception e) { | ||
| if (logger.isTraceEnabled()) { | ||
| logger.warn( | ||
| () -> format( | ||
| "async action execution failed during master election trigger" | ||
| + " for index [%s] with policy [%s] in step [%s], lifecycle state: [%s]", | ||
| idxMeta.getIndex().getName(), | ||
| policyName, | ||
| stepKey, | ||
| lifecycleState.asMap() | ||
| ), | ||
| e | ||
| ); | ||
| } else { | ||
| logger.warn( | ||
| () -> format( | ||
| "async action execution failed during master election trigger" | ||
| + " for index [%s] with policy [%s] in step [%s]", | ||
| idxMeta.getIndex().getName(), | ||
| policyName, | ||
| stepKey | ||
| ), | ||
| e | ||
| ); | ||
| if (projectMetadata.isIndexManagedByILM(idxMeta) == false) { | ||
| continue; | ||
| } | ||
| String policyName = idxMeta.getLifecyclePolicyName(); | ||
| final LifecycleExecutionState lifecycleState = idxMeta.getLifecycleExecutionState(); | ||
| StepKey stepKey = Step.getCurrentStepKey(lifecycleState); | ||
| | ||
| } | ||
| // Don't rethrow the exception, we don't want a failure for one index to be | ||
| // called to cause actions not to be triggered for further indices | ||
| try { | ||
| if (currentMode == OperationMode.RUNNING) { | ||
| lifecycleRunner.maybeRunAsyncAction(state, idxMeta, policyName, stepKey); | ||
| continue; | ||
| } | ||
| if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED.contains(stepKey.name())) { | ||
| ||
| logger.info( | ||
| "waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]", | ||
| idxMeta.getIndex().getName(), | ||
| policyName, | ||
| stepKey.name() | ||
| ); | ||
| lifecycleRunner.maybeRunAsyncAction(state, idxMeta, policyName, stepKey); | ||
| // ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop | ||
| safeToStop = false; | ||
| } else { | ||
| logger.info( | ||
| "skipping policy execution of step [{}] for index [{}] with policy [{}]" + " because ILM is stopping", | ||
| stepKey == null ? "n/a" : stepKey.name(), | ||
| idxMeta.getIndex().getName(), | ||
| policyName | ||
| ); | ||
| } | ||
| } catch (Exception e) { | ||
| String format = format( | ||
| "async action execution failed during master election trigger for index [%s] with policy [%s] in step [%s]", | ||
| idxMeta.getIndex().getName(), | ||
| policyName, | ||
| stepKey | ||
| ); | ||
| if (logger.isTraceEnabled()) { | ||
| format += format(", lifecycle state: [%s]", lifecycleState.asMap()); | ||
| } | ||
| logger.warn(format, e); | ||
| ||
| | ||
| // Don't rethrow the exception, we don't want a failure for one index to be | ||
| // called to cause actions not to be triggered for further indices | ||
| } | ||
| } | ||
| | ||
| | @@ -333,6 +320,20 @@ public void clusterChanged(ClusterChangedEvent event) { | |
| cancelJob(); | ||
| policyRegistry.clear(); | ||
| } | ||
| } else if (this.isMaster) { | ||
| // If we are the master and we were before, check if any projects changed their ILM mode from non-RUNNING to RUNNING. | ||
| // If so, kick off any async actions that may not have run while not in RUNNING mode. | ||
| for (ProjectMetadata project : event.state().metadata().projects().values()) { | ||
| final var previousProject = event.previousState().metadata().projects().get(project.id()); | ||
| if (previousProject == null || project == previousProject) { | ||
| continue; | ||
| } | ||
| final OperationMode currentMode = currentILMMode(project); | ||
| final OperationMode previousMode = currentILMMode(previousProject); | ||
| if (currentMode == OperationMode.RUNNING && previousMode != OperationMode.RUNNING) { | ||
| maybeRunAsyncActions(event.state().projectState(project.id())); | ||
| } | ||
| } | ||
| } | ||
| | ||
| // if we're the master, then process deleted indices and trigger policies | ||
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any suggestions for improved assertions are welcome. Adding a
Thread.sleephere would increase our confidence, but still wouldn't make any guarantee.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we write a test that wraps the
Clientpassed toIndexLifecycleServicewith a wrapper that asserts that we never perform some particular ILM action?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added a test to
IndexLifecycleRunnerTestsin 370d966 that verifies the action isn't executed when ILM is stopped. Let me know if that's what you had in mind. I chose for testingIndexLifecycleRunner#maybeRunAsyncActioninstead ofIndexLifecycleService#maybeRunAsyncAction, as the latter is only used by APIs and the former is also used internally by ILM.