From 3caf4ab4e0c71b0566023151dd948c4da50814a0 Mon Sep 17 00:00:00 2001 From: Julia Bardi <90178898+juliaElastic@users.noreply.github.com> Date: Thu, 8 Dec 2022 09:14:33 +0100 Subject: [PATCH] [Fleet] cancel tasks when 3rd retry failed (#147190) ## Summary Related to https://github.com/elastic/kibana/issues/144161 Found that on a bulk update tags task failure, the task didn't stop after 3 retries (should be over in less than a minute), the retries kept happening for 2 hours. This change removes the retry task if 3 retries are reached. Also testing in cloud deployment to see if the tags error can be reproduced with this fix. I could reproduce the reported error locally, and saw that it goes away with this fix. To verify: - Add at least 50k agents with the `create_agents` script in kibana repo - open Kibana, select the 50k agents, and open Actions / Add tags - Try this in a few seconds: add 2 new tags, and remove one of them - Wait about 30s, the agents should reflect the changes - Check the logs to see that the tasks are removed after 3rd retry is reached or successful. - Check that there are no more running tasks. Any running task can be found in Kibana Console by running this query: `GET .kibana_task_manager/_search?q=task.taskType:"fleet:update_agent_tags:retry"` Locally simulated an error to test that the retry (and check) task is removed: ``` [2022-12-07T15:52:16.415+01:00][ERROR][plugins.fleet] Retry #3 of task fleet:update_agent_tags:retry:848984ab-c11d-4ebe-8d1f-606143dd656b failed: failing task [2022-12-07T15:52:16.416+01:00][WARN ][plugins.fleet] Stopping after 3rd retry. 
Error: failing task [2022-12-07T15:52:16.416+01:00][INFO ][plugins.fleet] Removing task fleet:update_agent_tags:retry:check:848984ab-c11d-4ebe-8d1f-606143dd656b [2022-12-07T15:52:16.416+01:00][INFO ][plugins.fleet] Removing task fleet:update_agent_tags:retry:848984ab-c11d-4ebe-8d1f-606143dd656b ``` (cherry picked from commit 431c32b894077fc5910380252086442083734fce) --- .../plugins/fleet/server/services/agents/action_runner.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/x-pack/plugins/fleet/server/services/agents/action_runner.ts b/x-pack/plugins/fleet/server/services/agents/action_runner.ts index 41f9a44099b5f7..18af331980238d 100644 --- a/x-pack/plugins/fleet/server/services/agents/action_runner.ts +++ b/x-pack/plugins/fleet/server/services/agents/action_runner.ts @@ -113,6 +113,13 @@ export abstract class ActionRunner { if (this.retryParams.retryCount === 3) { const errorMessage = 'Stopping after 3rd retry. Error: ' + error.message; appContextService.getLogger().warn(errorMessage); + + // clean up tasks after 3rd retry reached + await Promise.all([ + this.bulkActionsResolver!.removeIfExists(this.checkTaskId!), + this.bulkActionsResolver!.removeIfExists(this.retryParams.taskId!), + ]); + return; } } else {