From 2024ee7f78aee69e7b7a4d7371c31cb593b8f3ee Mon Sep 17 00:00:00 2001
From: Michael Derynck
Date: Tue, 19 Nov 2024 15:23:15 -0700
Subject: [PATCH] feat: Auto retry escalation on failed audit (#5265)

# What this PR does

Automatically retries escalation when alert groups fail auditing. This has the same effect as the `continue_escalation` command, without any of the extra arguments.

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not required)
- [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes.
---
 .../alerts/tasks/check_escalation_finished.py |  41 +++++-
 .../test_check_escalation_finished_task.py    | 123 ++++++++++++++++++
 engine/settings/base.py                       |   2 +
 3 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/engine/apps/alerts/tasks/check_escalation_finished.py b/engine/apps/alerts/tasks/check_escalation_finished.py
index 9f3fb62d..8ae6d814 100644
--- a/engine/apps/alerts/tasks/check_escalation_finished.py
+++ b/engine/apps/alerts/tasks/check_escalation_finished.py
@@ -2,7 +2,9 @@ import datetime
 import typing
 
 import requests
+from celery import uuid as celery_uuid
 from django.conf import settings
+from django.core.cache import cache
 from django.db.models import Avg, F, Max, Q
 from django.utils import timezone
 
@@ -174,6 +176,42 @@ def check_personal_notifications_task() -> None:
     task_logger.info(f"personal_notifications_triggered={triggered} personal_notifications_completed={completed}")
 
 
+# Retries an alert group that has failed auditing if it is within the retry limit
+# Returns whether an alert group escalation is being retried
+def retry_audited_alert_group(alert_group) -> bool:
+    cache_key = f"audited-alert-group-retry-count-{alert_group.id}"
+    retry_count = cache.get(cache_key, 0)
+    if retry_count >= settings.AUDITED_ALERT_GROUP_MAX_RETRIES:
+        task_logger.info(f"Not retrying audited alert_group={alert_group.id} max retries exceeded.")
+        return False
+
+    if alert_group.is_silenced_for_period:
+        task_logger.info(f"Not retrying audited alert_group={alert_group.id} as it is silenced.")
+        return False
+
+    if not alert_group.escalation_snapshot:
+        task_logger.info(f"Not retrying audited alert_group={alert_group.id} as its escalation snapshot is empty.")
+        return False
+
+    retry_count += 1
+    cache.set(cache_key, retry_count, timeout=3600)
+
+    task_id = celery_uuid()
+    alert_group.active_escalation_id = task_id
+    alert_group.save(update_fields=["active_escalation_id"])
+
+    from apps.alerts.tasks import escalate_alert_group
+
+    escalate_alert_group.apply_async(
+        args=(alert_group.pk,),
+        immutable=True,
+        task_id=task_id,
+        eta=alert_group.next_step_eta,
+    )
+    task_logger.info(f"Retrying audited alert_group={alert_group.id} attempt={retry_count}")
+    return True
+
+
 @shared_log_exception_on_failure_task
 def check_escalation_finished_task() -> None:
     """
@@ -221,7 +259,8 @@ def check_escalation_finished_task() -> None:
         try:
             audit_alert_group_escalation(alert_group)
         except AlertGroupEscalationPolicyExecutionAuditException:
-            alert_group_ids_that_failed_audit.append(str(alert_group.id))
+            if not retry_audited_alert_group(alert_group):
+                alert_group_ids_that_failed_audit.append(str(alert_group.id))
 
     failed_alert_groups_count = len(alert_group_ids_that_failed_audit)
     success_ratio = (
diff --git a/engine/apps/alerts/tests/test_check_escalation_finished_task.py b/engine/apps/alerts/tests/test_check_escalation_finished_task.py
index 8aa5cbbd..229fabff 100644
--- a/engine/apps/alerts/tests/test_check_escalation_finished_task.py
+++ b/engine/apps/alerts/tests/test_check_escalation_finished_task.py
@@ -6,12 +6,14 @@ from django.test import override_settings
 from django.utils import timezone
 
 from apps.alerts.models import EscalationPolicy
+from apps.alerts.tasks import escalate_alert_group
 from apps.alerts.tasks.check_escalation_finished import (
     AlertGroupEscalationPolicyExecutionAuditException,
     audit_alert_group_escalation,
     check_alert_group_personal_notifications_task,
     check_escalation_finished_task,
     check_personal_notifications_task,
+    retry_audited_alert_group,
     send_alert_group_escalation_auditor_task_heartbeat,
 )
 from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord
@@ -580,3 +582,124 @@ def test_check_escalation_finished_task_calls_audit_alert_group_personal_notific
     check_personal_notifications_task()
 
     assert "personal_notifications_triggered=6 personal_notifications_completed=2" in caplog.text
+
+
+@patch("apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation")
+@patch("apps.alerts.tasks.check_escalation_finished.retry_audited_alert_group")
+@patch("apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat")
+@pytest.mark.django_db
+def test_invoke_retry_from_check_escalation_finished_task(
+    mocked_send_alert_group_escalation_auditor_task_heartbeat,
+    mocked_retry_audited_alert_group,
+    mocked_audit_alert_group_escalation,
+    make_organization_and_user,
+    make_alert_receive_channel,
+    make_alert_group_that_started_at_specific_date,
+):
+    organization, _ = make_organization_and_user()
+    alert_receive_channel = make_alert_receive_channel(organization)
+
+    # Pass audit (should not be counted in final message or go to retry function)
+    alert_group1 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=1)
+    # Fail audit but not retrying (should be counted in final message)
+    alert_group2 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=5)
+    # Fail audit but retry (should not be counted in final message)
+    alert_group3 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=10)
+
+    def _mocked_audit_alert_group_escalation(alert_group):
+        if alert_group.id == alert_group2.id or alert_group.id == alert_group3.id:
+            raise AlertGroupEscalationPolicyExecutionAuditException(f"{alert_group.id} failed audit")
+
+    mocked_audit_alert_group_escalation.side_effect = _mocked_audit_alert_group_escalation
+
+    def _mocked_retry_audited_alert_group(alert_group):
+        if alert_group.id == alert_group2.id:
+            return False
+        return True
+
+    mocked_retry_audited_alert_group.side_effect = _mocked_retry_audited_alert_group
+
+    with pytest.raises(AlertGroupEscalationPolicyExecutionAuditException) as exc:
+        check_escalation_finished_task()
+
+    error_msg = str(exc.value)
+
+    assert "The following alert group id(s) failed auditing:" in error_msg
+    assert str(alert_group1.id) not in error_msg
+    assert str(alert_group2.id) in error_msg
+    assert str(alert_group3.id) not in error_msg
+
+    assert mocked_retry_audited_alert_group.call_count == 2
+    mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_not_called()
+
+
+@patch.object(escalate_alert_group, "apply_async")
+@override_settings(AUDITED_ALERT_GROUP_MAX_RETRIES=1)
+@pytest.mark.django_db
+def test_retry_audited_alert_group(
+    mocked_escalate_alert_group,
+    make_organization_and_user,
+    make_user_for_organization,
+    make_user_notification_policy,
+    make_escalation_chain,
+    make_escalation_policy,
+    make_channel_filter,
+    make_alert_receive_channel,
+    make_alert_group_that_started_at_specific_date,
+):
+    organization, user = make_organization_and_user()
+    make_user_notification_policy(
+        user=user,
+        step=UserNotificationPolicy.Step.NOTIFY,
+        notify_by=UserNotificationPolicy.NotificationChannel.SLACK,
+    )
+
+    alert_receive_channel = make_alert_receive_channel(organization)
+    escalation_chain = make_escalation_chain(organization)
+    channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain)
+    notify_to_multiple_users_step = make_escalation_policy(
+        escalation_chain=channel_filter.escalation_chain,
+        escalation_policy_step=EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS,
+    )
+    notify_to_multiple_users_step.notify_to_users_queue.set([user])
+
+    alert_group1 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter)
+    alert_group1.raw_escalation_snapshot = alert_group1.build_raw_escalation_snapshot()
+    alert_group1.raw_escalation_snapshot["last_active_escalation_policy_order"] = 1
+    alert_group1.save()
+
+    # Retry should occur
+    is_retrying = retry_audited_alert_group(alert_group1)
+    assert is_retrying
+    mocked_escalate_alert_group.assert_called()
+    mocked_escalate_alert_group.reset_mock()
+
+    # No retry as attempts == max
+    is_retrying = retry_audited_alert_group(alert_group1)
+    assert not is_retrying
+    mocked_escalate_alert_group.assert_not_called()
+    mocked_escalate_alert_group.reset_mock()
+
+    alert_group2 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter)
+    # No retry because no escalation snapshot
+    is_retrying = retry_audited_alert_group(alert_group2)
+    assert not is_retrying
+    mocked_escalate_alert_group.assert_not_called()
+    mocked_escalate_alert_group.reset_mock()
+
+    alert_group3 = make_alert_group_that_started_at_specific_date(
+        alert_receive_channel,
+        channel_filter=channel_filter,
+        silenced=True,
+        silenced_at=timezone.now(),
+        silenced_by_user=user,
+        silenced_until=timezone.now() + timezone.timedelta(hours=1),
+    )
+    alert_group3.raw_escalation_snapshot = alert_group1.build_raw_escalation_snapshot()
+    alert_group3.raw_escalation_snapshot["last_active_escalation_policy_order"] = 1
+    alert_group3.save()
+
+    # No retry because alert group silenced
+    is_retrying = retry_audited_alert_group(alert_group3)
+    assert not is_retrying
+    mocked_escalate_alert_group.assert_not_called()
diff --git a/engine/settings/base.py b/engine/settings/base.py
index 25ef7dc1..2b3cc971 100644
--- a/engine/settings/base.py
+++ b/engine/settings/base.py
@@ -988,3 +988,5 @@ CUSTOM_RATELIMITS = getenv_custom_ratelimit("CUSTOM_RATELIMITS", default={})
 SYNC_V2_MAX_TASKS = getenv_integer("SYNC_V2_MAX_TASKS", 6)
 SYNC_V2_PERIOD_SECONDS = getenv_integer("SYNC_V2_PERIOD_SECONDS", 240)
 SYNC_V2_BATCH_SIZE = getenv_integer("SYNC_V2_BATCH_SIZE", 500)
+
+AUDITED_ALERT_GROUP_MAX_RETRIES = getenv_integer("AUDITED_ALERT_GROUP_MAX_RETRIES", 1)
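
For reviewers, a minimal, self-contained sketch of the cache-backed retry budget that `retry_audited_alert_group` implements. `InMemoryCache` and `should_retry` are illustrative stand-ins (not OnCall or Django APIs); only the cache key format, the one-hour timeout, and the default limit of 1 are taken from the diff above.

```python
import time

MAX_RETRIES = 1          # mirrors the default of AUDITED_ALERT_GROUP_MAX_RETRIES
RETRY_COUNT_TTL = 3600   # mirrors the one-hour timeout passed to cache.set() in the diff


class InMemoryCache:
    """Tiny stand-in for django.core.cache: get/set with per-key expiry."""

    def __init__(self):
        self._data = {}

    def get(self, key, default=None):
        value, expires_at = self._data.get(key, (default, None))
        if expires_at is not None and expires_at < time.monotonic():
            return default
        return value

    def set(self, key, value, timeout):
        self._data[key] = (value, time.monotonic() + timeout)


cache = InMemoryCache()


def should_retry(alert_group_id: int) -> bool:
    """Bump the per-alert-group counter and report whether another retry is allowed."""
    cache_key = f"audited-alert-group-retry-count-{alert_group_id}"
    retry_count = cache.get(cache_key, 0)
    if retry_count >= MAX_RETRIES:
        return False  # same exit as "max retries exceeded" in retry_audited_alert_group
    cache.set(cache_key, retry_count + 1, timeout=RETRY_COUNT_TTL)
    return True


if __name__ == "__main__":
    print(should_retry(42))  # True: the first failed audit triggers a retry
    print(should_retry(42))  # False: the default budget of 1 retry is already spent
```

In the task itself, a permitted retry additionally re-queues `escalate_alert_group` under a fresh Celery task id saved to `active_escalation_id`, with `eta=alert_group.next_step_eta`, so the re-scheduled escalation becomes the alert group's active one and continues from its snapshot.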