From 2024ee7f78aee69e7b7a4d7371c31cb593b8f3ee Mon Sep 17 00:00:00 2001
From: Michael Derynck
Date: Tue, 19 Nov 2024 15:23:15 -0700
Subject: [PATCH] feat: Auto retry escalation on failed audit (#5265)

# What this PR does

Automatically retries escalation when alert groups fail auditing. This has the same effect as the `continue_escalation` command, without any of the extra arguments.

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not required)
- [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes.
---
 .../alerts/tasks/check_escalation_finished.py |  41 +++++-
 .../test_check_escalation_finished_task.py    | 123 ++++++++++++++++++
 engine/settings/base.py                       |   2 +
 3 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/engine/apps/alerts/tasks/check_escalation_finished.py b/engine/apps/alerts/tasks/check_escalation_finished.py
index 9f3fb62d..8ae6d814 100644
--- a/engine/apps/alerts/tasks/check_escalation_finished.py
+++ b/engine/apps/alerts/tasks/check_escalation_finished.py
@@ -2,7 +2,9 @@ import datetime
 import typing
 
 import requests
+from celery import uuid as celery_uuid
 from django.conf import settings
+from django.core.cache import cache
 from django.db.models import Avg, F, Max, Q
 from django.utils import timezone
 
@@ -174,6 +176,42 @@ def check_personal_notifications_task() -> None:
     task_logger.info(f"personal_notifications_triggered={triggered} personal_notifications_completed={completed}")
 
 
+# Retries an alert group that has failed auditing if it is within the retry limit
+# Returns whether an alert group escalation is being retried
+def retry_audited_alert_group(alert_group) -> bool:
+    cache_key = f"audited-alert-group-retry-count-{alert_group.id}"
+    retry_count = cache.get(cache_key, 0)
+    if retry_count >= settings.AUDITED_ALERT_GROUP_MAX_RETRIES:
+        task_logger.info(f"Not retrying audited alert_group={alert_group.id} max retries exceeded.")
+        return False
+
+    if alert_group.is_silenced_for_period:
+        task_logger.info(f"Not retrying audited alert_group={alert_group.id} as it is silenced.")
+        return False
+
+    if not alert_group.escalation_snapshot:
+        task_logger.info(f"Not retrying audited alert_group={alert_group.id} as its escalation snapshot is empty.")
+        return False
+
+    retry_count += 1
+    cache.set(cache_key, retry_count, timeout=3600)
+
+    task_id = celery_uuid()
+    alert_group.active_escalation_id = task_id
+    alert_group.save(update_fields=["active_escalation_id"])
+
+    from apps.alerts.tasks import escalate_alert_group
+
+    escalate_alert_group.apply_async(
+        args=(alert_group.pk,),
+        immutable=True,
+        task_id=task_id,
+        eta=alert_group.next_step_eta,
+    )
+    task_logger.info(f"Retrying audited alert_group={alert_group.id} attempt={retry_count}")
+    return True
+
+
 @shared_log_exception_on_failure_task
 def check_escalation_finished_task() -> None:
     """
@@ -221,7 +259,8 @@ def check_escalation_finished_task() -> None:
         try:
             audit_alert_group_escalation(alert_group)
         except AlertGroupEscalationPolicyExecutionAuditException:
-            alert_group_ids_that_failed_audit.append(str(alert_group.id))
+            if not retry_audited_alert_group(alert_group):
+                alert_group_ids_that_failed_audit.append(str(alert_group.id))
 
     failed_alert_groups_count = len(alert_group_ids_that_failed_audit)
     success_ratio = (
diff --git a/engine/apps/alerts/tests/test_check_escalation_finished_task.py b/engine/apps/alerts/tests/test_check_escalation_finished_task.py
index 8aa5cbbd..229fabff 100644
--- a/engine/apps/alerts/tests/test_check_escalation_finished_task.py
+++ b/engine/apps/alerts/tests/test_check_escalation_finished_task.py
@@ -6,12 +6,14 @@ from django.test import override_settings
 from django.utils import timezone
 
 from apps.alerts.models import EscalationPolicy
+from apps.alerts.tasks import escalate_alert_group
 from apps.alerts.tasks.check_escalation_finished import (
     AlertGroupEscalationPolicyExecutionAuditException,
     audit_alert_group_escalation,
     check_alert_group_personal_notifications_task,
     check_escalation_finished_task,
     check_personal_notifications_task,
+    retry_audited_alert_group,
     send_alert_group_escalation_auditor_task_heartbeat,
 )
 from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord
@@ -580,3 +582,124 @@ def test_check_escalation_finished_task_calls_audit_alert_group_personal_notific
     check_personal_notifications_task()
 
     assert "personal_notifications_triggered=6 personal_notifications_completed=2" in caplog.text
+
+
+@patch("apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation")
+@patch("apps.alerts.tasks.check_escalation_finished.retry_audited_alert_group")
+@patch("apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat")
+@pytest.mark.django_db
+def test_invoke_retry_from_check_escalation_finished_task(
+    mocked_send_alert_group_escalation_auditor_task_heartbeat,
+    mocked_retry_audited_alert_group,
+    mocked_audit_alert_group_escalation,
+    make_organization_and_user,
+    make_alert_receive_channel,
+    make_alert_group_that_started_at_specific_date,
+):
+    organization, _ = make_organization_and_user()
+    alert_receive_channel = make_alert_receive_channel(organization)
+
+    # Pass audit (should not be counted in final message or go to retry function)
+    alert_group1 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=1)
+    # Fail audit but not retrying (should be counted in final message)
+    alert_group2 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=5)
+    # Fail audit but retry (should not be counted in final message)
+    alert_group3 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=10)
+
+    def _mocked_audit_alert_group_escalation(alert_group):
+        if alert_group.id == alert_group2.id or alert_group.id == alert_group3.id:
+            raise AlertGroupEscalationPolicyExecutionAuditException(f"{alert_group.id} failed audit")
+
+    mocked_audit_alert_group_escalation.side_effect = _mocked_audit_alert_group_escalation
+
+    def _mocked_retry_audited_alert_group(alert_group):
+        if alert_group.id == alert_group2.id:
+            return False
+        return True
+
+    mocked_retry_audited_alert_group.side_effect = _mocked_retry_audited_alert_group
+
+    with pytest.raises(AlertGroupEscalationPolicyExecutionAuditException) as exc:
+        check_escalation_finished_task()
+
+    error_msg = str(exc.value)
+
+    assert "The following alert group id(s) failed auditing:" in error_msg
+    assert str(alert_group1.id) not in error_msg
+    assert str(alert_group2.id) in error_msg
+    assert str(alert_group3.id) not in error_msg
+
+    assert mocked_retry_audited_alert_group.call_count == 2
+    mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_not_called()
+
+
+@patch.object(escalate_alert_group, "apply_async")
+@override_settings(AUDITED_ALERT_GROUP_MAX_RETRIES=1)
+@pytest.mark.django_db
+def test_retry_audited_alert_group(
+    mocked_escalate_alert_group,
+    make_organization_and_user,
+    make_user_for_organization,
+    make_user_notification_policy,
+    make_escalation_chain,
+    make_escalation_policy,
+    make_channel_filter,
+    make_alert_receive_channel,
+    make_alert_group_that_started_at_specific_date,
+):
+    organization, user = make_organization_and_user()
+    make_user_notification_policy(
+        user=user,
+        step=UserNotificationPolicy.Step.NOTIFY,
+        notify_by=UserNotificationPolicy.NotificationChannel.SLACK,
+    )
+
+    alert_receive_channel = make_alert_receive_channel(organization)
+    escalation_chain = make_escalation_chain(organization)
+    channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain)
+    notify_to_multiple_users_step = make_escalation_policy(
+        escalation_chain=channel_filter.escalation_chain,
+        escalation_policy_step=EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS,
+    )
+    notify_to_multiple_users_step.notify_to_users_queue.set([user])
+
+    alert_group1 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter)
+    alert_group1.raw_escalation_snapshot = alert_group1.build_raw_escalation_snapshot()
+    alert_group1.raw_escalation_snapshot["last_active_escalation_policy_order"] = 1
+    alert_group1.save()
+
+    # Retry should occur
+    is_retrying = retry_audited_alert_group(alert_group1)
+    assert is_retrying
+    mocked_escalate_alert_group.assert_called()
+    mocked_escalate_alert_group.reset_mock()
+
+    # No retry as attempts == max
+    is_retrying = retry_audited_alert_group(alert_group1)
+    assert not is_retrying
+    mocked_escalate_alert_group.assert_not_called()
+    mocked_escalate_alert_group.reset_mock()
+
+    alert_group2 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter)
+    # No retry because no escalation snapshot
+    is_retrying = retry_audited_alert_group(alert_group2)
+    assert not is_retrying
+    mocked_escalate_alert_group.assert_not_called()
+    mocked_escalate_alert_group.reset_mock()
+
+    alert_group3 = make_alert_group_that_started_at_specific_date(
+        alert_receive_channel,
+        channel_filter=channel_filter,
+        silenced=True,
+        silenced_at=timezone.now(),
+        silenced_by_user=user,
+        silenced_until=timezone.now() + timezone.timedelta(hours=1),
+    )
+    alert_group3.raw_escalation_snapshot = alert_group1.build_raw_escalation_snapshot()
+    alert_group3.raw_escalation_snapshot["last_active_escalation_policy_order"] = 1
+    alert_group3.save()
+
+    # No retry because alert group silenced
+    is_retrying = retry_audited_alert_group(alert_group3)
+    assert not is_retrying
+    mocked_escalate_alert_group.assert_not_called()
diff --git a/engine/settings/base.py b/engine/settings/base.py
index 25ef7dc1..2b3cc971 100644
--- a/engine/settings/base.py
+++ b/engine/settings/base.py
@@ -988,3 +988,5 @@ CUSTOM_RATELIMITS = getenv_custom_ratelimit("CUSTOM_RATELIMITS", default={})
 SYNC_V2_MAX_TASKS = getenv_integer("SYNC_V2_MAX_TASKS", 6)
 SYNC_V2_PERIOD_SECONDS = getenv_integer("SYNC_V2_PERIOD_SECONDS", 240)
 SYNC_V2_BATCH_SIZE = getenv_integer("SYNC_V2_BATCH_SIZE", 500)
+
+AUDITED_ALERT_GROUP_MAX_RETRIES = getenv_integer("AUDITED_ALERT_GROUP_MAX_RETRIES", 1)
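
For reviewers, a minimal, self-contained sketch of the cache-backed retry budget that `retry_audited_alert_group` implements. `InMemoryCache` and `should_retry` are illustrative stand-ins (not OnCall or Django APIs); only the cache key format, the one-hour timeout, and the default limit of 1 are taken from the diff above.

```python
import time

MAX_RETRIES = 1          # mirrors the default of AUDITED_ALERT_GROUP_MAX_RETRIES
RETRY_COUNT_TTL = 3600   # mirrors the one-hour timeout passed to cache.set() in the diff


class InMemoryCache:
    """Tiny stand-in for django.core.cache: get/set with per-key expiry."""

    def __init__(self):
        self._data = {}

    def get(self, key, default=None):
        value, expires_at = self._data.get(key, (default, None))
        if expires_at is not None and expires_at < time.monotonic():
            return default
        return value

    def set(self, key, value, timeout):
        self._data[key] = (value, time.monotonic() + timeout)


cache = InMemoryCache()


def should_retry(alert_group_id: int) -> bool:
    """Bump the per-alert-group counter and report whether another retry is allowed."""
    cache_key = f"audited-alert-group-retry-count-{alert_group_id}"
    retry_count = cache.get(cache_key, 0)
    if retry_count >= MAX_RETRIES:
        return False  # same exit as "max retries exceeded" in retry_audited_alert_group
    cache.set(cache_key, retry_count + 1, timeout=RETRY_COUNT_TTL)
    return True


if __name__ == "__main__":
    print(should_retry(42))  # True: the first failed audit triggers a retry
    print(should_retry(42))  # False: the default budget of 1 retry is already spent
```

In the task itself, a permitted retry additionally re-queues `escalate_alert_group` under a fresh Celery task id saved to `active_escalation_id`, with `eta=alert_group.next_step_eta`, so the re-scheduled escalation becomes the alert group's active one and continues from its snapshot.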