feat: Auto retry escalation on failed audit (#5265)

# What this PR does
Automatically retries escalation when alert groups fail auditing. This has the same effect as the `continue_escalation` command, without any of its extra arguments.
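
Mechanically, a retry pre-generates a fresh Celery task id, stamps it on the alert group as `active_escalation_id` (so the auditor can recognize the new run), and re-schedules the escalation task at `next_step_eta`. A hedged sketch of that scheduling pattern with a generic Celery app (`escalate` and `schedule_retry` are stand-ins, not OnCall's actual task names):

```python
# Sketch of the re-scheduling pattern this PR uses; generic Celery app,
# stand-in task name, and `alert_group` assumed to be a Django model instance.
from celery import Celery
from celery import uuid as celery_uuid

app = Celery("sketch", broker="memory://")

@app.task
def escalate(alert_group_pk):
    ...  # escalation steps would run here

def schedule_retry(alert_group):
    task_id = celery_uuid()  # pre-generate the id...
    alert_group.active_escalation_id = task_id  # ...so it is recorded before the task exists
    alert_group.save(update_fields=["active_escalation_id"])
    escalate.apply_async(
        args=(alert_group.pk,),
        immutable=True,  # do not let chained results mutate the args
        task_id=task_id,
        eta=alert_group.next_step_eta,  # resume at the next scheduled escalation step
    )
```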

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not required)
- [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes.
Michael Derynck authored on 2024-11-19 15:23:15 -07:00 (committed via GitHub)
commit 2024ee7f78 (parent 1bd30b3cf8)
3 changed files with 165 additions and 1 deletion


```diff
@@ -2,7 +2,9 @@ import datetime
 import typing

 import requests
+from celery import uuid as celery_uuid
+from django.conf import settings
 from django.core.cache import cache
 from django.db.models import Avg, F, Max, Q
 from django.utils import timezone
```
```diff
@@ -174,6 +176,42 @@ def check_personal_notifications_task() -> None:
     task_logger.info(f"personal_notifications_triggered={triggered} personal_notifications_completed={completed}")


+# Retries an alert group that has failed auditing if it is within the retry limit
+# Returns whether an alert group escalation is being retried
+def retry_audited_alert_group(alert_group) -> bool:
+    cache_key = f"audited-alert-group-retry-count-{alert_group.id}"
+    retry_count = cache.get(cache_key, 0)
+
+    if retry_count >= settings.AUDITED_ALERT_GROUP_MAX_RETRIES:
+        task_logger.info(f"Not retrying audited alert_group={alert_group.id} max retries exceeded.")
+        return False
+
+    if alert_group.is_silenced_for_period:
+        task_logger.info(f"Not retrying audited alert_group={alert_group.id} as it is silenced.")
+        return False
+
+    if not alert_group.escalation_snapshot:
+        task_logger.info(f"Not retrying audited alert_group={alert_group.id} as its escalation snapshot is empty.")
+        return False
+
+    retry_count += 1
+    cache.set(cache_key, retry_count, timeout=3600)
+
+    task_id = celery_uuid()
+    alert_group.active_escalation_id = task_id
+    alert_group.save(update_fields=["active_escalation_id"])
+
+    from apps.alerts.tasks import escalate_alert_group
+
+    escalate_alert_group.apply_async(
+        args=(alert_group.pk,),
+        immutable=True,
+        task_id=task_id,
+        eta=alert_group.next_step_eta,
+    )
+    task_logger.info(f"Retrying audited alert_group={alert_group.id} attempt={retry_count}")
+    return True
+
+
 @shared_log_exception_on_failure_task
 def check_escalation_finished_task() -> None:
     """
```
```diff
@@ -221,7 +259,8 @@ def check_escalation_finished_task() -> None:
         try:
             audit_alert_group_escalation(alert_group)
         except AlertGroupEscalationPolicyExecutionAuditException:
-            alert_group_ids_that_failed_audit.append(str(alert_group.id))
+            if not retry_audited_alert_group(alert_group):
+                alert_group_ids_that_failed_audit.append(str(alert_group.id))

     failed_alert_groups_count = len(alert_group_ids_that_failed_audit)
     success_ratio = (
```
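
The hunk is truncated at the `success_ratio` assignment. The behavioral change is that a group whose escalation was successfully re-scheduled no longer counts toward `failed_alert_groups_count`, and therefore no longer drags this ratio down. The exact expression is not shown in the diff; a hypothetical sketch of its implied shape:

```python
# Hypothetical sketch only: the real success_ratio expression is cut off in the
# hunk above, so both the name and the formula here are assumptions.
def escalation_success_ratio(total_alert_groups: int, failed_count: int) -> float:
    if total_alert_groups == 0:
        return 100.0
    return (total_alert_groups - failed_count) / total_alert_groups * 100
```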


```diff
@@ -6,12 +6,14 @@ from django.test import override_settings
 from django.utils import timezone

 from apps.alerts.models import EscalationPolicy
+from apps.alerts.tasks import escalate_alert_group
 from apps.alerts.tasks.check_escalation_finished import (
     AlertGroupEscalationPolicyExecutionAuditException,
     audit_alert_group_escalation,
     check_alert_group_personal_notifications_task,
     check_escalation_finished_task,
     check_personal_notifications_task,
+    retry_audited_alert_group,
     send_alert_group_escalation_auditor_task_heartbeat,
 )
 from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord
```
```diff
@@ -580,3 +582,124 @@ def test_check_escalation_finished_task_calls_audit_alert_group_personal_notific
     check_personal_notifications_task()

     assert "personal_notifications_triggered=6 personal_notifications_completed=2" in caplog.text
+
+
+@patch("apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation")
+@patch("apps.alerts.tasks.check_escalation_finished.retry_audited_alert_group")
+@patch("apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat")
+@pytest.mark.django_db
+def test_invoke_retry_from_check_escalation_finished_task(
+    mocked_send_alert_group_escalation_auditor_task_heartbeat,
+    mocked_retry_audited_alert_group,
+    mocked_audit_alert_group_escalation,
+    make_organization_and_user,
+    make_alert_receive_channel,
+    make_alert_group_that_started_at_specific_date,
+):
+    organization, _ = make_organization_and_user()
+    alert_receive_channel = make_alert_receive_channel(organization)
+
+    # Pass audit (should not be counted in final message or go to retry function)
+    alert_group1 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=1)
+    # Fail audit but not retrying (should be counted in final message)
+    alert_group2 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=5)
+    # Fail audit but retry (should not be counted in final message)
+    alert_group3 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=10)
+
+    def _mocked_audit_alert_group_escalation(alert_group):
+        if alert_group.id == alert_group2.id or alert_group.id == alert_group3.id:
+            raise AlertGroupEscalationPolicyExecutionAuditException(f"{alert_group2.id} failed audit")
+
+    mocked_audit_alert_group_escalation.side_effect = _mocked_audit_alert_group_escalation
+
+    def _mocked_retry_audited_alert_group(alert_group):
+        if alert_group.id == alert_group2.id:
+            return False
+        return True
+
+    mocked_retry_audited_alert_group.side_effect = _mocked_retry_audited_alert_group
+
+    with pytest.raises(AlertGroupEscalationPolicyExecutionAuditException) as exc:
+        check_escalation_finished_task()
+
+    error_msg = str(exc.value)
+
+    assert "The following alert group id(s) failed auditing:" in error_msg
+    assert str(alert_group1.id) not in error_msg
+    assert str(alert_group2.id) in error_msg
+    assert str(alert_group3.id) not in error_msg
+    assert mocked_retry_audited_alert_group.call_count == 2
+    mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_not_called()
+
+
+@patch.object(escalate_alert_group, "apply_async")
+@override_settings(AUDITED_ALERT_GROUP_MAX_RETRIES=1)
+@pytest.mark.django_db
+def test_retry_audited_alert_group(
+    mocked_escalate_alert_group,
+    make_organization_and_user,
+    make_user_for_organization,
+    make_user_notification_policy,
+    make_escalation_chain,
+    make_escalation_policy,
+    make_channel_filter,
+    make_alert_receive_channel,
+    make_alert_group_that_started_at_specific_date,
+):
+    organization, user = make_organization_and_user()
+    make_user_notification_policy(
+        user=user,
+        step=UserNotificationPolicy.Step.NOTIFY,
+        notify_by=UserNotificationPolicy.NotificationChannel.SLACK,
+    )
+    alert_receive_channel = make_alert_receive_channel(organization)
+    escalation_chain = make_escalation_chain(organization)
+    channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain)
+    notify_to_multiple_users_step = make_escalation_policy(
+        escalation_chain=channel_filter.escalation_chain,
+        escalation_policy_step=EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS,
+    )
+    notify_to_multiple_users_step.notify_to_users_queue.set([user])
+
+    alert_group1 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter)
+    alert_group1.raw_escalation_snapshot = alert_group1.build_raw_escalation_snapshot()
+    alert_group1.raw_escalation_snapshot["last_active_escalation_policy_order"] = 1
+    alert_group1.save()
+
+    # Retry should occur
+    is_retrying = retry_audited_alert_group(alert_group1)
+    assert is_retrying
+    mocked_escalate_alert_group.assert_called()
+    mocked_escalate_alert_group.reset_mock()
+
+    # No retry as attempts == max
+    is_retrying = retry_audited_alert_group(alert_group1)
+    assert not is_retrying
+    mocked_escalate_alert_group.assert_not_called()
+    mocked_escalate_alert_group.reset_mock()
+
+    alert_group2 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter)
+
+    # No retry because no escalation snapshot
+    is_retrying = retry_audited_alert_group(alert_group2)
+    assert not is_retrying
+    mocked_escalate_alert_group.assert_not_called()
+    mocked_escalate_alert_group.reset_mock()
+
+    alert_group3 = make_alert_group_that_started_at_specific_date(
+        alert_receive_channel,
+        channel_filter=channel_filter,
+        silenced=True,
+        silenced_at=timezone.now(),
+        silenced_by_user=user,
+        silenced_until=(timezone.now() + timezone.timedelta(hours=1)),
+    )
+    alert_group3.raw_escalation_snapshot = alert_group1.build_raw_escalation_snapshot()
+    alert_group3.raw_escalation_snapshot["last_active_escalation_policy_order"] = 1
+    alert_group3.save()
+
+    # No retry because alert group silenced
+    is_retrying = retry_audited_alert_group(alert_group3)
+    assert not is_retrying
+    mocked_escalate_alert_group.assert_not_called()
```


```diff
@@ -988,3 +988,5 @@ CUSTOM_RATELIMITS = getenv_custom_ratelimit("CUSTOM_RATELIMITS", default={})
 SYNC_V2_MAX_TASKS = getenv_integer("SYNC_V2_MAX_TASKS", 6)
 SYNC_V2_PERIOD_SECONDS = getenv_integer("SYNC_V2_PERIOD_SECONDS", 240)
 SYNC_V2_BATCH_SIZE = getenv_integer("SYNC_V2_BATCH_SIZE", 500)
+
+AUDITED_ALERT_GROUP_MAX_RETRIES = getenv_integer("AUDITED_ALERT_GROUP_MAX_RETRIES", 1)
```
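
For reference, `getenv_integer` is the repo's settings helper for integer environment variables; a hedged sketch of its assumed behavior, plus how an operator would raise the retry budget:

```python
# Hedged sketch of the getenv_integer helper used above; the real helper lives
# in the project's settings utilities, and only its assumed behavior is shown.
import os

def getenv_integer(variable_name: str, default: int) -> int:
    value = os.environ.get(variable_name)
    return default if value is None else int(value)

# Operators raise the per-alert-group retry budget via the environment, e.g.
# AUDITED_ALERT_GROUP_MAX_RETRIES=3; unset, it falls back to a single retry.
print(getenv_integer("AUDITED_ALERT_GROUP_MAX_RETRIES", 1))
```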