feat: Auto retry escalation on failed audit (#5265)
# What this PR does
Automatically retries escalation when alert groups fail auditing. This
has the same effect as the `continue_escalation` command without any of
the extra arguments.
## Checklist
- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
show up in the autogenerated release notes.
This commit is contained in:
parent
1bd30b3cf8
commit
2024ee7f78
3 changed files with 165 additions and 1 deletions
|
|
@ -2,7 +2,9 @@ import datetime
|
|||
import typing
|
||||
|
||||
import requests
|
||||
from celery import uuid as celery_uuid
|
||||
from django.conf import settings
|
||||
from django.core.cache import cache
|
||||
from django.db.models import Avg, F, Max, Q
|
||||
from django.utils import timezone
|
||||
|
||||
|
|
@ -174,6 +176,42 @@ def check_personal_notifications_task() -> None:
|
|||
task_logger.info(f"personal_notifications_triggered={triggered} personal_notifications_completed={completed}")
|
||||
|
||||
|
||||
def retry_audited_alert_group(alert_group) -> bool:
    """Schedule a fresh escalation attempt for an alert group that failed auditing.

    Returns True when a retry was scheduled, False when the group is not
    eligible: its retry budget is exhausted, it is currently silenced, or it
    has no escalation snapshot to resume from.
    """
    cache_key = f"audited-alert-group-retry-count-{alert_group.id}"
    attempts_so_far = cache.get(cache_key, 0)

    # Guard clauses — each ineligible state logs the reason and bails out.
    if attempts_so_far >= settings.AUDITED_ALERT_GROUP_MAX_RETRIES:
        task_logger.info(f"Not retrying audited alert_group={alert_group.id} max retries exceeded.")
        return False

    if alert_group.is_silenced_for_period:
        task_logger.info(f"Not retrying audited alert_group={alert_group.id} as it is silenced.")
        return False

    if not alert_group.escalation_snapshot:
        task_logger.info(f"Not retrying audited alert_group={alert_group.id} as its escalation snapshot is empty.")
        return False

    # Count this attempt before scheduling; the counter expires after an hour.
    attempts_so_far += 1
    cache.set(cache_key, attempts_so_far, timeout=3600)

    # Point the group at the new escalation task id so any stale task is superseded.
    new_task_id = celery_uuid()
    alert_group.active_escalation_id = new_task_id
    alert_group.save(update_fields=["active_escalation_id"])

    # Imported here rather than at module top — presumably to avoid a circular
    # import between the tasks modules; confirm before hoisting.
    from apps.alerts.tasks import escalate_alert_group

    escalate_alert_group.apply_async(
        args=(alert_group.pk,),
        immutable=True,
        task_id=new_task_id,
        eta=alert_group.next_step_eta,
    )

    task_logger.info(f"Retrying audited alert_group={alert_group.id} attempt={attempts_so_far}")
    return True
|
||||
|
||||
|
||||
@shared_log_exception_on_failure_task
|
||||
def check_escalation_finished_task() -> None:
|
||||
"""
|
||||
|
|
@ -221,7 +259,8 @@ def check_escalation_finished_task() -> None:
|
|||
try:
|
||||
audit_alert_group_escalation(alert_group)
|
||||
except AlertGroupEscalationPolicyExecutionAuditException:
|
||||
alert_group_ids_that_failed_audit.append(str(alert_group.id))
|
||||
if not retry_audited_alert_group(alert_group):
|
||||
alert_group_ids_that_failed_audit.append(str(alert_group.id))
|
||||
|
||||
failed_alert_groups_count = len(alert_group_ids_that_failed_audit)
|
||||
success_ratio = (
|
||||
|
|
|
|||
|
|
@ -6,12 +6,14 @@ from django.test import override_settings
|
|||
from django.utils import timezone
|
||||
|
||||
from apps.alerts.models import EscalationPolicy
|
||||
from apps.alerts.tasks import escalate_alert_group
|
||||
from apps.alerts.tasks.check_escalation_finished import (
|
||||
AlertGroupEscalationPolicyExecutionAuditException,
|
||||
audit_alert_group_escalation,
|
||||
check_alert_group_personal_notifications_task,
|
||||
check_escalation_finished_task,
|
||||
check_personal_notifications_task,
|
||||
retry_audited_alert_group,
|
||||
send_alert_group_escalation_auditor_task_heartbeat,
|
||||
)
|
||||
from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord
|
||||
|
|
@ -580,3 +582,124 @@ def test_check_escalation_finished_task_calls_audit_alert_group_personal_notific
|
|||
check_personal_notifications_task()
|
||||
|
||||
assert "personal_notifications_triggered=6 personal_notifications_completed=2" in caplog.text
|
||||
|
||||
|
||||
@patch("apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation")
|
||||
@patch("apps.alerts.tasks.check_escalation_finished.retry_audited_alert_group")
|
||||
@patch("apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat")
|
||||
@pytest.mark.django_db
|
||||
def test_invoke_retry_from_check_escalation_finished_task(
|
||||
mocked_send_alert_group_escalation_auditor_task_heartbeat,
|
||||
mocked_retry_audited_alert_group,
|
||||
mocked_audit_alert_group_escalation,
|
||||
make_organization_and_user,
|
||||
make_alert_receive_channel,
|
||||
make_alert_group_that_started_at_specific_date,
|
||||
):
|
||||
organization, _ = make_organization_and_user()
|
||||
alert_receive_channel = make_alert_receive_channel(organization)
|
||||
|
||||
# Pass audit (should not be counted in final message or go to retry function)
|
||||
alert_group1 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=1)
|
||||
# Fail audit but not retrying (should be counted in final message)
|
||||
alert_group2 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=5)
|
||||
# Fail audit but retry (should not be counted in final message)
|
||||
alert_group3 = make_alert_group_that_started_at_specific_date(alert_receive_channel, received_delta=10)
|
||||
|
||||
def _mocked_audit_alert_group_escalation(alert_group):
|
||||
if alert_group.id == alert_group2.id or alert_group.id == alert_group3.id:
|
||||
raise AlertGroupEscalationPolicyExecutionAuditException(f"{alert_group2.id} failed audit")
|
||||
|
||||
mocked_audit_alert_group_escalation.side_effect = _mocked_audit_alert_group_escalation
|
||||
|
||||
def _mocked_retry_audited_alert_group(alert_group):
|
||||
if alert_group.id == alert_group2.id:
|
||||
return False
|
||||
return True
|
||||
|
||||
mocked_retry_audited_alert_group.side_effect = _mocked_retry_audited_alert_group
|
||||
|
||||
with pytest.raises(AlertGroupEscalationPolicyExecutionAuditException) as exc:
|
||||
check_escalation_finished_task()
|
||||
|
||||
error_msg = str(exc.value)
|
||||
|
||||
assert "The following alert group id(s) failed auditing:" in error_msg
|
||||
assert str(alert_group1.id) not in error_msg
|
||||
assert str(alert_group2.id) in error_msg
|
||||
assert str(alert_group3.id) not in error_msg
|
||||
|
||||
assert mocked_retry_audited_alert_group.call_count == 2
|
||||
mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_not_called()
|
||||
|
||||
|
||||
@patch.object(escalate_alert_group, "apply_async")
@override_settings(AUDITED_ALERT_GROUP_MAX_RETRIES=1)
@pytest.mark.django_db
def test_retry_audited_alert_group(
    mocked_escalate_alert_group,
    make_organization_and_user,
    make_user_for_organization,
    make_user_notification_policy,
    make_escalation_chain,
    make_escalation_policy,
    make_channel_filter,
    make_alert_receive_channel,
    make_alert_group_that_started_at_specific_date,
):
    """retry_audited_alert_group retries once, then refuses on: max retries, missing snapshot, silence."""
    organization, user = make_organization_and_user()
    make_user_notification_policy(
        user=user,
        step=UserNotificationPolicy.Step.NOTIFY,
        notify_by=UserNotificationPolicy.NotificationChannel.SLACK,
    )

    alert_receive_channel = make_alert_receive_channel(organization)
    escalation_chain = make_escalation_chain(organization)
    channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain)
    notify_to_multiple_users_step = make_escalation_policy(
        escalation_chain=channel_filter.escalation_chain,
        escalation_policy_step=EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS,
    )
    notify_to_multiple_users_step.notify_to_users_queue.set([user])

    alert_group1 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter)
    alert_group1.raw_escalation_snapshot = alert_group1.build_raw_escalation_snapshot()
    alert_group1.raw_escalation_snapshot["last_active_escalation_policy_order"] = 1
    alert_group1.save()

    # Retry should occur
    is_retrying = retry_audited_alert_group(alert_group1)
    assert is_retrying
    mocked_escalate_alert_group.assert_called()
    mocked_escalate_alert_group.reset_mock()

    # No retry as attempts == max
    is_retrying = retry_audited_alert_group(alert_group1)
    assert not is_retrying
    mocked_escalate_alert_group.assert_not_called()
    mocked_escalate_alert_group.reset_mock()

    alert_group2 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter)
    # No retry because no escalation snapshot
    is_retrying = retry_audited_alert_group(alert_group2)
    assert not is_retrying
    mocked_escalate_alert_group.assert_not_called()
    mocked_escalate_alert_group.reset_mock()

    alert_group3 = make_alert_group_that_started_at_specific_date(
        alert_receive_channel,
        channel_filter=channel_filter,
        silenced=True,
        silenced_at=timezone.now(),
        silenced_by_user=user,
        # BUGFIX: `now` was undefined (NameError) — use timezone.now()
        silenced_until=(timezone.now() + timezone.timedelta(hours=1)),
    )
    # BUGFIX: build alert_group3's own snapshot (was built from alert_group1)
    alert_group3.raw_escalation_snapshot = alert_group3.build_raw_escalation_snapshot()
    alert_group3.raw_escalation_snapshot["last_active_escalation_policy_order"] = 1
    alert_group3.save()

    # No retry because alert group silenced
    is_retrying = retry_audited_alert_group(alert_group3)
    assert not is_retrying
    mocked_escalate_alert_group.assert_not_called()
|
||||
|
|
|
|||
|
|
@ -988,3 +988,5 @@ CUSTOM_RATELIMITS = getenv_custom_ratelimit("CUSTOM_RATELIMITS", default={})
|
|||
SYNC_V2_MAX_TASKS = getenv_integer("SYNC_V2_MAX_TASKS", 6)
SYNC_V2_PERIOD_SECONDS = getenv_integer("SYNC_V2_PERIOD_SECONDS", 240)
SYNC_V2_BATCH_SIZE = getenv_integer("SYNC_V2_BATCH_SIZE", 500)

# Maximum number of times the escalation auditor will automatically retry an
# alert group whose audit failed (see retry_audited_alert_group).
AUDITED_ALERT_GROUP_MAX_RETRIES = getenv_integer("AUDITED_ALERT_GROUP_MAX_RETRIES", 1)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue