diff --git a/engine/apps/alerts/tasks/check_escalation_finished.py b/engine/apps/alerts/tasks/check_escalation_finished.py index 30ae392e..fee4e141 100644 --- a/engine/apps/alerts/tasks/check_escalation_finished.py +++ b/engine/apps/alerts/tasks/check_escalation_finished.py @@ -99,9 +99,14 @@ def check_escalation_finished_task() -> None: now = timezone.now() - datetime.timedelta(minutes=5) two_days_ago = now - datetime.timedelta(days=2) - alert_groups = AlertGroup.objects.using( - get_random_readonly_database_key_if_present_otherwise_default() - ).filter_active(started_at__range=(two_days_ago, now)) + # Total alert groups over last 2 days + alert_groups = AlertGroup.objects.using(get_random_readonly_database_key_if_present_otherwise_default()).filter( + started_at__range=(two_days_ago, now), + ) + total_alert_groups_count = alert_groups.count() + + # Filter alert groups with active escalations (that could fail) + alert_groups = alert_groups.filter_active() task_logger.info( f"There are {len(alert_groups)} alert group(s) to audit" @@ -117,6 +122,14 @@ def check_escalation_finished_task() -> None: except AlertGroupEscalationPolicyExecutionAuditException: alert_group_ids_that_failed_audit.append(str(alert_group.id)) + failed_alert_groups_count = len(alert_group_ids_that_failed_audit) + success_ratio = ( + 100 + if total_alert_groups_count == 0 + else (total_alert_groups_count - failed_alert_groups_count) / total_alert_groups_count * 100 + ) + task_logger.info(f"Alert group notifications success ratio: {success_ratio:.2f}") + if alert_group_ids_that_failed_audit: msg = f"The following alert group id(s) failed auditing: {', '.join(alert_group_ids_that_failed_audit)}" diff --git a/engine/apps/alerts/tests/test_check_escalation_finished_task.py b/engine/apps/alerts/tests/test_check_escalation_finished_task.py index f6a9aeeb..0e584a4c 100644 --- a/engine/apps/alerts/tests/test_check_escalation_finished_task.py +++ b/engine/apps/alerts/tests/test_check_escalation_finished_task.py @@ -341,12 +341,13 @@ def test_check_escalation_finished_task_simply_calls_heartbeat_when_no_alert_gro @patch("apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation") @patch("apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat") @pytest.mark.django_db -def test_check_escalation_finished_task_calls_audit_alert_group_escalation_for_every_alert_group_even_if_one_fails( +def test_check_escalation_finished_task_calls_audit_alert_group_escalation_for_every_alert_group_even_if_one_fails_and_returns_success_ratio( mocked_send_alert_group_escalation_auditor_task_heartbeat, mocked_audit_alert_group_escalation, make_organization_and_user, make_alert_receive_channel, make_alert_group_that_started_at_specific_date, + caplog, ): organization, _ = make_organization_and_user() alert_receive_channel = make_alert_receive_channel(organization) @@ -370,6 +371,8 @@ def test_check_escalation_finished_task_calls_audit_alert_group_escalation_for_e assert str(alert_group1.id) in error_msg assert str(alert_group2.id) in error_msg + assert "Alert group notifications success ratio: 33.33" in caplog.text + mocked_audit_alert_group_escalation.assert_any_call(alert_group1) mocked_audit_alert_group_escalation.assert_any_call(alert_group2) mocked_audit_alert_group_escalation.assert_any_call(alert_group3)