Add notifications success ratio log to auditor (#3312)

# What this PR does

This PR adds a log entry to the escalation auditor that reports the alert group notifications success ratio over the last 48 hours.

## Which issue(s) this PR fixes

## Checklist

- [ ] Unit, integration, and e2e (if applicable) tests updated
- [ ] Documentation added (or `pr:no public docs` PR label added if not
required)
- [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not
required)
This commit is contained in:
Ildar Iskhakov 2023-11-10 16:39:13 +08:00 committed by GitHub
parent e41ccb9d9b
commit 784c5ee7c1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 4 deletions

View file

@ -99,9 +99,14 @@ def check_escalation_finished_task() -> None:
now = timezone.now() - datetime.timedelta(minutes=5)
two_days_ago = now - datetime.timedelta(days=2)
alert_groups = AlertGroup.objects.using(
get_random_readonly_database_key_if_present_otherwise_default()
).filter_active(started_at__range=(two_days_ago, now))
# Total alert groups over last 2 days
alert_groups = AlertGroup.objects.using(get_random_readonly_database_key_if_present_otherwise_default()).filter(
started_at__range=(two_days_ago, now),
)
total_alert_groups_count = alert_groups.count()
# Filter alert groups with active escalations (that could fail)
alert_groups = alert_groups.filter_active()
task_logger.info(
f"There are {len(alert_groups)} alert group(s) to audit"
@ -117,6 +122,14 @@ def check_escalation_finished_task() -> None:
except AlertGroupEscalationPolicyExecutionAuditException:
alert_group_ids_that_failed_audit.append(str(alert_group.id))
failed_alert_groups_count = len(alert_group_ids_that_failed_audit)
success_ratio = (
100
if total_alert_groups_count == 0
else (total_alert_groups_count - failed_alert_groups_count) / total_alert_groups_count * 100
)
task_logger.info(f"Alert group notifications success ratio: {success_ratio:.2f}")
if alert_group_ids_that_failed_audit:
msg = f"The following alert group id(s) failed auditing: {', '.join(alert_group_ids_that_failed_audit)}"

View file

@ -341,12 +341,13 @@ def test_check_escalation_finished_task_simply_calls_heartbeat_when_no_alert_gro
@patch("apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation")
@patch("apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat")
@pytest.mark.django_db
def test_check_escalation_finished_task_calls_audit_alert_group_escalation_for_every_alert_group_even_if_one_fails(
def test_check_escalation_finished_task_calls_audit_alert_group_escalation_for_every_alert_group_even_if_one_fails_and_returns_success_ratio(
mocked_send_alert_group_escalation_auditor_task_heartbeat,
mocked_audit_alert_group_escalation,
make_organization_and_user,
make_alert_receive_channel,
make_alert_group_that_started_at_specific_date,
caplog,
):
organization, _ = make_organization_and_user()
alert_receive_channel = make_alert_receive_channel(organization)
@ -370,6 +371,8 @@ def test_check_escalation_finished_task_calls_audit_alert_group_escalation_for_e
assert str(alert_group1.id) in error_msg
assert str(alert_group2.id) in error_msg
assert "Alert group notifications success ratio: 33.33" in caplog.text
mocked_audit_alert_group_escalation.assert_any_call(alert_group1)
mocked_audit_alert_group_escalation.assert_any_call(alert_group2)
mocked_audit_alert_group_escalation.assert_any_call(alert_group3)