From 4e2e7e0a15c89870fc5a61f2b70dfde953a225fb Mon Sep 17 00:00:00 2001 From: Matias Bordese Date: Wed, 10 Jan 2024 15:54:27 -0300 Subject: [PATCH] Add task logging personal notifications triggered/completed counts (#3638) Related to https://github.com/grafana/oncall-private/issues/2347 --- .../alerts/tasks/check_escalation_finished.py | 52 ++++++++++++++++++- .../test_check_escalation_finished_task.py | 12 ++++- engine/settings/base.py | 5 ++ engine/settings/celery_task_routes.py | 1 + 4 files changed, 66 insertions(+), 4 deletions(-) diff --git a/engine/apps/alerts/tasks/check_escalation_finished.py b/engine/apps/alerts/tasks/check_escalation_finished.py index 0204aaea..f859bbb5 100644 --- a/engine/apps/alerts/tasks/check_escalation_finished.py +++ b/engine/apps/alerts/tasks/check_escalation_finished.py @@ -82,13 +82,13 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None: f"{base_msg}'s escalation snapshot has {num_of_executed_escalation_policy_snapshots} executed escalation policies" ) - check_personal_notifications_task.apply_async((alert_group_id,)) + check_alert_group_personal_notifications_task.apply_async((alert_group_id,)) task_logger.info(f"{base_msg} passed the audit checks") @shared_task -def check_personal_notifications_task(alert_group_id) -> None: +def check_alert_group_personal_notifications_task(alert_group_id) -> None: # Check personal notifications are completed # triggered (< 5min ago) == failed + success from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord @@ -115,6 +115,54 @@ def check_personal_notifications_task(alert_group_id) -> None: task_logger.info(f"{base_msg} personal notifications check passed") +@shared_task +def check_personal_notifications_task() -> None: + """ + This task checks that triggered personal notifications are completed. + It will log the triggered/completed values to be used as metrics. + + Attention: don't retry this task, the idea is to be alerted of failures + """ + from apps.alerts.models import AlertGroup + from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord + + # use readonly database if available + readonly_db = get_random_readonly_database_key_if_present_otherwise_default() + + now = timezone.now() + + # consider alert groups from the last 2 days + alert_groups = AlertGroup.objects.using(readonly_db).filter( + started_at__range=(now - timezone.timedelta(days=2), now), + ) + + # review notifications triggered in the last 20-minute window + # (task should run periodically about every 15 minutes) + since = now - timezone.timedelta(minutes=20) + + log_records_qs = UserNotificationPolicyLogRecord.objects.using(readonly_db) + # personal notifications triggered in the given window for those alert groups + triggered = log_records_qs.filter( + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED, + notification_step=UserNotificationPolicy.Step.NOTIFY, + created_at__gte=since, + created_at__lte=now, + alert_group__in=alert_groups, + ).count() + + # personal notifications completed in the given window for those alert groups + completed = log_records_qs.filter( + Q(type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED) + | Q(type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_SUCCESS), + notification_step=UserNotificationPolicy.Step.NOTIFY, + created_at__gt=since, + created_at__lte=now, + alert_group__in=alert_groups, + ).count() + + task_logger.info(f"personal_notifications_triggered={triggered} personal_notifications_completed={completed}") + + @shared_task def check_escalation_finished_task() -> None: """ diff --git a/engine/apps/alerts/tests/test_check_escalation_finished_task.py b/engine/apps/alerts/tests/test_check_escalation_finished_task.py index 8a8c7b85..3b57061d 100644 --- a/engine/apps/alerts/tests/test_check_escalation_finished_task.py +++ b/engine/apps/alerts/tests/test_check_escalation_finished_task.py @@ -9,6 +9,7 @@ from apps.alerts.models import EscalationPolicy from apps.alerts.tasks.check_escalation_finished import ( AlertGroupEscalationPolicyExecutionAuditException, audit_alert_group_escalation, + check_alert_group_personal_notifications_task, check_escalation_finished_task, check_personal_notifications_task, send_alert_group_escalation_auditor_task_heartbeat, @@ -502,15 +503,22 @@ def test_check_escalation_finished_task_calls_audit_alert_group_personal_notific alert_group4.personal_log_records.update(created_at=now - timezone.timedelta(minutes=2)) # trigger task - with patch("apps.alerts.tasks.check_escalation_finished.check_personal_notifications_task") as mock_check_notif: + with patch( + "apps.alerts.tasks.check_escalation_finished.check_alert_group_personal_notifications_task" + ) as mock_check_notif: check_escalation_finished_task() for alert_group in alert_groups: mock_check_notif.apply_async.assert_any_call((alert_group.id,)) - check_personal_notifications_task(alert_group.id) + check_alert_group_personal_notifications_task(alert_group.id) if alert_group == alert_group3: assert f"Alert group {alert_group3.id} has (1) uncompleted personal notifications" in caplog.text else: assert f"Alert group {alert_group.id} personal notifications check passed" in caplog.text mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_called() + + # also trigger the general personal notification checker + check_personal_notifications_task() + + assert "personal_notifications_triggered=4 personal_notifications_completed=2" in caplog.text diff --git a/engine/settings/base.py b/engine/settings/base.py index bc8935ac..a80c1166 100644 --- a/engine/settings/base.py +++ b/engine/settings/base.py @@ -586,6 +586,11 @@ if ESCALATION_AUDITOR_ENABLED: ), "args": (), } + CELERY_BEAT_SCHEDULE["check_personal_notifications"] = { + "task": "apps.alerts.tasks.check_escalation_finished.check_personal_notifications_task", + "schedule": crontab(minute="*/15"), # every 15 minutes + "args": (), + } INTERNAL_IPS = ["127.0.0.1"] diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index 142a0ae6..c44c3687 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -121,6 +121,7 @@ CELERY_TASK_ROUTES = { "apps.alerts.tasks.alert_group_web_title_cache.update_web_title_cache_for_alert_receive_channel": {"queue": "long"}, "apps.alerts.tasks.alert_group_web_title_cache.update_web_title_cache": {"queue": "long"}, "apps.alerts.tasks.check_escalation_finished.check_escalation_finished_task": {"queue": "long"}, + "apps.alerts.tasks.check_escalation_finished.check_alert_group_personal_notifications_task": {"queue": "long"}, "apps.alerts.tasks.check_escalation_finished.check_personal_notifications_task": {"queue": "long"}, "apps.grafana_plugin.tasks.sync.cleanup_organization_async": {"queue": "long"}, "apps.grafana_plugin.tasks.sync.start_cleanup_deleted_organizations": {"queue": "long"},