From f68b9dd004b3262512d2a0f7ded37cb60ad1afdb Mon Sep 17 00:00:00 2001 From: Matias Bordese Date: Mon, 18 Dec 2023 13:13:18 -0300 Subject: [PATCH] Update auditor to check personal notifications (#3563) Requires https://github.com/grafana/oncall/pull/3557 Related to https://github.com/grafana/oncall-private/issues/2347 --- .../alerts/tasks/check_escalation_finished.py | 32 ++++- engine/apps/alerts/tasks/notify_user.py | 18 ++- .../test_check_escalation_finished_task.py | 125 ++++++++++++++++++ engine/apps/alerts/tests/test_notify_user.py | 45 +++++++ engine/settings/celery_task_routes.py | 1 + 5 files changed, 219 insertions(+), 2 deletions(-) diff --git a/engine/apps/alerts/tasks/check_escalation_finished.py b/engine/apps/alerts/tasks/check_escalation_finished.py index 88d1ae4f..0204aaea 100644 --- a/engine/apps/alerts/tasks/check_escalation_finished.py +++ b/engine/apps/alerts/tasks/check_escalation_finished.py @@ -4,7 +4,7 @@ import typing import requests from celery import shared_task from django.conf import settings -from django.db.models import Avg, F, Max +from django.db.models import Avg, F, Max, Q from django.utils import timezone from apps.alerts.tasks.task_logger import task_logger @@ -82,9 +82,39 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None: f"{base_msg}'s escalation snapshot has {num_of_executed_escalation_policy_snapshots} executed escalation policies" ) + check_personal_notifications_task.apply_async((alert_group_id,)) + task_logger.info(f"{base_msg} passed the audit checks") +@shared_task +def check_personal_notifications_task(alert_group_id) -> None: + # Check personal notifications are completed + # triggered (< 5min ago) == failed + success + from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord + + triggered = UserNotificationPolicyLogRecord.objects.filter( + alert_group_id=alert_group_id, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED, + notification_step=UserNotificationPolicy.Step.NOTIFY, + created_at__lte=timezone.now() - timezone.timedelta(minutes=5), + ).count() + completed = UserNotificationPolicyLogRecord.objects.filter( + Q(type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED) + | Q(type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_SUCCESS), + alert_group_id=alert_group_id, + notification_step=UserNotificationPolicy.Step.NOTIFY, + ).count() + + base_msg = f"Alert group {alert_group_id}" + delta = triggered - completed + if delta > 0: + # TODO: when success notifications are setup for every backend, raise exception here + task_logger.info(f"{base_msg} has ({delta}) uncompleted personal notifications") + else: + task_logger.info(f"{base_msg} personal notifications check passed") + + @shared_task def check_escalation_finished_task() -> None: """ diff --git a/engine/apps/alerts/tasks/notify_user.py b/engine/apps/alerts/tasks/notify_user.py index 4a125b64..7578ba53 100644 --- a/engine/apps/alerts/tasks/notify_user.py +++ b/engine/apps/alerts/tasks/notify_user.py @@ -313,7 +313,7 @@ def perform_notification(log_record_pk): ).save() return - if alert_group.notify_in_slack_enabled is True and not log_record.slack_prevent_posting: + if alert_group.notify_in_slack_enabled is True: # we cannot notify users in Slack if their team does not have Slack integration if alert_group.channel.organization.slack_team_identity is None: task_logger.debug( @@ -332,6 +332,22 @@ def perform_notification(log_record_pk): ).save() return + if log_record.slack_prevent_posting: + task_logger.debug( + f"send_slack_notification for alert_group {alert_group.pk} failed because slack posting is disabled." + ) + UserNotificationPolicyLogRecord( + author=user, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED, + notification_policy=notification_policy, + reason="Prevented from posting in Slack", + alert_group=alert_group, + notification_step=notification_policy.step, + notification_channel=notification_channel, + notification_error_code=UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_POSTING_TO_SLACK_IS_DISABLED, + ).save() + return + retry_timeout_hours = 1 if alert_group.slack_message: alert_group.slack_message.send_slack_notification(user, alert_group, notification_policy) diff --git a/engine/apps/alerts/tests/test_check_escalation_finished_task.py b/engine/apps/alerts/tests/test_check_escalation_finished_task.py index d635db10..8a8c7b85 100644 --- a/engine/apps/alerts/tests/test_check_escalation_finished_task.py +++ b/engine/apps/alerts/tests/test_check_escalation_finished_task.py @@ -5,12 +5,15 @@ import requests from django.test import override_settings from django.utils import timezone +from apps.alerts.models import EscalationPolicy from apps.alerts.tasks.check_escalation_finished import ( AlertGroupEscalationPolicyExecutionAuditException, audit_alert_group_escalation, check_escalation_finished_task, + check_personal_notifications_task, send_alert_group_escalation_auditor_task_heartbeat, ) +from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord MOCKED_HEARTBEAT_URL = "https://hello.com/lsdjjkf" @@ -389,3 +392,125 @@ def test_check_escalation_finished_task_calls_audit_alert_group_escalation_for_e mocked_audit_alert_group_escalation.assert_any_call(alert_group3) mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_not_called() + + +@patch("apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat") +@pytest.mark.django_db +def test_check_escalation_finished_task_calls_audit_alert_group_personal_notifications( + mocked_send_alert_group_escalation_auditor_task_heartbeat, + make_organization_and_user, + make_user_notification_policy, + make_escalation_chain, + make_escalation_policy, + make_channel_filter, + make_alert_receive_channel, + make_alert_group_that_started_at_specific_date, + make_user_notification_policy_log_record, + caplog, +): + organization, user = make_organization_and_user() + user_notification_policy = make_user_notification_policy( + user=user, + step=UserNotificationPolicy.Step.NOTIFY, + notify_by=UserNotificationPolicy.NotificationChannel.SLACK, + ) + + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = make_escalation_chain(organization) + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + notify_to_multiple_users_step = make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS, + ) + notify_to_multiple_users_step.notify_to_users_queue.set([user]) + + alert_group1 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter) + alert_group2 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter) + alert_group3 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter) + alert_group4 = make_alert_group_that_started_at_specific_date(alert_receive_channel, channel_filter=channel_filter) + alert_groups = [alert_group1, alert_group2, alert_group3, alert_group4] + for alert_group in alert_groups: + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + alert_group.raw_escalation_snapshot["last_active_escalation_policy_order"] = 1 + alert_group.save() + + now = timezone.now() + # alert_group1: wait, notify user, notification successful + make_user_notification_policy_log_record( + author=user, + alert_group=alert_group1, + notification_policy=user_notification_policy, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED, + notification_step=UserNotificationPolicy.Step.WAIT, + ) + make_user_notification_policy_log_record( + author=user, + alert_group=alert_group1, + notification_policy=user_notification_policy, + notification_step=UserNotificationPolicy.Step.NOTIFY, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED, + ) + make_user_notification_policy_log_record( + author=user, + alert_group=alert_group1, + notification_policy=user_notification_policy, + notification_step=UserNotificationPolicy.Step.NOTIFY, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_SUCCESS, + ) + # records created > 5 mins ago + alert_group1.personal_log_records.update(created_at=now - timezone.timedelta(minutes=7)) + + # alert_group2: notify user, notification failed + make_user_notification_policy_log_record( + author=user, + alert_group=alert_group2, + notification_policy=user_notification_policy, + notification_step=UserNotificationPolicy.Step.NOTIFY, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED, + ) + make_user_notification_policy_log_record( + author=user, + alert_group=alert_group2, + notification_policy=user_notification_policy, + notification_step=UserNotificationPolicy.Step.NOTIFY, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED, + ) + # records created > 5 mins ago + alert_group2.personal_log_records.update(created_at=now - timezone.timedelta(minutes=7)) + + # alert_group3: notify user, missing completion + make_user_notification_policy_log_record( + author=user, + alert_group=alert_group3, + notification_policy=user_notification_policy, + notification_step=UserNotificationPolicy.Step.NOTIFY, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED, + ) + # record created > 5 mins ago + alert_group3.personal_log_records.update(created_at=now - timezone.timedelta(minutes=7)) + + # alert_group4: notify user created > 5 mins ago, missing completion + make_user_notification_policy_log_record( + author=user, + created_at=now - timezone.timedelta(minutes=3), + alert_group=alert_group3, + notification_policy=user_notification_policy, + notification_step=UserNotificationPolicy.Step.NOTIFY, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED, + ) + # record created < 5 mins ago + alert_group4.personal_log_records.update(created_at=now - timezone.timedelta(minutes=2)) + + # trigger task + with patch("apps.alerts.tasks.check_escalation_finished.check_personal_notifications_task") as mock_check_notif: + check_escalation_finished_task() + + for alert_group in alert_groups: + mock_check_notif.apply_async.assert_any_call((alert_group.id,)) + check_personal_notifications_task(alert_group.id) + if alert_group == alert_group3: + assert f"Alert group {alert_group3.id} has (1) uncompleted personal notifications" in caplog.text + else: + assert f"Alert group {alert_group.id} personal notifications check passed" in caplog.text + + mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_called() diff --git a/engine/apps/alerts/tests/test_notify_user.py b/engine/apps/alerts/tests/test_notify_user.py index 98d8bc2b..4643f0c2 100644 --- a/engine/apps/alerts/tests/test_notify_user.py +++ b/engine/apps/alerts/tests/test_notify_user.py @@ -240,3 +240,48 @@ def test_perform_notification_reason_to_skip_escalation_in_slack( else: mocked_send_slack_notification.assert_called() assert last_log_record.type != UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED + + +@pytest.mark.django_db +def test_perform_notification_slack_prevent_posting( + make_organization, + make_slack_team_identity, + make_user, + make_user_notification_policy, + make_alert_receive_channel, + make_alert_group, + make_user_notification_policy_log_record, + make_slack_message, +): + organization = make_organization() + slack_team_identity = make_slack_team_identity() + organization.slack_team_identity = slack_team_identity + organization.save() + user = make_user(organization=organization) + user_notification_policy = make_user_notification_policy( + user=user, + step=UserNotificationPolicy.Step.NOTIFY, + notify_by=UserNotificationPolicy.NotificationChannel.SLACK, + ) + alert_receive_channel = make_alert_receive_channel(organization=organization) + alert_group = make_alert_group(alert_receive_channel=alert_receive_channel) + log_record = make_user_notification_policy_log_record( + author=user, + alert_group=alert_group, + notification_policy=user_notification_policy, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED, + slack_prevent_posting=True, + ) + make_slack_message(alert_group=alert_group, channel_id="test_channel_id", slack_id="test_slack_id") + + with patch.object(SlackMessage, "send_slack_notification") as mocked_send_slack_notification: + perform_notification(log_record.pk) + + mocked_send_slack_notification.assert_not_called() + last_log_record = UserNotificationPolicyLogRecord.objects.last() + assert last_log_record.type == UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED + assert last_log_record.reason == "Prevented from posting in Slack" + assert ( + last_log_record.notification_error_code + == UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_POSTING_TO_SLACK_IS_DISABLED + ) diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index a1f45836..2252733e 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -122,6 +122,7 @@ CELERY_TASK_ROUTES = { "apps.alerts.tasks.alert_group_web_title_cache.update_web_title_cache_for_alert_receive_channel": {"queue": "long"}, "apps.alerts.tasks.alert_group_web_title_cache.update_web_title_cache": {"queue": "long"}, "apps.alerts.tasks.check_escalation_finished.check_escalation_finished_task": {"queue": "long"}, + "apps.alerts.tasks.check_escalation_finished.check_personal_notifications_task": {"queue": "long"}, "apps.grafana_plugin.tasks.sync.cleanup_organization_async": {"queue": "long"}, "apps.grafana_plugin.tasks.sync.start_cleanup_deleted_organizations": {"queue": "long"}, "apps.grafana_plugin.tasks.sync.start_sync_organizations": {"queue": "long"},