From d5b43b043979a1d7d8bf1c9429ea1353d2db3365 Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Mon, 17 Jul 2023 16:04:53 +0200 Subject: [PATCH] minor improvements for check_escalation_finished celery task (#2554) # What this PR does This PR adds some enhancements to the `check_escalation_finished` celery task. It short-circuits auditing of an alert group if it does not have an escalation chain associated with it. In `EscalationSnapshotMixin.start_escalation_if_needed` we will not set `raw_escalation_snapshot` ([here](https://github.com/grafana/oncall/blob/dev/engine/apps/alerts/escalation_snapshot/escalation_snapshot_mixin.py#L262)) in this case: ```python3 def start_escalation_if_needed(self, countdown=START_ESCALATION_DELAY, eta=None): """ :type self:AlertGroup """ AlertGroup = apps.get_model("alerts", "AlertGroup") is_on_maintenace_or_debug_mode = self.channel.maintenance_mode is not None if ( self.is_restricted or is_on_maintenace_or_debug_mode or self.pause_escalation or not self.escalation_chain_exists <-- here ): logger.debug( f"Not escalating alert group w/ pk: {self.pk}\n" f"is_restricted: {self.is_restricted}\n" f"is_on_maintenace_or_debug_mode: {is_on_maintenace_or_debug_mode}\n" f"pause_escalation: {self.pause_escalation}\n" f"escalation_chain_exists: {self.escalation_chain_exists}" ) return logger.debug(f"Start escalation for alert group with pk: {self.pk}") # take raw escalation snapshot from db if escalation is paused raw_escalation_snapshot = ( self.build_raw_escalation_snapshot() if not self.pause_escalation else self.raw_escalation_snapshot ) task_id = celery_uuid() AlertGroup.all_objects.filter(pk=self.pk,).update( active_escalation_id=task_id, is_escalation_finished=False, raw_escalation_snapshot=raw_escalation_snapshot, ) ``` `EscalationSnapshotMixin.escalation_chain_exists` is as such: ```python3 @property def escalation_chain_exists(self) -> bool: if self.pause_escalation: return False elif not self.channel_filter: return False return self.channel_filter.escalation_chain is not None ``` ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [ ] Documentation added (or `pr:no public docs` PR label added if not required) (N/A) - [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) (N/A) --- .../alerts/tasks/check_escalation_finished.py | 30 ++++++++++++++----- .../test_check_escalation_finished_task.py | 21 +++++++++++++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/engine/apps/alerts/tasks/check_escalation_finished.py b/engine/apps/alerts/tasks/check_escalation_finished.py index 343696c9..b0cfee9e 100644 --- a/engine/apps/alerts/tasks/check_escalation_finished.py +++ b/engine/apps/alerts/tasks/check_escalation_finished.py @@ -34,10 +34,19 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None: alert_group_id = alert_group.id base_msg = f"Alert group {alert_group_id}" - if not escalation_snapshot: - raise AlertGroupEscalationPolicyExecutionAuditException( - f"{base_msg} does not have an escalation snapshot associated with it, this should never occur" + if not alert_group.escalation_chain_exists: + task_logger.info( + f"{base_msg} does not have an escalation chain associated with it, and therefore it is expected " + "that it will not have an escalation snapshot, skipping further validation" ) + return + + if not escalation_snapshot: + msg = f"{base_msg} does not have an escalation snapshot associated with it, this should never occur" + + task_logger.warning(msg) + raise AlertGroupEscalationPolicyExecutionAuditException(msg) + task_logger.info(f"{base_msg} has an escalation snapshot associated with it, auditing if it executed properly") escalation_policies_snapshots = escalation_snapshot.escalation_policies_snapshots @@ -118,8 +127,11 @@ def check_escalation_finished_task() -> None: started_at__range=(two_days_ago, now), ) - if not alert_groups.exists(): - task_logger.info("There are no alert groups to audit, everything is good :)") + task_logger.info( + f"There are {len(alert_groups)} alert group(s) to audit" + if alert_groups.exists() + else "There are no alert groups to audit, everything is good :)" + ) alert_group_ids_that_failed_audit: typing.List[str] = [] @@ -130,8 +142,10 @@ def check_escalation_finished_task() -> None: alert_group_ids_that_failed_audit.append(str(alert_group.id)) if alert_group_ids_that_failed_audit: - raise AlertGroupEscalationPolicyExecutionAuditException( - f"The following alert group id(s) failed auditing: {', '.join(alert_group_ids_that_failed_audit)}" - ) + msg = f"The following alert group id(s) failed auditing: {', '.join(alert_group_ids_that_failed_audit)}" + task_logger.warning(msg) + raise AlertGroupEscalationPolicyExecutionAuditException(msg) + + task_logger.info("There were no alert groups that failed auditing") send_alert_group_escalation_auditor_task_heartbeat() diff --git a/engine/apps/alerts/tests/test_check_escalation_finished_task.py b/engine/apps/alerts/tests/test_check_escalation_finished_task.py index ab0cfaaf..8e947210 100644 --- a/engine/apps/alerts/tests/test_check_escalation_finished_task.py +++ b/engine/apps/alerts/tests/test_check_escalation_finished_task.py @@ -80,6 +80,27 @@ def test_send_alert_group_escalation_auditor_task_heartbeat_raises_an_exception_ mock_requests.get.return_value.raise_for_status.assert_called_once_with() +@pytest.mark.django_db +def test_audit_alert_group_escalation_skips_validation_if_the_alert_group_does_not_have_an_escalation_chain( + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + alert_group.escalation_snapshot = None + alert_group.save() + + assert alert_group.escalation_chain_exists is False + + try: + audit_alert_group_escalation(alert_group) + except AlertGroupEscalationPolicyExecutionAuditException: + pytest.fail() + + @pytest.mark.django_db def test_audit_alert_group_escalation_raises_exception_if_the_alert_group_does_not_have_an_escalation_snapshot( escalation_snapshot_test_setup,