Clean up check escalation finished task (#2943)

# What this PR does

Clean up the check escalation finished task and update its description.

## Checklist

- [ ] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not required)
- [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required)
2023-09-01 12:48:47 +02:00 · 2023-09-01 12:48:47 +02:00 · 361d45dd02
commit 361d45dd02
parent 585bbe486a
1 changed files with 7 additions and 24 deletions
--- a/engine/apps/alerts/tasks/check_escalation_finished.py
+++ b/engine/apps/alerts/tasks/check_escalation_finished.py
@ -81,36 +81,19 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
            f"{base_msg}'s escalation snapshot has {num_of_executed_escalation_policy_snapshots} executed escalation policies"
        )

-    # TODO: consider adding the below checks later on. This is it a bit trickier to properly audit as the
-    # number of log records can vary if there are any STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW or
-    # STEP_REPEAT_ESCALATION_N_TIMES escalation policy steps in the escalation chain
-    # see conversations in the original PR (https://github.com/grafana/oncall/pull/1266) for more context on this
-    #
-    # compare number of triggered/failed alert group log records to the number of executed
-    # escalation policy snapshot steps
-    # num_of_relevant_log_records = AlertGroupLogRecord.objects.filter(
-    #     alert_group_id=alert_group_id,
-    #     type__in=[AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, AlertGroupLogRecord.TYPE_ESCALATION_FAILED],
-    # ).count()
-
-    # if num_of_relevant_log_records < num_of_executed_escalation_policy_snapshots:
-    #     raise AlertGroupEscalationPolicyExecutionAuditException(
-    #         f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is less "
-    #         f"than the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})"
-    #     )
-
-    # task_logger.info(
-    #     f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is greater "
-    #     f"than or equal to the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})"
-    # )
-
    task_logger.info(f"{base_msg} passed the audit checks")


@shared_task
 def check_escalation_finished_task() -> None:
    """
-    don't retry this task, the idea is to be alerted of failures
+    This task takes alert groups with active escalation and checks, for every such alert group, that an escalation
+    snapshot with escalation policies was created and that the next escalation step ETA is later than now minus
+    5 minutes, which means that escalations are proceeding as expected.
+    If any alert groups fail the check, it raises an exception. Otherwise, it sends a heartbeat. A missing heartbeat
+    raises an alert.
+
+    Attention: don't retry this task, the idea is to be alerted of failures
    """
    from apps.alerts.models import AlertGroup