Clean up check escalation finished task (#2943)
# What this PR does

Clean up check escalation finished task, update description

## Checklist

- [ ] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not required)
- [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required)
This commit is contained in:
parent
585bbe486a
commit
361d45dd02
1 changed files with 7 additions and 24 deletions
|
|
@ -81,36 +81,19 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
|
|||
f"{base_msg}'s escalation snapshot has {num_of_executed_escalation_policy_snapshots} executed escalation policies"
|
||||
)
|
||||
|
||||
# TODO: consider adding the below checks later on. This is a bit trickier to properly audit as the
|
||||
# number of log records can vary if there are any STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW or
|
||||
# STEP_REPEAT_ESCALATION_N_TIMES escalation policy steps in the escalation chain
|
||||
# see conversations in the original PR (https://github.com/grafana/oncall/pull/1266) for more context on this
|
||||
#
|
||||
# compare number of triggered/failed alert group log records to the number of executed
|
||||
# escalation policy snapshot steps
|
||||
# num_of_relevant_log_records = AlertGroupLogRecord.objects.filter(
|
||||
# alert_group_id=alert_group_id,
|
||||
# type__in=[AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, AlertGroupLogRecord.TYPE_ESCALATION_FAILED],
|
||||
# ).count()
|
||||
|
||||
# if num_of_relevant_log_records < num_of_executed_escalation_policy_snapshots:
|
||||
# raise AlertGroupEscalationPolicyExecutionAuditException(
|
||||
# f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is less "
|
||||
# f"than the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})"
|
||||
# )
|
||||
|
||||
# task_logger.info(
|
||||
# f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is greater "
|
||||
# f"than or equal to the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})"
|
||||
# )
|
||||
|
||||
task_logger.info(f"{base_msg} passed the audit checks")
|
||||
|
||||
|
||||
@shared_task
|
||||
def check_escalation_finished_task() -> None:
|
||||
"""
|
||||
don't retry this task, the idea is to be alerted of failures
|
||||
This task takes alert groups with active escalation, checks if escalation snapshot with escalation policies
|
||||
was created and next escalation step eta is higher than now minus 5 min for every active alert group,
|
||||
which means that escalations are going as expected.
|
||||
If there are alert groups that failed the check, it raises an exception. Otherwise, it sends a heartbeat. A missing heartbeat
|
||||
raises alert.
|
||||
|
||||
Attention: don't retry this task, the idea is to be alerted of failures
|
||||
"""
|
||||
from apps.alerts.models import AlertGroup
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue