oncall-engine/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_snapshot.py
Joey Orlando 4d655dff60
modify check_escalation_finished_task task (#1266)
# What this PR does

This PR:
- modifies the `check_escalation_finished_task` celery task to:
  - do stricter escalation validation based on the alert group's
    escalation snapshot (see the `audit_alert_group_escalation` method in
    `engine/apps/alerts/tasks/check_escalation_finished.py` for the
    validation logic)
  - use a read-only database for querying alert groups if one is
    configured, otherwise use the "default" one
  - ping a configurable heartbeat (new env var
    `ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL` added)
  - change the task interval from every 10 to every 13 minutes, i.e. the
    task now runs less often (this can be configured via an env variable)
  - adds public documentation on how to configure this auditor task
- modifies the local celery startup command to properly take into
consideration all celery related env vars (similar to the ones we use in
`engine/celery_with_exporter.sh`; this made it easier to enable `celery
beat` locally for testing)
- removes the following code:
  - removes references to `AlertGroup.estimate_escalation_finish_time` and
    marks the model field as deprecated using the [`django-deprecate-fields`
    library](https://pypi.org/project/django-deprecate-fields/). This field
    was only used for the previous version of this validation task
  - `EscalationSnapshotMixin.calculate_eta_for_finish_escalation`, which was
    only used to calculate the value for
    `AlertGroup.estimate_escalation_finish_time`
  - `calculate_escalation_finish_time` celery task
  

## Which issue(s) this PR fixes

https://github.com/grafana/oncall-private/issues/1558

## Checklist

- [x] Tests updated
- [x] Documentation added
- [x] `CHANGELOG.md` updated
2023-03-17 10:14:08 +00:00

163 lines
6.9 KiB
Python

import datetime
import logging
import typing

from celery.utils.log import get_task_logger
from django.utils import timezone

from apps.alerts.escalation_snapshot.serializers import EscalationSnapshotSerializer
from apps.alerts.models.alert_group_log_record import AlertGroupLogRecord
# Celery task logger for this module. Its level is set to DEBUG so that the
# logger.debug(...) messages emitted while executing escalation steps are not
# filtered out by this logger.
logger = get_task_logger(__name__)
logger.setLevel(logging.DEBUG)
if typing.TYPE_CHECKING:
from apps.alerts.escalation_snapshot.snapshot_classes import (
ChannelFilterSnapshot,
EscalationChainSnapshot,
EscalationPolicySnapshot,
)
from apps.alerts.models import AlertGroup
class EscalationSnapshot:
    """
    Escalation state of a single alert group.

    Bundles the channel filter / escalation chain / escalation policy snapshots
    together with the progress of the escalation: the order of the last executed
    policy, the ETA of the next step, and the stop/pause flags produced by the
    most recently executed step.

    `last_active_escalation_policy_order` is used as an index into
    `escalation_policies_snapshots`; `None` means no policy has been executed yet.
    """

    __slots__ = (
        "alert_group",
        "channel_filter_snapshot",
        "escalation_chain_snapshot",
        "escalation_policies_snapshots",
        "last_active_escalation_policy_order",
        "slack_channel_id",
        "next_step_eta",
        "stop_escalation",
        "pause_escalation",
    )

    # serializer class used by `convert_to_dict` to turn the snapshot into a dict
    serializer = EscalationSnapshotSerializer

    def __init__(
        self,
        alert_group: "AlertGroup",
        channel_filter_snapshot: "ChannelFilterSnapshot",
        escalation_chain_snapshot: "EscalationChainSnapshot",
        last_active_escalation_policy_order: typing.Optional[int],
        escalation_policies_snapshots: typing.List["EscalationPolicySnapshot"],
        slack_channel_id: str,
        pause_escalation: bool,
        next_step_eta: typing.Optional[datetime.datetime],
    ):
        self.alert_group = alert_group
        self.channel_filter_snapshot = channel_filter_snapshot
        self.escalation_chain_snapshot = escalation_chain_snapshot
        self.last_active_escalation_policy_order = last_active_escalation_policy_order
        self.escalation_policies_snapshots = escalation_policies_snapshots
        self.slack_channel_id = slack_channel_id
        self.pause_escalation = pause_escalation
        self.next_step_eta = next_step_eta
        # not a constructor argument: always starts as False and is only set
        # by `execute_actual_escalation_step`
        self.stop_escalation = False

    @property
    def last_active_escalation_policy_snapshot(self) -> typing.Optional["EscalationPolicySnapshot"]:
        """
        Returns the policy snapshot at index `last_active_escalation_policy_order`,
        or `None` if no policy has been executed yet.
        """
        order = self.last_active_escalation_policy_order
        if order is None:
            return None
        return self.escalation_policies_snapshots[order]

    @property
    def next_active_escalation_policy_snapshot(self) -> typing.Optional["EscalationPolicySnapshot"]:
        """
        Returns the policy snapshot that should be executed next, or `None` when
        `next_active_escalation_policy_order` is past the end of the policy list.
        """
        order = self.next_active_escalation_policy_order
        if len(self.escalation_policies_snapshots) < order + 1:
            return None
        return self.escalation_policies_snapshots[order]

    @property
    def next_active_escalation_policy_order(self) -> int:
        """
        Returns the index of the policy to execute next: `0` if nothing has been
        executed yet, otherwise the last executed order plus one.
        """
        if self.last_active_escalation_policy_order is None:
            return 0
        return self.last_active_escalation_policy_order + 1

    @property
    def executed_escalation_policy_snapshots(self) -> typing.List["EscalationPolicySnapshot"]:
        """
        Returns a list of escalation policy snapshots that have already been executed, according
        to the value of last_active_escalation_policy_order

        The policy at `last_active_escalation_policy_order` itself has been executed, so it is
        included in the result. An empty list is returned when no policy has been executed yet.
        """
        order = self.last_active_escalation_policy_order
        if order is None:
            return []
        # `order` is the index of the last executed policy, hence the inclusive upper bound
        return self.escalation_policies_snapshots[: order + 1]

    def next_step_eta_is_valid(self) -> typing.Optional[bool]:
        """
        `next_step_eta` should never be less than the current time (with a 5 minute buffer provided)
        as this field should be updated as the escalation policy is executed over time. If it is, this means that
        an escalation policy step has been missed, or is substantially delayed

        if `next_step_eta` is `None` then `None` is returned, otherwise a boolean is returned
        representing the result of the time comparison
        """
        if self.next_step_eta is None:
            return None
        return self.next_step_eta > (timezone.now() - datetime.timedelta(minutes=5))

    def save_to_alert_group(self) -> None:
        """
        Serializes the snapshot into `alert_group.raw_escalation_snapshot` and saves
        the alert group, updating only that field.
        """
        self.alert_group.raw_escalation_snapshot = self.convert_to_dict()
        self.alert_group.save(update_fields=["raw_escalation_snapshot"])

    def convert_to_dict(self) -> dict:
        """Returns the snapshot serialized with `self.serializer`."""
        return self.serializer(self).data

    def execute_actual_escalation_step(self) -> None:
        """
        Executes actual escalation step and saves result of execution like stop_escalation param and eta,
        that will be used for start next escalate_alert_group task.
        Also updates self.last_active_escalation_policy_order if escalation step was executed.

        When there is no next policy to execute, an "escalation finished" log record is saved
        and `stop_escalation` is set to `True`.
        """
        escalation_policy_snapshot = self.next_active_escalation_policy_snapshot

        if escalation_policy_snapshot is None:
            AlertGroupLogRecord(
                type=AlertGroupLogRecord.TYPE_ESCALATION_FINISHED,
                alert_group=self.alert_group,
                reason="escalation finished",
            ).save()
            self.stop_escalation = True
            logger.debug(
                "escalation_policy_snapshot is None, stop escalation. Last escalation policy snapshot order "
                f"{self.last_active_escalation_policy_order}"
            )
            return

        logger.debug(
            f"Starting to execute escalation step {escalation_policy_snapshot.step_display} with order "
            f"{escalation_policy_snapshot.order}"
        )

        reason = f"lifecycle rule for {self.channel_filter_snapshot.str_for_clients} route"

        # get execution result in namedtuple format and save its data
        # (e.g. StepExecutionResultData(eta=None, start_from_beginning=False, stop_escalation=False)
        execution_result = escalation_policy_snapshot.execute(alert_group=self.alert_group, reason=reason)

        self.next_step_eta = execution_result.eta
        self.stop_escalation = execution_result.stop_escalation  # result of STEP_FINAL_RESOLVE
        self.pause_escalation = execution_result.pause_escalation  # result of STEP_NOTIFY_IF_NUM_ALERTS_IN_WINDOW

        last_active_escalation_policy_order = escalation_policy_snapshot.order

        if execution_result.start_from_beginning:  # result of STEP_REPEAT_ESCALATION_N_TIMES
            last_active_escalation_policy_order = None

        # do not advance to the next escalation policy if escalation is paused
        # (checked last, so a pause takes precedence over restarting from the beginning)
        if execution_result.pause_escalation:
            last_active_escalation_policy_order = self.last_active_escalation_policy_order

        self.last_active_escalation_policy_order = last_active_escalation_policy_order

        logger.debug(
            f"Finished to execute escalation step {escalation_policy_snapshot.step_display} with order "
            f"{escalation_policy_snapshot.order}, next escalation policy snapshot order "
            f"{self.next_active_escalation_policy_order}"
        )