# What this PR does

This PR:

- modifies the `check_escalation_finished_task` celery task to:
  - do stricter escalation validation based on the alert group's escalation snapshot (see the `audit_alert_group_escalation` method in `engine/apps/alerts/tasks/check_escalation_finished.py` for the validation logic)
  - use a read-only database for querying alert groups if one is configured, otherwise use the "default" one
  - ping a configurable heartbeat (new env var `ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL` added)
  - run every 13 minutes instead of every 10 minutes (the interval can be configured via an env variable)
- adds public documentation on how to configure this auditor task
- modifies the local celery startup command to properly take into consideration all celery-related env vars (similar to the ones we use in `engine/celery_with_exporter.sh`; this made it easier to enable `celery beat` locally for testing)
- removes the following code:
  - removes references to `AlertGroup.estimate_escalation_finish_time` and marks the model field as deprecated using the [`django-deprecate-fields` library](https://pypi.org/project/django-deprecate-fields/). This field was only used for the previous version of this validation task
  - `EscalationSnapshotMixin.calculate_eta_for_finish_escalation`, which was only used to calculate the value for `AlertGroup.estimate_escalation_finish_time`
  - the `calculate_escalation_finish_time` celery task

## Which issue(s) this PR fixes

https://github.com/grafana/oncall-private/issues/1558

## Checklist

- [x] Tests updated
- [x] Documentation added
- [x] `CHANGELOG.md` updated
260 lines
10 KiB
Python
260 lines
10 KiB
Python
import datetime
import logging
from typing import Optional

import pytz
from celery import uuid as celery_uuid
from dateutil.parser import parse
from django.apps import apps
from django.utils.functional import cached_property
from rest_framework.exceptions import ValidationError

from apps.alerts.escalation_snapshot.snapshot_classes import (
    ChannelFilterSnapshot,
    EscalationChainSnapshot,
    EscalationPolicySnapshot,
    EscalationSnapshot,
)
from apps.alerts.tasks import escalate_alert_group

logger = logging.getLogger(__name__)

# Delay applied before escalation starts, so that intermediate states of a multi-step user action do not
# trigger it. For example, if a user un-acknowledges and then acknowledges again, escalation should not be
# launched right after the un-acknowledge.
# Used as the default `countdown` passed to `escalate_alert_group.apply_async` in
# `EscalationSnapshotMixin.start_escalation_if_needed` (Celery interprets `countdown` in seconds).
START_ESCALATION_DELAY = 10


class EscalationSnapshotMixin:
    """
    Mixin for AlertGroup. It contains methods related to alert group escalation.

    The methods below read or write the following members of `self`, which the host class must provide:
    `raw_escalation_snapshot`, `channel_filter`, `channel`, `slack_channel_id`, `pk`,
    `is_escalation_finished`, `active_escalation_id` and `save()`.
    """

    def build_raw_escalation_snapshot(self) -> dict:
        """
        Builds new escalation chain in a json serializable format (dict).
        Use this method to prepare escalation chain data for saving to alert group before start new escalation.

        When `escalation_chain_exists` is False the serializer is fed an empty dict.

        Example result:
        {
            'channel_filter_snapshot': {
                'id': 1,
                'notify_in_slack': True,
                'str_for_clients': 'default',
                'notify_in_telegram': True
            },
            'escalation_chain_snapshot': {
                'id': 1,
                'name': 'Test'
            },
            'escalation_policies_snapshots': [
                {
                    'id': 1,
                    'step': 14,
                    'order': 0,
                    'to_time': None,
                    'from_time': None,
                    'num_alerts_in_window': None,
                    'num_minutes_in_window': None,
                    'wait_delay': None,
                    'notify_schedule': None,
                    'notify_to_group': None,
                    'passed_last_time': None,
                    'escalation_counter': 0,
                    'last_notified_user': None,
                    'custom_button_trigger': None,
                    'notify_to_users_queue': [1,2,3]
                },
                {
                    'id': 2,
                    'step': 0,
                    'order': 1,
                    'to_time': None,
                    'from_time': None,
                    'num_alerts_in_window': None,
                    'num_minutes_in_window': None,
                    'wait_delay': '00:05:00',
                    'notify_schedule': None,
                    'notify_to_group': None,
                    'passed_last_time': None,
                    'escalation_counter': 0,
                    'last_notified_user': None,
                    'custom_button_trigger': None,
                    'notify_to_users_queue': []
                },
            ],
            'slack_channel_id': 'SLACK_CHANNEL_ID',
            'last_active_escalation_policy_order': None,
            'pause_escalation': False,
            'next_step_eta': '2021-10-18T10:28:28.890369Z'
        }
        """
        data = {}

        if self.escalation_chain_exists:
            channel_filter = self.channel_filter
            escalation_chain = channel_filter.escalation_chain
            escalation_policies = escalation_chain.escalation_policies.all()

            data = {
                "channel_filter_snapshot": channel_filter,
                "escalation_chain_snapshot": escalation_chain,
                "escalation_policies_snapshots": escalation_policies,
                "slack_channel_id": self.slack_channel_id,
            }
        return EscalationSnapshot.serializer(data).data

    @property
    def channel_filter_with_respect_to_escalation_snapshot(self):
        """
        Return the channel filter snapshot if there is one, otherwise the live `channel_filter`.
        """
        # Try to get saved channel filter data from escalation snapshot at first because channel filter object
        # can be changed or deleted during escalation
        return self.channel_filter_snapshot or self.channel_filter

    @property
    def escalation_chain_with_respect_to_escalation_snapshot(self):
        """
        Return the escalation chain snapshot if there is one, otherwise the escalation chain of the live
        `channel_filter` (or None when there is no channel filter).
        """
        # Try to get saved escalation chain data from escalation snapshot at first because escalation chain object
        # can be changed or deleted during escalation
        return self.escalation_chain_snapshot or (self.channel_filter.escalation_chain if self.channel_filter else None)

    @cached_property
    def channel_filter_snapshot(self) -> Optional[ChannelFilterSnapshot]:
        """
        in some cases we need only channel filter and don't want to serialize whole escalation

        :return: ChannelFilterSnapshot built from the "channel_filter_snapshot" entry of the raw escalation
            snapshot, or None when the raw snapshot or that entry is empty
        """
        escalation_snapshot = self.raw_escalation_snapshot
        if not escalation_snapshot:
            return None

        channel_filter_snapshot = escalation_snapshot["channel_filter_snapshot"]
        if not channel_filter_snapshot:
            return None

        channel_filter_snapshot = ChannelFilterSnapshot.serializer().to_internal_value(channel_filter_snapshot)
        return ChannelFilterSnapshot(**channel_filter_snapshot)

    @cached_property
    def escalation_chain_snapshot(self) -> Optional[EscalationChainSnapshot]:
        """
        in some cases we need only escalation chain and don't want to serialize whole escalation

        :return: EscalationChainSnapshot built from the "escalation_chain_snapshot" entry of the raw escalation
            snapshot, or None when the raw snapshot or that entry is empty
        """
        escalation_snapshot = self.raw_escalation_snapshot
        if not escalation_snapshot:
            return None

        escalation_chain_snapshot = escalation_snapshot["escalation_chain_snapshot"]
        if not escalation_chain_snapshot:
            return None

        escalation_chain_snapshot = EscalationChainSnapshot.serializer().to_internal_value(escalation_chain_snapshot)
        return EscalationChainSnapshot(**escalation_chain_snapshot)

    @cached_property
    def escalation_snapshot(self) -> Optional[EscalationSnapshot]:
        """
        Deserialize the whole raw escalation snapshot.

        :return: EscalationSnapshot, or None when there is no raw snapshot or when deserialization raises
            ValidationError (the error is logged, not propagated)
        """
        raw_escalation_snapshot = self.raw_escalation_snapshot
        if raw_escalation_snapshot:
            try:
                return self._deserialize_escalation_snapshot(raw_escalation_snapshot)
            except ValidationError as e:
                logger.error(f"Error trying to deserialize raw escalation snapshot: {e}")
        return None

    @cached_property
    def has_escalation_policies_snapshots(self) -> bool:
        """
        Tell whether the raw escalation snapshot lists at least one escalation policy snapshot.
        Reads the raw dict directly instead of deserializing the whole snapshot.
        """
        if not self.raw_escalation_snapshot:
            return False
        return len(self.raw_escalation_snapshot["escalation_policies_snapshots"]) > 0

    def _deserialize_escalation_snapshot(self, raw_escalation_snapshot) -> EscalationSnapshot:
        """
        Deserializes raw escalation snapshot to EscalationSnapshot object with channel_filter_snapshot as
        ChannelFilterSnapshot object and escalation_policies_snapshots as EscalationPolicySnapshot objects
        :param raw_escalation_snapshot: dict
        :return: EscalationSnapshot
        """
        deserialized_escalation_snapshot = EscalationSnapshot.serializer().to_internal_value(raw_escalation_snapshot)

        # Replace each validated sub-dict with the corresponding snapshot object
        channel_filter_snapshot = deserialized_escalation_snapshot["channel_filter_snapshot"]
        deserialized_escalation_snapshot["channel_filter_snapshot"] = ChannelFilterSnapshot(**channel_filter_snapshot)

        escalation_chain_snapshot = deserialized_escalation_snapshot["escalation_chain_snapshot"]
        deserialized_escalation_snapshot["escalation_chain_snapshot"] = EscalationChainSnapshot(
            **escalation_chain_snapshot
        )

        escalation_policies_snapshots_raw = deserialized_escalation_snapshot["escalation_policies_snapshots"]
        deserialized_escalation_snapshot["escalation_policies_snapshots"] = [
            EscalationPolicySnapshot(**escalation_policy_snapshot)
            for escalation_policy_snapshot in escalation_policies_snapshots_raw
        ]

        # The alert group itself (self) is passed as the first positional argument
        return EscalationSnapshot(self, **deserialized_escalation_snapshot)

    @property
    def escalation_chain_exists(self) -> bool:
        """
        False when escalation is paused or there is no channel filter; otherwise tells whether the channel
        filter has an escalation chain.
        """
        if self.pause_escalation:
            return False
        elif not self.channel_filter:
            return False
        return self.channel_filter.escalation_chain is not None

    @property
    def pause_escalation(self) -> bool:
        """
        get pause_escalation field directly to avoid serialization overhead

        :return: the "pause_escalation" entry of the raw snapshot; False when the snapshot or the entry is missing
        """
        if not self.raw_escalation_snapshot:
            return False
        return self.raw_escalation_snapshot.get("pause_escalation", False)

    @property
    def next_step_eta(self) -> Optional[datetime.datetime]:
        """
        get next_step_eta field directly to avoid serialization overhead

        :return: the "next_step_eta" entry of the raw snapshot parsed into a datetime, or None when the
            snapshot or the entry is empty
        """
        if not self.raw_escalation_snapshot:
            return None

        raw_next_step_eta = self.raw_escalation_snapshot.get("next_step_eta")
        if not raw_next_step_eta:
            return None

        # tzinfo is overwritten with UTC via replace(); the parsed value is not converted between time zones
        return parse(raw_next_step_eta).replace(tzinfo=pytz.UTC)

    def start_escalation_if_needed(self, countdown=START_ESCALATION_DELAY, eta=None):
        """
        Save a freshly built raw escalation snapshot together with a new escalation task id on the alert group
        and schedule the `escalate_alert_group` task under that id.

        Does nothing when the channel or its organization has a maintenance mode set, when escalation is paused,
        or when there is no escalation chain.

        :type self:AlertGroup
        :param countdown: forwarded to `escalate_alert_group.apply_async`
        :param eta: forwarded to `escalate_alert_group.apply_async`
        """
        # NOTE(review): both `countdown` and `eta` are forwarded as-is; confirm against the Celery docs which one
        # takes effect when both are set.
        is_on_maintenance_or_debug_mode = (
            self.channel.maintenance_mode is not None or self.channel.organization.maintenance_mode is not None
        )
        if is_on_maintenance_or_debug_mode:
            return

        if self.pause_escalation:
            return

        if not self.escalation_chain_exists:
            return

        logger.debug(f"Start escalation for alert group with pk: {self.pk}")

        # Escalation cannot be paused at this point (see the guard above), so a new snapshot is always built.
        raw_escalation_snapshot = self.build_raw_escalation_snapshot()
        task_id = celery_uuid()

        # Model is resolved through the app registry only once it is actually needed
        # (presumably a direct import would be circular - TODO confirm).
        AlertGroup = apps.get_model("alerts", "AlertGroup")
        AlertGroup.all_objects.filter(pk=self.pk).update(
            active_escalation_id=task_id,
            is_escalation_finished=False,
            raw_escalation_snapshot=raw_escalation_snapshot,
        )
        escalate_alert_group.apply_async((self.pk,), countdown=countdown, immutable=True, eta=eta, task_id=task_id)

    def stop_escalation(self):
        """
        Mark escalation as finished and persist only the two changed fields.
        """
        self.is_escalation_finished = True
        # change active_escalation_id to prevent alert escalation
        self.active_escalation_id = "intentionally_stopped"
        self.save(update_fields=["is_escalation_finished", "active_escalation_id"])