oncall-engine/engine/apps/alerts/escalation_snapshot/escalation_snapshot_mixin.py
Joey Orlando 4d655dff60
modify check_escalation_finished_task task (#1266)
# What this PR does

This PR:
- modifies the `check_escalation_finished_task` celery task to:
  - do stricter escalation validation based on the alert group's
    escalation snapshot (see the `audit_alert_group_escalation` method in
    `engine/apps/alerts/tasks/check_escalation_finished.py` for the
    validation logic)
  - use a read-only database for querying alert groups if one is
    configured, otherwise use the "default" one
  - ping a configurable heartbeat (new env var
    `ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL` added)
  - change the task's run interval from every 10 minutes to every 13 minutes
    (this can be configured via an env variable)
- adds public documentation on how to configure this auditor task
- modifies the local celery startup command to properly take into
consideration all celery related env vars (similar to the ones we use in
`engine/celery_with_exporter.sh`; this made it easier to enable `celery
beat` locally for testing)
- removes the following code:
  - references to `AlertGroup.estimate_escalation_finish_time`; the model
    field is marked as deprecated using the [`django-deprecate-fields`
    library](https://pypi.org/project/django-deprecate-fields/). This field
    was only used by the previous version of this validation task
  - `EscalationSnapshotMixin.calculate_eta_for_finish_escalation`, which was
    only used to calculate the value for
    `AlertGroup.estimate_escalation_finish_time`
  - the `calculate_escalation_finish_time` celery task
  

## Which issue(s) this PR fixes

https://github.com/grafana/oncall-private/issues/1558

## Checklist

- [x] Tests updated
- [x] Documentation added
- [x] `CHANGELOG.md` updated
2023-03-17 10:14:08 +00:00

260 lines
10 KiB
Python

import datetime
import logging
from typing import Optional
import pytz
from celery import uuid as celery_uuid
from dateutil.parser import parse
from django.apps import apps
from django.utils.functional import cached_property
from rest_framework.exceptions import ValidationError
from apps.alerts.escalation_snapshot.snapshot_classes import (
ChannelFilterSnapshot,
EscalationChainSnapshot,
EscalationPolicySnapshot,
EscalationSnapshot,
)
from apps.alerts.tasks import escalate_alert_group
logger = logging.getLogger(__name__)

# Delay before escalation starts; it is the default Celery `countdown` (i.e. seconds) used by
# `EscalationSnapshotMixin.start_escalation_if_needed`.
# It prevents intermediate activity by the system while a user is performing a multi-step action.
# For example, if a user un-acks and then acks, there is no need to launch escalation right after the un-ack.
START_ESCALATION_DELAY = 10


class EscalationSnapshotMixin:
    """
    Mixin for AlertGroup. It contains methods related to alert group escalation.

    The methods below read the following attributes from the host object: `raw_escalation_snapshot`,
    `channel_filter`, `channel`, `slack_channel_id`, `pk`, `is_escalation_finished`, `active_escalation_id`
    and `save()`.
    """

    def build_raw_escalation_snapshot(self) -> dict:
        """
        Builds new escalation chain in a json serializable format (dict).
        Use this method to prepare escalation chain data for saving to alert group before start new escalation.

        When `escalation_chain_exists` is False, the serializer is fed an empty dict.

        Example result:
        {
            'channel_filter_snapshot': {
                'id': 1,
                'notify_in_slack': True,
                'str_for_clients': 'default',
                'notify_in_telegram': True
            },
            'escalation_chain_snapshot': {
                'id': 1,
                'name': 'Test'
            },
            'escalation_policies_snapshots': [
                {
                    'id': 1,
                    'step': 14,
                    'order': 0,
                    'to_time': None,
                    'from_time': None,
                    'num_alerts_in_window': None,
                    'num_minutes_in_window': None,
                    'wait_delay': None,
                    'notify_schedule': None,
                    'notify_to_group': None,
                    'passed_last_time': None,
                    'escalation_counter': 0,
                    'last_notified_user': None,
                    'custom_button_trigger': None,
                    'notify_to_users_queue': [1,2,3]
                },
                {
                    'id': 2,
                    'step': 0,
                    'order': 1,
                    'to_time': None,
                    'from_time': None,
                    'num_alerts_in_window': None,
                    'num_minutes_in_window': None,
                    'wait_delay': '00:05:00',
                    'notify_schedule': None,
                    'notify_to_group': None,
                    'passed_last_time': None,
                    'escalation_counter': 0,
                    'last_notified_user': None,
                    'custom_button_trigger': None,
                    'notify_to_users_queue': []
                },
            ],
            'slack_channel_id': 'SLACK_CHANNEL_ID',
            'last_active_escalation_policy_order': None,
            'pause_escalation': False,
            'next_step_eta': '2021-10-18T10:28:28.890369Z'
        }

        :return: dict produced by `EscalationSnapshot.serializer(...).data`
        """
        data = {}

        if self.escalation_chain_exists:
            channel_filter = self.channel_filter
            escalation_chain = channel_filter.escalation_chain
            escalation_policies = escalation_chain.escalation_policies.all()
            data = {
                "channel_filter_snapshot": channel_filter,
                "escalation_chain_snapshot": escalation_chain,
                "escalation_policies_snapshots": escalation_policies,
                "slack_channel_id": self.slack_channel_id,
            }
        return EscalationSnapshot.serializer(data).data

    @property
    def channel_filter_with_respect_to_escalation_snapshot(self):
        """
        Returns the channel filter snapshot if there is one, otherwise the live `channel_filter`.
        """
        # Try to get saved channel filter data from escalation snapshot at first because channel filter object
        # can be changed or deleted during escalation
        return self.channel_filter_snapshot or self.channel_filter

    @property
    def escalation_chain_with_respect_to_escalation_snapshot(self):
        """
        Returns the escalation chain snapshot if there is one, otherwise the escalation chain of the live
        `channel_filter` (or None when there is no channel filter).
        """
        # Try to get saved escalation chain data from escalation snapshot at first because escalation chain object
        # can be changed or deleted during escalation
        return self.escalation_chain_snapshot or (self.channel_filter.escalation_chain if self.channel_filter else None)

    @cached_property
    def channel_filter_snapshot(self) -> Optional[ChannelFilterSnapshot]:
        """
        in some cases we need only channel filter and don't want to serialize whole escalation

        :return: ChannelFilterSnapshot, or None when there is no raw snapshot or it holds no channel filter data
        """
        escalation_snapshot = self.raw_escalation_snapshot
        if not escalation_snapshot:
            return None

        channel_filter_snapshot = escalation_snapshot["channel_filter_snapshot"]
        if not channel_filter_snapshot:
            return None

        channel_filter_snapshot = ChannelFilterSnapshot.serializer().to_internal_value(channel_filter_snapshot)
        return ChannelFilterSnapshot(**channel_filter_snapshot)

    @cached_property
    def escalation_chain_snapshot(self) -> Optional[EscalationChainSnapshot]:
        """
        in some cases we need only escalation chain and don't want to serialize whole escalation

        :return: EscalationChainSnapshot, or None when there is no raw snapshot or it holds no escalation chain data
        """
        escalation_snapshot = self.raw_escalation_snapshot
        if not escalation_snapshot:
            return None

        escalation_chain_snapshot = escalation_snapshot["escalation_chain_snapshot"]
        if not escalation_chain_snapshot:
            return None

        escalation_chain_snapshot = EscalationChainSnapshot.serializer().to_internal_value(escalation_chain_snapshot)
        return EscalationChainSnapshot(**escalation_chain_snapshot)

    @cached_property
    def escalation_snapshot(self) -> Optional[EscalationSnapshot]:
        """
        Deserialized escalation snapshot.

        :return: EscalationSnapshot, or None when there is no raw snapshot or when deserialization raises
            ValidationError (the error is logged, not propagated)
        """
        raw_escalation_snapshot = self.raw_escalation_snapshot
        if not raw_escalation_snapshot:
            return None

        try:
            return self._deserialize_escalation_snapshot(raw_escalation_snapshot)
        except ValidationError as e:
            logger.error(f"Error trying to deserialize raw escalation snapshot: {e}")
            return None

    @cached_property
    def has_escalation_policies_snapshots(self) -> bool:
        """
        True when the raw snapshot exists and its "escalation_policies_snapshots" entry is non-empty.
        """
        if not self.raw_escalation_snapshot:
            return False
        return len(self.raw_escalation_snapshot["escalation_policies_snapshots"]) > 0

    def _deserialize_escalation_snapshot(self, raw_escalation_snapshot) -> EscalationSnapshot:
        """
        Deserializes raw escalation snapshot to EscalationSnapshot object with channel_filter_snapshot as
        ChannelFilterSnapshot object and escalation_policies_snapshots as EscalationPolicySnapshot objects

        :param raw_escalation_snapshot: dict
        :return: EscalationSnapshot
        """
        deserialized_escalation_snapshot = EscalationSnapshot.serializer().to_internal_value(raw_escalation_snapshot)

        channel_filter_snapshot = deserialized_escalation_snapshot["channel_filter_snapshot"]
        deserialized_escalation_snapshot["channel_filter_snapshot"] = ChannelFilterSnapshot(**channel_filter_snapshot)

        escalation_chain_snapshot = deserialized_escalation_snapshot["escalation_chain_snapshot"]
        deserialized_escalation_snapshot["escalation_chain_snapshot"] = EscalationChainSnapshot(
            **escalation_chain_snapshot
        )

        # Wrap every raw policy dict into its snapshot class, keeping the original order.
        deserialized_escalation_snapshot["escalation_policies_snapshots"] = [
            EscalationPolicySnapshot(**escalation_policy_snapshot)
            for escalation_policy_snapshot in deserialized_escalation_snapshot["escalation_policies_snapshots"]
        ]

        # The alert group itself is passed as the first positional argument of the snapshot.
        return EscalationSnapshot(self, **deserialized_escalation_snapshot)

    @property
    def escalation_chain_exists(self) -> bool:
        """
        True only when escalation is not paused and the channel filter has an escalation chain.
        """
        if self.pause_escalation:
            return False
        elif not self.channel_filter:
            return False
        return self.channel_filter.escalation_chain is not None

    @property
    def pause_escalation(self) -> bool:
        """
        get pause_escalation field directly to avoid serialization overhead
        """
        if not self.raw_escalation_snapshot:
            return False
        return self.raw_escalation_snapshot.get("pause_escalation", False)

    @property
    def next_step_eta(self) -> Optional[datetime.datetime]:
        """
        get next_step_eta field directly to avoid serialization overhead

        :return: the parsed "next_step_eta" value with its tzinfo replaced by UTC, or None when there is no
            raw snapshot or the value is missing/empty
        """
        if not self.raw_escalation_snapshot:
            return None

        raw_next_step_eta = self.raw_escalation_snapshot.get("next_step_eta")
        if not raw_next_step_eta:
            return None

        return parse(raw_next_step_eta).replace(tzinfo=pytz.UTC)

    def start_escalation_if_needed(self, countdown=START_ESCALATION_DELAY, eta=None):
        """
        Builds a fresh raw escalation snapshot, stores it on the alert group and schedules the
        `escalate_alert_group` task.

        Nothing happens when the channel or its organization has a maintenance mode set, when escalation
        is paused, or when no escalation chain exists.

        :type self:AlertGroup
        :param countdown: forwarded to `escalate_alert_group.apply_async` as `countdown`
        :param eta: forwarded to `escalate_alert_group.apply_async` as `eta`
        """
        # NOTE(review): Celery appears to derive the ETA from a truthy `countdown`, so callers that want `eta`
        # to be honoured should presumably pass `countdown=0` - confirm against the Celery docs.
        is_on_maintenance_or_debug_mode = (
            self.channel.maintenance_mode is not None or self.channel.organization.maintenance_mode is not None
        )
        if is_on_maintenance_or_debug_mode:
            return

        if self.pause_escalation:
            return

        if not self.escalation_chain_exists:
            return

        logger.debug(f"Start escalation for alert group with pk: {self.pk}")

        # Paused alert groups have already returned above, so a fresh snapshot is always built here.
        raw_escalation_snapshot = self.build_raw_escalation_snapshot()
        task_id = celery_uuid()

        # The model is resolved lazily through the app registry, only once it is actually needed.
        AlertGroup = apps.get_model("alerts", "AlertGroup")
        # Store the task id together with the snapshot so that the scheduled task can be matched
        # against `active_escalation_id`.
        AlertGroup.all_objects.filter(pk=self.pk).update(
            active_escalation_id=task_id,
            is_escalation_finished=False,
            raw_escalation_snapshot=raw_escalation_snapshot,
        )
        escalate_alert_group.apply_async((self.pk,), countdown=countdown, immutable=True, eta=eta, task_id=task_id)

    def stop_escalation(self):
        """
        Marks escalation as finished and persists only the two fields that were changed.
        """
        self.is_escalation_finished = True
        # change active_escalation_id to prevent alert escalation
        self.active_escalation_id = "intentionally_stopped"
        self.save(update_fields=["is_escalation_finished", "active_escalation_id"])