# What this PR does

This PR:

- modifies the `check_escalation_finished_task` celery task to:
  - do stricter escalation validation based on the alert group's escalation snapshot (see the `audit_alert_group_escalation` method in `engine/apps/alerts/tasks/check_escalation_finished.py` for the validation logic)
  - use a read-only database for querying alert groups if one is configured, otherwise fall back to the "default" one
  - ping a configurable heartbeat (new env var `ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL` added; see the settings sketch below)
  - run every 13 minutes instead of every 10 minutes (this interval can be configured via an env variable)
- adds public documentation on how to configure this auditor task
- modifies the local celery startup command to properly take into account all celery-related env vars (similar to the ones we use in `engine/celery_with_exporter.sh`; this made it easier to enable `celery beat` locally for testing)
- removes the following code:
  - references to `AlertGroup.estimate_escalation_finish_time`; the model field is marked as deprecated using the [`django-deprecate-fields` library](https://pypi.org/project/django-deprecate-fields/) (see the sketch below). This field was only used by the previous version of this validation task
  - `EscalationSnapshotMixin.calculate_eta_for_finish_escalation`, which was only used to calculate the value for `AlertGroup.estimate_escalation_finish_time`
  - the `calculate_escalation_finish_time` celery task

## Which issue(s) this PR fixes

https://github.com/grafana/oncall-private/issues/1558

## Checklist

- [x] Tests updated
- [x] Documentation added
- [x] `CHANGELOG.md` updated
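For reference, the new heartbeat URL is read from the environment. A minimal sketch of how this could be wired up in Django settings; the exact settings code in OnCall may differ:

```python
import os

# heartbeat URL for the escalation auditor task; unset means "skip the heartbeat ping"
ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL = os.environ.get(
    "ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL", None
)
```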
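As a reference for the deprecation approach, here is a minimal sketch of how a field is typically marked deprecated with `django-deprecate-fields`; the field rendering below is illustrative, not the exact `AlertGroup` definition from this PR:

```python
from django.db import models
from django_deprecate_fields import deprecate_field


class AlertGroup(models.Model):
    # reads of this attribute now return None (and log a deprecation warning)
    # instead of hitting the database column, so the column can be dropped later
    estimate_escalation_finish_time = deprecate_field(models.DateTimeField(null=True, default=None))
```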
`engine/apps/alerts/tasks/check_escalation_finished.py`:

```python
import datetime
import typing

import requests
from celery import shared_task
from django.apps import apps
from django.conf import settings
from django.db.models import Q
from django.utils import timezone

from apps.alerts.tasks.task_logger import task_logger
from common.database import get_random_readonly_database_key_if_present_otherwise_default

if typing.TYPE_CHECKING:
    from apps.alerts.models.alert_group import AlertGroup


class AlertGroupEscalationPolicyExecutionAuditException(Exception):
    """This exception is raised when an alert group's escalation policy did not execute properly for some reason"""


def send_alert_group_escalation_auditor_task_heartbeat() -> None:
    heartbeat_url = settings.ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL
    if heartbeat_url:
        task_logger.info(f"Sending heartbeat to configured URL: {heartbeat_url}")
        # surface failed pings (non-2xx responses) as task errors
        requests.get(heartbeat_url).raise_for_status()
        task_logger.info(f"Heartbeat successfully sent to {heartbeat_url}")
    else:
        task_logger.info("Skipping sending heartbeat as no heartbeat URL is configured")


def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
    escalation_snapshot = alert_group.escalation_snapshot
    alert_group_id = alert_group.id
    base_msg = f"Alert group {alert_group_id}"

    if not escalation_snapshot:
        raise AlertGroupEscalationPolicyExecutionAuditException(
            f"{base_msg} does not have an escalation snapshot associated with it, this should never occur"
        )
    task_logger.info(f"{base_msg} has an escalation snapshot associated with it, auditing if it executed properly")

    escalation_policies_snapshots = escalation_snapshot.escalation_policies_snapshots

    if not escalation_policies_snapshots:
        task_logger.info(
            f"{base_msg}'s escalation snapshot has an empty escalation_policies_snapshots, skipping further validation"
        )
        return
    task_logger.info(
        f"{base_msg}'s escalation snapshot has a populated escalation_policies_snapshots, continuing validation"
    )

    if escalation_snapshot.next_step_eta_is_valid() is False:
        raise AlertGroupEscalationPolicyExecutionAuditException(
            f"{base_msg}'s escalation snapshot does not have a valid next_step_eta: {escalation_snapshot.next_step_eta}"
        )
    task_logger.info(f"{base_msg}'s escalation snapshot has a valid next_step_eta: {escalation_snapshot.next_step_eta}")

    executed_escalation_policy_snapshots = escalation_snapshot.executed_escalation_policy_snapshots
    num_of_executed_escalation_policy_snapshots = len(executed_escalation_policy_snapshots)

    if num_of_executed_escalation_policy_snapshots == 0:
        task_logger.info(
            f"{base_msg}'s escalation snapshot does not have any executed escalation policies, skipping further validation"
        )
    else:
        task_logger.info(
            f"{base_msg}'s escalation snapshot has {num_of_executed_escalation_policy_snapshots} executed escalation policies"
        )

    # TODO: consider adding the below checks later on. This is a bit trickier to properly audit as the
    # number of log records can vary if there are any STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW or
    # STEP_REPEAT_ESCALATION_N_TIMES escalation policy steps in the escalation chain
    # see conversations in the original PR (https://github.com/grafana/oncall/pull/1266) for more context on this
    #
    # compare number of triggered/failed alert group log records to the number of executed
    # escalation policy snapshot steps
    # num_of_relevant_log_records = AlertGroupLogRecord.objects.filter(
    #     alert_group_id=alert_group_id,
    #     type__in=[AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, AlertGroupLogRecord.TYPE_ESCALATION_FAILED],
    # ).count()

    # if num_of_relevant_log_records < num_of_executed_escalation_policy_snapshots:
    #     raise AlertGroupEscalationPolicyExecutionAuditException(
    #         f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is less "
    #         f"than the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})"
    #     )

    # task_logger.info(
    #     f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is greater "
    #     f"than or equal to the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})"
    # )

    task_logger.info(f"{base_msg} passed the audit checks")


def get_auditable_alert_groups_started_at_range() -> typing.Tuple[datetime.datetime, datetime.datetime]:
    """
    NOTE: this started_at__range is a bit of a hack...

    We wanted to avoid performing a migration on the alerts_alertgroup table to update
    alert groups where raw_escalation_snapshot was None. raw_escalation_snapshot being None is a legitimate case,
    where the alert group's integration does not have an escalation chain associated with it.

    However, we wanted a way to be able to differentiate between "actually None" and "there was an error writing to
    raw_escalation_snapshot" (as this is performed async by a celery task).

    This field was updated, in the commit that added this comment, to no longer be set to None by default.
    As part of this celery task we check that this field is in fact not None, so if we were to audit older
    alert groups, whose integration did not have an escalation chain at the time the alert group was created,
    we would raise errors.
    """
    # use an aware datetime for the lower bound so it is comparable with timezone.now()
    return (datetime.datetime(2023, 3, 25, tzinfo=datetime.timezone.utc), timezone.now() - timezone.timedelta(days=2))


# don't retry this task as the AlertGroup DB query is rather expensive
@shared_task
def check_escalation_finished_task():
    AlertGroup = apps.get_model("alerts", "AlertGroup")
    AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel")

    # query a read-only database replica, if one is configured, to keep this
    # expensive query off the primary
    alert_groups = AlertGroup.all_objects.using(get_random_readonly_database_key_if_present_otherwise_default()).filter(
        ~Q(channel__integration=AlertReceiveChannel.INTEGRATION_MAINTENANCE),
        ~Q(silenced=True, silenced_until__isnull=True),  # filter out alert groups that are silenced forever
        is_escalation_finished=False,
        resolved=False,
        acknowledged=False,
        root_alert_group=None,
        started_at__range=get_auditable_alert_groups_started_at_range(),
    )

    if not alert_groups.exists():
        task_logger.info("There are no alert groups to audit, everything is good :)")

    # audit every alert group and raise a single exception at the end, so one
    # failing alert group does not prevent the others from being audited
    alert_group_ids_that_failed_audit: typing.List[str] = []

    for alert_group in alert_groups:
        try:
            audit_alert_group_escalation(alert_group)
        except AlertGroupEscalationPolicyExecutionAuditException:
            alert_group_ids_that_failed_audit.append(str(alert_group.id))

    if alert_group_ids_that_failed_audit:
        raise AlertGroupEscalationPolicyExecutionAuditException(
            f"The following alert group id(s) failed auditing: {', '.join(alert_group_ids_that_failed_audit)}"
        )

    send_alert_group_escalation_auditor_task_heartbeat()
```
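For context on the read-only routing above, here is a plausible sketch of what a helper like `common.database.get_random_readonly_database_key_if_present_otherwise_default` could look like, assuming a `READONLY_DATABASES` settings dict; this is an assumption, as the actual helper is not shown in this PR:

```python
import random

from django.conf import settings


def get_random_readonly_database_key_if_present_otherwise_default() -> str:
    # pick one of the configured read-only replica aliases at random, or fall
    # back to the "default" database alias when none are configured
    readonly_dbs = getattr(settings, "READONLY_DATABASES", None) or {}
    return random.choice(list(readonly_dbs)) if readonly_dbs else "default"
```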
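And a hypothetical `celery beat` schedule entry matching the new 13-minute cadence; the task path and the exact schedule wiring (which OnCall derives from env vars) are assumptions:

```python
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    "check_escalation_finished": {
        # assumed dotted path, based on the file living at
        # engine/apps/alerts/tasks/check_escalation_finished.py
        "task": "apps.alerts.tasks.check_escalation_finished.check_escalation_finished_task",
        # */13 fires at minutes 0, 13, 26, 39, and 52 of each hour
        "schedule": crontab(minute="*/13"),
    },
}
```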