oncall-engine/engine/apps/alerts/models/alert_group.py
Joey Orlando deb6a45588
chore: convert two slack channel ID char fields to foreign keys (#5224)
# What this PR does

Similar to https://github.com/grafana/oncall/pull/5199

Converts the following char fields to foreign key relationships on the
`SlackChannel` table:
- `ResolutionNoteSlackMessage.channel_id` ->
`ResolutionNoteSlackMessage.slack_channel`
- `ChannelFilter.slack_channel_id` -> `ChannelFilter.slack_channel`

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
    show up in the autogenerated release notes.
2024-11-04 13:34:06 -05:00

2023 lines
82 KiB
Python

import datetime
import logging
import typing
import urllib
from collections import namedtuple
from functools import partial
from celery import uuid as celery_uuid
from django.conf import settings
from django.core.validators import MinLengthValidator
from django.db import IntegrityError, models, transaction
from django.db.models import JSONField, Q, QuerySet
from django.utils import timezone
from django.utils.functional import cached_property
from apps.alerts.constants import ActionSource, AlertGroupState
from apps.alerts.escalation_snapshot import EscalationSnapshotMixin
from apps.alerts.escalation_snapshot.escalation_snapshot_mixin import START_ESCALATION_DELAY
from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
from apps.alerts.incident_appearance.renderers.slack_renderer import AlertGroupSlackRenderer
from apps.alerts.incident_log_builder import IncidentLogBuilder
from apps.alerts.signals import alert_group_created_signal
from apps.alerts.tasks import (
acknowledge_reminder_task,
send_alert_group_signal,
send_alert_group_signal_for_delete,
unsilence_task,
)
from apps.grafana_plugin.ui_url_builder import UIURLBuilder
from apps.metrics_exporter.tasks import update_metrics_for_alert_group
from apps.slack.slack_formatter import SlackFormatter
from apps.user_management.models import User
from common.public_primary_keys import generate_public_primary_key, increase_public_primary_key_length
from common.utils import clean_markup, str_or_backup
from .alert_group_counter import AlertGroupCounter
if typing.TYPE_CHECKING:
from django.db.models.manager import RelatedManager
from apps.alerts.models import (
Alert,
AlertGroupLogRecord,
AlertReceiveChannel,
BundledNotification,
Invitation,
RelatedIncident,
ResolutionNote,
ResolutionNoteSlackMessage,
)
from apps.base.models import UserNotificationPolicyLogRecord
from apps.labels.models import AlertGroupAssociatedLabel
from apps.slack.models import SlackMessage
logger = logging.getLogger(__name__)
# Forced to DEBUG so the detailed state-transition logging in this module is always emitted.
logger.setLevel(logging.DEBUG)
def generate_public_primary_key_for_alert_group():
    """Produce a unique public primary key for a new AlertGroup.

    Keys use the "I" prefix; on a collision the key length is increased
    until an unused value is found.
    """
    prefix = "I"
    candidate = generate_public_primary_key(prefix)
    attempts = 0
    while AlertGroup.objects.filter(public_primary_key=candidate).exists():
        candidate = increase_public_primary_key_length(
            failure_counter=attempts, prefix=prefix, model_name="AlertGroup"
        )
        attempts += 1
    return candidate
class LogRecordUser(typing.TypedDict):
    """Serialized user payload attached to a rendered log record (see LogRecords.author)."""

    username: str
    pk: str
    avatar: str
    avatar_full: str
class PagedUser(typing.TypedDict):
    """Serialized payload for a user paged for an alert group.

    Built by ``AlertGroup.get_paged_users``.
    """

    id: int
    username: str
    name: str
    pk: str
    avatar: str
    avatar_full: str
    important: bool
    # get_paged_users() also emits the user's teams as {"pk", "name"} dicts;
    # this key was previously missing from the TypedDict declaration.
    teams: typing.List[typing.Dict[str, str]]
class LogRecords(typing.TypedDict):
    """Serialized payload for a single rendered log record entry."""

    time: str  # humanized delta relative to now
    action: str  # human-friendly description
    realm: typing.Literal["user_notification", "alert_group", "resolution_note"]
    type: int  # depending on realm, check type choices
    created_at: str  # timestamp
    author: LogRecordUser
class Permalinks(typing.TypedDict):
    """Links to this alert group in the various UIs; only "web" is always available."""

    slack: typing.Optional[str]
    slack_app: typing.Optional[str]
    telegram: typing.Optional[str]
    web: str
class AlertGroupQuerySet(models.QuerySet):
    """Custom queryset that handles alert-group creation and alert-grouping lookups."""

    def create(self, **kwargs):
        # Assign the next per-organization sequence number at creation time.
        # AlertGroupCounter uses optimistic locking, so this can raise under contention.
        organization = kwargs["channel"].organization
        inside_organization_number = AlertGroupCounter.objects.get_value(organization=organization) + 1
        return super().create(**kwargs, inside_organization_number=inside_organization_number)

    def get_or_create_grouping(self, channel, channel_filter, group_data, received_at=None):
        """
        This method is similar to default Django QuerySet.get_or_create(), please see the original get_or_create method.
        The difference is that this method is trying to get an object using multiple queries with different filters.
        Also, "create" is invoked without transaction.atomic to reduce number of ConcurrentUpdateError's which can be
        raised in AlertGroupQuerySet.create() due to optimistic locking of AlertGroupCounter model.

        Returns an ``(alert_group, created)`` tuple, mirroring get_or_create().
        """
        search_params = {
            "channel": channel,
            "channel_filter": channel_filter,
            "distinction": group_data.group_distinction,
        }
        # Try to return the last open group
        # Note that (channel, channel_filter, distinction, is_open_for_grouping) is in unique_together
        try:
            return self.get(**search_params, is_open_for_grouping__isnull=False), False
        except self.model.DoesNotExist:
            pass
        # If it's an "OK" alert, try to return the latest resolved group
        # (only if the channel allows source base resolving and the alert is a resolve signal)
        if channel.allow_source_based_resolving and group_data.is_resolve_signal:
            try:
                return self.filter(**search_params, resolved=True).latest(), False
            except self.model.DoesNotExist:
                pass
        # Create a new group if we couldn't group it to any existing ones
        try:
            alert_group = self.create(
                **search_params,
                is_open_for_grouping=True,
                web_title_cache=group_data.web_title_cache,
                received_at=received_at,
            )
            alert_group_created_signal.send(sender=self.__class__, alert_group=alert_group)
            return (alert_group, True)
        except IntegrityError:
            # A concurrent request won the race and created the open group first;
            # fetch that one instead. Re-raise if it still cannot be found.
            try:
                return self.get(**search_params, is_open_for_grouping__isnull=False), False
            except self.model.DoesNotExist:
                pass
            raise

    def filter_active(self, *args, **kwargs):
        # filter alert groups with active escalation
        return super().filter(
            *args,
            ~Q(silenced=True, silenced_until__isnull=True),  # filter silenced forever alert_groups
            **kwargs,
            maintenance_uuid__isnull=True,
            is_escalation_finished=False,
            resolved=False,
            acknowledged=False,
            root_alert_group=None,
        )
class AlertGroupSlackRenderingMixin:
    """
    Ideally this mixin should not exist. Instead of this instance of AlertGroupSlackRenderer should be created and used
    but slack rendering is distributed throughout the codebase.
    """

    @cached_property
    def slack_renderer(self):
        # Cached per instance so repeated renders reuse the same renderer object.
        return AlertGroupSlackRenderer(self)

    def render_slack_attachments(self):
        return self.slack_renderer.render_alert_group_attachments()

    def render_slack_blocks(self):
        return self.slack_renderer.render_alert_group_blocks()

    @property
    def slack_templated_first_alert(self):
        # Templated alert from the renderer's alert_renderer — presumably the
        # group's first alert; confirm against AlertGroupSlackRenderer.
        return self.slack_renderer.alert_renderer.templated_alert
class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.Model):
# Typing-only declarations for FK attributes and reverse relations;
# the actual descriptors are created by Django at runtime.
acknowledged_by_user: typing.Optional["User"]
alerts: "RelatedManager['Alert']"
bundled_notifications: "RelatedManager['BundledNotification']"
channel: "AlertReceiveChannel"
dependent_alert_groups: "RelatedManager['AlertGroup']"
invitations: "RelatedManager['Invitation']"
labels: "RelatedManager['AlertGroupAssociatedLabel']"
log_records: "RelatedManager['AlertGroupLogRecord']"
personal_log_records: "RelatedManager['UserNotificationPolicyLogRecord']"
related_incidents: "RelatedManager['RelatedIncident']"
resolution_notes: "RelatedManager['ResolutionNote']"
resolution_note_slack_messages: "RelatedManager['ResolutionNoteSlackMessage']"
resolved_by_user: typing.Optional["User"]
root_alert_group: typing.Optional["AlertGroup"]
silenced_by_user: typing.Optional["User"]
slack_messages: "RelatedManager['SlackMessage']"
users: "RelatedManager['User']"

objects: models.Manager["AlertGroup"] = AlertGroupQuerySet.as_manager()
# Lifecycle states; see the `status` property for the precedence rules.
(
    NEW,
    ACKNOWLEDGED,
    RESOLVED,
    SILENCED,
) = range(4)
# exists for status filter in API
STATUS_CHOICES = ((NEW, "New"), (ACKNOWLEDGED, "Acknowledged"), (RESOLVED, "Resolved"), (SILENCED, "Silenced"))

# Grouping payload passed into AlertGroupQuerySet.get_or_create_grouping().
GroupData = namedtuple(
    "GroupData", ["is_resolve_signal", "group_distinction", "web_title_cache", "is_acknowledge_signal"]
)

# Who/what performed a resolve or acknowledge action (used by resolved_by / acknowledged_by).
SOURCE, USER, NOT_YET, LAST_STEP, ARCHIVED, WIPED, DISABLE_MAINTENANCE, NOT_YET_STOP_AUTORESOLVE = range(8)
SOURCE_CHOICES = (
    (SOURCE, "source"),
    (USER, "user"),
    (NOT_YET, "not yet"),
    (LAST_STEP, "last escalation step"),
    (ARCHIVED, "archived"),  # deprecated. don't use
    (WIPED, "wiped"),
    (DISABLE_MAINTENANCE, "stop maintenance"),
    (NOT_YET_STOP_AUTORESOLVE, "not yet, autoresolve disabled"),
)

# Action identifiers grouped in BULK_ACTIONS for bulk operations.
ACKNOWLEDGE = "acknowledge"
RESOLVE = "resolve"
SILENCE = "silence"
RESTART = "restart"
BULK_ACTIONS = [
    ACKNOWLEDGE,
    RESOLVE,
    SILENCE,
    RESTART,
]
# Public-facing identifier ("I"-prefixed, see generate_public_primary_key_for_alert_group).
public_primary_key = models.CharField(
    max_length=20,
    validators=[MinLengthValidator(settings.PUBLIC_PRIMARY_KEY_MIN_LENGTH + 1)],
    unique=True,
    default=generate_public_primary_key_for_alert_group,
)

# Integration this group's alerts came from.
channel = models.ForeignKey(
    "alerts.AlertReceiveChannel",
    on_delete=models.CASCADE,
    related_name="alert_groups",
)

# Distinction is a difference between groups inside the same channel.
# For example different types of alerts from the same channel should go to different groups.
# Distinction is what describes their difference.
distinction = models.CharField(max_length=100, null=True, default=None, db_index=True)

# Cached title rendered for the web UI (may be null until rendered).
web_title_cache = models.TextField(null=True, default=None)

# Per-organization sequence number assigned in AlertGroupQuerySet.create().
inside_organization_number = models.IntegerField(default=0)

channel_filter = models.ForeignKey(
    "alerts.ChannelFilter",
    on_delete=models.SET_DEFAULT,
    related_name="alert_groups",
    null=True,
    default=None,
)

resolved = models.BooleanField(default=False)
# How the group was resolved — one of SOURCE_CHOICES.
resolved_by = models.IntegerField(choices=SOURCE_CHOICES, default=NOT_YET)
resolved_by_user = models.ForeignKey(
    "user_management.User",
    on_delete=models.SET_NULL,
    null=True,
    default=None,
    related_name="resolved_alert_groups",
)
resolved_by_alert = models.ForeignKey(
    "alerts.Alert",
    on_delete=models.DO_NOTHING,
    db_constraint=False,
    null=True,
    default=None,
    related_name="resolved_alert_groups",
)
"""
⚠️ This field is no longer being set/read anywhere, DON'T USE IT! ⚠️
TODO: We still need to figure out how to remove it safely.
See [this conversation](https://raintank-corp.slack.com/archives/C07RGREUH4Z/p1728494111646319) for more context
"""
resolved_at = models.DateTimeField(blank=True, null=True)
acknowledged = models.BooleanField(default=False)
acknowledged_on_source = models.BooleanField(default=False)
acknowledged_at = models.DateTimeField(blank=True, null=True)
# How the group was acknowledged — one of SOURCE_CHOICES.
acknowledged_by = models.IntegerField(choices=SOURCE_CHOICES, default=NOT_YET)
acknowledged_by_user = models.ForeignKey(
    "user_management.User",
    on_delete=models.SET_NULL,
    null=True,
    default=None,
    related_name="acknowledged_alert_groups",
)
acknowledged_by_confirmed = models.DateTimeField(null=True, default=None)
is_escalation_finished = models.BooleanField(default=False)
started_at = models.DateTimeField(auto_now_add=True, db_index=True)
slack_message_sent = models.BooleanField(default=False)
active_escalation_id = models.CharField(max_length=100, null=True, default=None)  # ID generated by celery
active_resolve_calculation_id = models.CharField(max_length=100, null=True, default=None)  # ID generated by celery
# (seconds, label) choices for a timed silence; -1 means silence with no expiry.
SILENCE_DELAY_OPTIONS = (
    (1800, "30 minutes"),
    (3600, "1 hour"),
    (7200, "2 hours"),
    (10800, "3 hours"),
    (14400, "4 hours"),
    (21600, "6 hours"),
    (43200, "12 hours"),
    (57600, "16 hours"),
    (72000, "20 hours"),
    (86400, "24 hours"),
    (-1, "Forever"),
)
silenced = models.BooleanField(default=False)
silenced_at = models.DateTimeField(null=True)
silenced_by_user = models.ForeignKey(
    "user_management.User",
    on_delete=models.SET_NULL,
    null=True,
    default=None,
    related_name="silenced_alert_groups",
)
# When the silence expires; None while silenced means "silenced forever"
# (see is_silenced_forever / is_silenced_for_period).
silenced_until = models.DateTimeField(blank=True, null=True)
# Celery task id of the scheduled unsilence — presumably the unsilence_task imported above; confirm.
unsilence_task_uuid = models.CharField(max_length=100, null=True, default=None)
restarted_at = models.DateTimeField(blank=True, null=True, default=None)
# Time from started_at to the first meaningful action (computed in _get_response_time).
response_time = models.DurationField(null=True, default=None)
# When the originating alert was received, if the caller provided it.
received_at = models.DateTimeField(blank=True, null=True, default=None)
@property
def is_silenced_forever(self):
    """True when the group is silenced with no automatic un-silence time set."""
    if not self.silenced:
        return False
    return self.silenced_until is None
@property
def is_silenced_for_period(self):
    """True when the group is silenced with a scheduled un-silence time."""
    if not self.silenced:
        return False
    return self.silenced_until is not None
@property
def status(self) -> int:
    """Current lifecycle state: resolved wins over acknowledged, which wins over silenced."""
    for flag, state in (
        (self.resolved, AlertGroup.RESOLVED),
        (self.acknowledged, AlertGroup.ACKNOWLEDGED),
        (self.silenced, AlertGroup.SILENCED),
    ):
        if flag:
            return state
    return AlertGroup.NEW
# Reasons escalation may be skipped (see skip_escalation_in_slack for the Slack subset).
(
    ACCOUNT_INACTIVE,
    CHANNEL_ARCHIVED,
    NO_REASON,
    RATE_LIMITED,
    CHANNEL_NOT_SPECIFIED,
    RESTRICTED_ACTION,
    INVALID_AUTH,
) = range(7)
REASONS_TO_SKIP_ESCALATIONS = (
    (ACCOUNT_INACTIVE, "account_inactive"),
    (CHANNEL_ARCHIVED, "is_archived"),
    (NO_REASON, "no_reason"),
    (RATE_LIMITED, "rate_limited"),
    (CHANNEL_NOT_SPECIFIED, "channel_not_specified"),
    (RESTRICTED_ACTION, "restricted_action"),
    (INVALID_AUTH, "invalid_auth"),
)
reason_to_skip_escalation = models.IntegerField(choices=REASONS_TO_SKIP_ESCALATIONS, default=NO_REASON)

# Root/dependent relationship: dependent groups mirror their root's state changes
# (see the cascading loops in the acknowledge/resolve methods below).
root_alert_group = models.ForeignKey(
    "alerts.AlertGroup",
    on_delete=models.SET_NULL,
    null=True,
    default=None,
    related_name="dependent_alert_groups",
)

last_unique_unacknowledge_process_id = models.CharField(max_length=100, null=True, default=None)
wiped_at = models.DateTimeField(null=True, default=None)
wiped_by = models.ForeignKey(
    "user_management.User",
    on_delete=models.SET_NULL,
    null=True,
    default=None,
    related_name="wiped_alert_groups",
)
prevent_posting_alerts = models.BooleanField(default=False)
# Set when this group tracks an integration maintenance window (see is_maintenance_incident).
maintenance_uuid = models.CharField(max_length=100, unique=True, null=True, default=None)
raw_escalation_snapshot = JSONField(null=True, default=None)
# This field is used for constraints so we can use get_or_create() in concurrent calls
# https://docs.djangoproject.com/en/3.2/ref/models/querysets/#get-or-create
# Combined with unique_together below, it allows only one alert group with
# the combination (alert_receive_channel_id, channel_filter_id, distinction, is_open_for_grouping=True)
# If is_open_for_grouping=None, then we can have as many combinations of
# (alert_receive_channel_id, channel_filter_id, distinction, is_open_for_grouping=None) as we want
# We just don't care about that because we'll use only get_or_create(...is_open_for_grouping=True...)
# https://code.djangoproject.com/ticket/28545
is_open_for_grouping = models.BooleanField(default=None, null=True, blank=True)
grafana_incident_id = models.CharField(max_length=100, null=True, default=None)
@staticmethod
def get_silenced_state_filter():
    """
    Return a Q object matching alert groups in the SILENCED state.

    models.Value(0/1) is used instead of True/False because django translates that into
    WHERE bool_field=0/1 instead of WHERE bool_field/NOT bool_field
    which works much faster in mysql
    """
    return Q(silenced=models.Value("1")) & Q(acknowledged=models.Value("0")) & Q(resolved=models.Value("0"))
@staticmethod
def get_new_state_filter():
    """
    Return a Q object matching alert groups in the NEW state.

    models.Value(0/1) is used instead of True/False because django translates that into
    WHERE bool_field=0/1 instead of WHERE bool_field/NOT bool_field
    which works much faster in mysql
    """
    return Q(silenced=models.Value("0")) & Q(acknowledged=models.Value("0")) & Q(resolved=models.Value("0"))
@staticmethod
def get_acknowledged_state_filter():
    """
    Return a Q object matching alert groups in the ACKNOWLEDGED state.

    models.Value(0/1) is used instead of True/False because django translates that into
    WHERE bool_field=0/1 instead of WHERE bool_field/NOT bool_field
    which works much faster in mysql
    """
    return Q(acknowledged=models.Value("1")) & Q(resolved=models.Value("0"))
@staticmethod
def get_resolved_state_filter():
    """
    Return a Q object matching alert groups in the RESOLVED state.

    models.Value(0/1) is used instead of True/False because django translates that into
    WHERE bool_field=0/1 instead of WHERE bool_field/NOT bool_field
    which works much faster in mysql
    """
    return Q(resolved=models.Value("1"))
class Meta:
    get_latest_by = "pk"
    # Together with the is_open_for_grouping sentinel field this guarantees at most
    # one *open* group per (channel, filter, distinction) combination.
    unique_together = [
        "channel_id",
        "channel_filter_id",
        "distinction",
        "is_open_for_grouping",
    ]
    # Composite index backing the alert-group list filters.
    indexes = [
        models.Index(
            fields=["channel_id", "resolved", "acknowledged", "silenced", "root_alert_group_id", "started_at"],
            name="alert_group_list_index",
        ),
    ]
def __str__(self):
    """Primary key plus the cached web title, for logs and admin displays."""
    title = self.web_title_cache
    return f"{self.pk}: {title}"
@property
def is_maintenance_incident(self):
    """Whether this group tracks a maintenance window (maintenance_uuid set)."""
    if self.maintenance_uuid is None:
        return False
    return True
def stop_maintenance(self, user: User) -> None:
    """End the maintenance tracked by this group on behalf of ``user``."""
    from apps.alerts.models import AlertReceiveChannel

    try:
        # Prefer disabling maintenance on the integration that owns it.
        integration_under_maintenance = AlertReceiveChannel.objects.get(maintenance_uuid=self.maintenance_uuid)
        integration_under_maintenance.force_disable_maintenance(user)
        return
    except AlertReceiveChannel.DoesNotExist:
        pass
    # No matching integration found: resolve this maintenance group directly.
    self.resolve_by_disable_maintenance()
@property
def skip_escalation_in_slack(self):
    """True when the recorded skip-reason means Slack escalation must not run."""
    slack_skip_reasons = {
        AlertGroup.CHANNEL_ARCHIVED,
        AlertGroup.ACCOUNT_INACTIVE,
        AlertGroup.RATE_LIMITED,
        AlertGroup.CHANNEL_NOT_SPECIFIED,
        AlertGroup.RESTRICTED_ACTION,
    }
    return self.reason_to_skip_escalation in slack_skip_reasons
def is_alert_a_resolve_signal(self, alert):
    # NOTE(review): intentionally unimplemented here — presumably provided by a
    # subclass or patched elsewhere; confirm before calling on a plain AlertGroup.
    raise NotImplementedError
@property
def slack_permalink(self) -> typing.Optional[str]:
    """Permalink of the group's Slack message, or None when none was posted."""
    if self.slack_message is None:
        return None
    return self.slack_message.permalink
@property
def slack_app_link(self) -> typing.Optional[str]:
    """Slack deep link of the group's Slack message, or None when none was posted."""
    if self.slack_message is None:
        return None
    return self.slack_message.deep_link
@property
def telegram_permalink(self) -> typing.Optional[str]:
    """Link to the main Telegram group-chat message for this alert group, if any."""
    from apps.telegram.models.message import TelegramMessage

    try:
        # prefetched_telegram_messages could be set in apps.api.serializers.alert_group.AlertGroupListSerializer
        prefetched = self.prefetched_telegram_messages
        main_message = prefetched[0] if prefetched else None
    except AttributeError:
        # No prefetch available — query for the earliest group-chat alert message.
        main_message = (
            self.telegram_messages.filter(
                chat_id__startswith="-", message_type=TelegramMessage.ALERT_GROUP_MESSAGE
            )
            .order_by("id")
            .first()
        )
    return main_message.link if main_message else None
@property
def permalinks(self) -> Permalinks:
    """Collect every known permalink for this alert group into one payload."""
    links: Permalinks = {
        "slack": self.slack_permalink,
        "slack_app": self.slack_app_link,
        "telegram": self.telegram_permalink,
        "web": self.web_link,
    }
    return links
@property
def web_link(self) -> str:
    """URL of this alert group's detail page in the OnCall web UI."""
    builder = UIURLBuilder(self.channel.organization)
    return builder.alert_group_detail(self.public_primary_key)
@property
def declare_incident_link(self) -> str:
    """
    Generate a link for AlertGroup to declare Grafana Incident by click
    """
    caption = urllib.parse.quote_plus("OnCall Alert Group")
    raw_title = self.web_title_cache if self.web_title_cache else DEFAULT_BACKUP_TITLE
    # Truncate BEFORE percent-encoding: cutting the encoded string could split a
    # %XX escape in half and produce an invalid URL. This also fixes the fallback
    # DEFAULT_BACKUP_TITLE previously being embedded without encoding.
    raw_title = raw_title[:2000]  # cap title length to avoid exceptions with too long declare incident links
    title = urllib.parse.quote_plus(raw_title)
    link = urllib.parse.quote_plus(self.web_link)
    return UIURLBuilder(self.channel.organization).declare_incident(f"?caption={caption}&url={link}&title={title}")
@property
def happened_while_maintenance(self):
    """True when this group is attached to a root group that tracks a maintenance window."""
    root = self.root_alert_group
    return root is not None and root.maintenance_uuid is not None
def get_paged_users(self) -> typing.List[PagedUser]:
    """Return the users currently paged for this alert group.

    Replays TYPE_DIRECT_PAGING / TYPE_UNPAGE_USER log records in chronological
    order: a page adds the user, an unpage removes them. Only users still paged
    at the end are returned.
    """
    from apps.alerts.models import AlertGroupLogRecord

    users: typing.Dict[str, PagedUser] = {}
    organization = self.channel.organization

    log_records = self.log_records.filter(
        type__in=(AlertGroupLogRecord.TYPE_DIRECT_PAGING, AlertGroupLogRecord.TYPE_UNPAGE_USER)
    ).order_by("created_at")

    # First pass: collect ids of users that remain paged, so they can be fetched
    # in one query instead of one query per log record. (The previous version also
    # read the unused "important" flag here — dead code, removed.)
    user_ids: typing.Set[str] = set()
    for log_record in log_records:
        info = log_record.get_step_specific_info()
        user_id = info.get("user") if info else None
        if user_id is None:
            continue
        if log_record.type == AlertGroupLogRecord.TYPE_DIRECT_PAGING:
            user_ids.add(user_id)
        else:
            user_ids.discard(user_id)

    user_map = {u.public_primary_key: u for u in User.objects.filter(public_primary_key__in=user_ids)}

    # Second pass: replay the records to build the final payload.
    for log_record in log_records:
        info = log_record.get_step_specific_info()
        user_id = info.get("user") if info else None
        important = info.get("important") if info else False
        if user_id is None or (user := user_map.get(user_id)) is None:
            continue
        if log_record.type == AlertGroupLogRecord.TYPE_DIRECT_PAGING:
            # add the user
            users[user_id] = {
                "id": user.pk,
                "pk": user.public_primary_key,
                "name": user.name,
                "username": user.username,
                "avatar": user.avatar_url,
                "avatar_full": user.avatar_full_url(organization),
                "important": important,
                "teams": [{"pk": t.public_primary_key, "name": t.name} for t in user.teams.all()],
            }
        else:
            # user was unpaged at some point, remove them
            # there could be multiple unpage log records if API was hit several times
            users.pop(user_id, None)
    return list(users.values())
def _get_response_time(self):
    """Return response_time based on current alert group status."""
    # First meaningful action wins, whichever of ack/resolve/silence/wipe came first.
    candidates = [
        ts
        for ts in (self.acknowledged_at, self.resolved_at, self.silenced_at, self.wiped_at)
        if ts
    ]
    if not candidates:
        return None
    return min(candidates) - self.started_at
def _update_metrics(self, organization_id, previous_state, state):
    """Update metrics cache for response time and state as needed."""
    # Fire-and-forget: the heavy lifting happens in the celery task.
    update_metrics_for_alert_group.apply_async((self.id, organization_id, previous_state, state))
def update_state_by_backsync(self, new_state: AlertGroupState, source_channel: "AlertReceiveChannel") -> None:
    """Apply a state change reported back by the source integration (backsync)."""
    if self.state == new_state:
        return
    logger.debug(f"Update state {self.state} -> {new_state} for alert_group {self.pk}")
    kwargs = {
        "source_channel": source_channel,
        "action_source": ActionSource.BACKSYNC,
    }
    if new_state == AlertGroupState.FIRING:
        # Returning to FIRING: undo whatever state the group is currently in.
        undo_by_current_state = {
            AlertGroupState.ACKNOWLEDGED: self.un_acknowledge_by_user_or_backsync,
            AlertGroupState.RESOLVED: self.un_resolve_by_user_or_backsync,
            AlertGroupState.SILENCED: self.un_silence_by_user_or_backsync,
        }
        handler = undo_by_current_state.get(self.state)
    else:
        apply_by_new_state = {
            AlertGroupState.ACKNOWLEDGED: self.acknowledge_by_user_or_backsync,
            AlertGroupState.RESOLVED: self.resolve_by_user_or_backsync,
            AlertGroupState.SILENCED: self.silence_by_user_or_backsync,
        }
        handler = apply_by_new_state.get(new_state)
    if handler is not None:
        handler(**kwargs)
def acknowledge_by_user_or_backsync(
    self,
    user: typing.Optional[User] = None,
    source_channel: typing.Optional["AlertReceiveChannel"] = None,
    action_source: typing.Optional[ActionSource] = None,
) -> None:
    """Acknowledge this alert group (and its dependents).

    When ``user`` is None the action is attributed to the source integration
    (backsync). Un-silences / un-resolves first if needed, stops escalation,
    writes log records, and refreshes metric caches.
    """
    from apps.alerts.models import AlertGroupLogRecord

    initial_state = self.state
    reason = "Acknowledge button" if user else "Backsync signal"
    acknowledged_by = AlertGroup.USER if user else AlertGroup.SOURCE
    step_specific_info = (
        {"source_integration_name": source_channel.verbal_name} if action_source == ActionSource.BACKSYNC else None
    )
    organization_id = user.organization_id if user else self.channel.organization_id
    logger.debug(f"Started acknowledge_by_user_or_backsync for alert_group {self.pk}")
    # if incident was silenced or resolved, unsilence/unresolve it without starting escalation
    if self.silenced:
        self.un_silence()
        self.log_records.create(
            type=AlertGroupLogRecord.TYPE_UN_SILENCE,
            author=user,
            silence_delay=None,
            reason=reason,
            action_source=action_source,
            step_specific_info=step_specific_info,
        )
    if self.resolved:
        self.unresolve()
        self.log_records.create(
            type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
            author=user,
            reason=reason,
            action_source=action_source,
            step_specific_info=step_specific_info,
        )
    self.acknowledge(acknowledged_by_user=user, acknowledged_by=acknowledged_by)
    # Update alert group state and response time metrics cache
    self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)
    self.stop_escalation()
    if user:  # ack reminder works only for actions performed by user
        self.start_ack_reminder_if_needed()
    with transaction.atomic():
        log_record = self.log_records.create(
            type=AlertGroupLogRecord.TYPE_ACK,
            author=user,
            action_source=action_source,
            step_specific_info=step_specific_info,
        )
        logger.debug(
            f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
            f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
        )
        # Defer the signal until commit so the task can see the new log record.
        transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    # Cascade the acknowledge to attached (dependent) alert groups.
    for dependent_alert_group in self.dependent_alert_groups.all():
        dependent_alert_group.acknowledge_by_user_or_backsync(
            user, source_channel=source_channel, action_source=action_source
        )
    logger.debug(f"Finished acknowledge_by_user_or_backsync for alert_group {self.pk}")
def acknowledge_by_source(self):
    """Acknowledge this alert group (and its dependents), attributed to the source integration."""
    from apps.alerts.models import AlertGroupLogRecord

    initial_state = self.state
    # if incident was silenced, unsilence it without starting escalation
    if self.silenced:
        self.un_silence()
        self.log_records.create(
            type=AlertGroupLogRecord.TYPE_UN_SILENCE,
            silence_delay=None,
            reason="Acknowledge by source",
        )
    self.acknowledge(acknowledged_by=AlertGroup.SOURCE)
    # Update alert group state and response time metrics cache
    self._update_metrics(
        organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
    )
    self.stop_escalation()
    with transaction.atomic():
        log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK)
        logger.debug(
            f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
            f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert"
        )
        # Defer the signal until commit so the task can see the new log record.
        transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    # Cascade the acknowledge to attached (dependent) alert groups.
    for dependent_alert_group in self.dependent_alert_groups.all():
        dependent_alert_group.acknowledge_by_source()
def un_acknowledge_by_user_or_backsync(
    self,
    user: typing.Optional[User] = None,
    source_channel: typing.Optional["AlertReceiveChannel"] = None,
    action_source: typing.Optional[ActionSource] = None,
) -> None:
    """Revert an acknowledge on this alert group (and its dependents).

    When ``user`` is None the action is attributed to a backsync signal.
    Restarts escalation for root groups.
    """
    from apps.alerts.models import AlertGroupLogRecord

    initial_state = self.state
    step_specific_info = (
        {"source_integration_name": source_channel.verbal_name} if action_source == ActionSource.BACKSYNC else None
    )
    organization_id = user.organization_id if user else self.channel.organization_id
    logger.debug(f"Started un_acknowledge_by_user_or_backsync for alert_group {self.pk}")
    self.unacknowledge()
    # Update alert group state metric cache
    self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)
    # Only root groups escalate; dependents follow their root.
    if self.is_root_alert_group:
        self.start_escalation_if_needed()
    with transaction.atomic():
        log_record = self.log_records.create(
            type=AlertGroupLogRecord.TYPE_UN_ACK,
            author=user,
            action_source=action_source,
            step_specific_info=step_specific_info,
        )
        logger.debug(
            f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
            f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
        )
        # Defer the signal until commit so the task can see the new log record.
        transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    # Cascade the un-acknowledge to attached (dependent) alert groups.
    for dependent_alert_group in self.dependent_alert_groups.all():
        dependent_alert_group.un_acknowledge_by_user_or_backsync(
            user, source_channel=source_channel, action_source=action_source
        )
    logger.debug(f"Finished un_acknowledge_by_user_or_backsync for alert_group {self.pk}")
def resolve_by_user_or_backsync(
    self,
    user: typing.Optional[User] = None,
    source_channel: typing.Optional["AlertReceiveChannel"] = None,
    action_source: typing.Optional[ActionSource] = None,
) -> None:
    """Resolve this alert group (and its dependents).

    When ``user`` is None the action is attributed to the source integration
    (backsync). Un-silences first if needed, stops escalation, writes log
    records, and refreshes metric caches.
    """
    from apps.alerts.models import AlertGroupLogRecord

    initial_state = self.state
    reason = "Resolve button" if user else "Backsync signal"
    resolved_by = AlertGroup.USER if user else AlertGroup.SOURCE
    step_specific_info = (
        {"source_integration_name": source_channel.verbal_name} if action_source == ActionSource.BACKSYNC else None
    )
    organization_id = user.organization_id if user else self.channel.organization_id
    # if incident was silenced, unsilence it without starting escalation
    if self.silenced:
        self.un_silence()
        self.log_records.create(
            type=AlertGroupLogRecord.TYPE_UN_SILENCE,
            author=user,
            silence_delay=None,
            reason=reason,
            action_source=action_source,
            step_specific_info=step_specific_info,
        )
    self.resolve(resolved_by=resolved_by, resolved_by_user=user)
    # Update alert group state and response time metrics cache
    self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)
    self.stop_escalation()
    with transaction.atomic():
        log_record = self.log_records.create(
            type=AlertGroupLogRecord.TYPE_RESOLVED,
            author=user,
            action_source=action_source,
            step_specific_info=step_specific_info,
        )
        logger.debug(
            f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
            f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
        )
        # Defer the signal until commit so the task can see the new log record.
        transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    # Cascade the resolve to attached (dependent) alert groups.
    for dependent_alert_group in self.dependent_alert_groups.all():
        dependent_alert_group.resolve_by_user_or_backsync(
            user, source_channel=source_channel, action_source=action_source
        )
def resolve_by_source(self):
    """Resolve this alert group (and its dependents), attributed to the source integration."""
    from apps.alerts.models import AlertGroupLogRecord

    initial_state = self.state
    # if incident was silenced, unsilence it without starting escalation
    if self.silenced:
        self.un_silence()
        self.log_records.create(
            type=AlertGroupLogRecord.TYPE_UN_SILENCE,
            silence_delay=None,
            reason="Resolve by source",
        )
    self.resolve(resolved_by=AlertGroup.SOURCE)
    # Update alert group state and response time metrics cache
    self._update_metrics(
        organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
    )
    self.stop_escalation()
    with transaction.atomic():
        log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
        logger.debug(
            f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
            f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert"
        )
        # Defer the signal until commit so the task can see the new log record.
        transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    # Cascade the resolve to attached (dependent) alert groups.
    for dependent_alert_group in self.dependent_alert_groups.all():
        dependent_alert_group.resolve_by_source()
def resolve_by_last_step(self):
    """Resolve this alert group (and its dependents) because the final escalation step ran."""
    from apps.alerts.models import AlertGroupLogRecord

    initial_state = self.state
    self.resolve(resolved_by=AlertGroup.LAST_STEP)
    # Update alert group state and response time metrics cache
    self._update_metrics(
        organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
    )
    self.stop_escalation()
    with transaction.atomic():
        log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
        logger.debug(
            f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
            f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step"
        )
        # Defer the signal until commit so the task can see the new log record.
        transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    # Cascade the resolve to attached (dependent) alert groups.
    for dependent_alert_group in self.dependent_alert_groups.all():
        dependent_alert_group.resolve_by_last_step()
def resolve_by_disable_maintenance(self):
    """Resolve this alert group (and its dependents) because maintenance was disabled.

    Note: unlike the other resolve paths, no metrics update is performed here.
    """
    from apps.alerts.models import AlertGroupLogRecord

    self.resolve(resolved_by=AlertGroup.DISABLE_MAINTENANCE)
    self.stop_escalation()
    with transaction.atomic():
        log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
        logger.debug(
            f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
            f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
            f"action source: disable maintenance"
        )
        # Defer the signal until commit so the task can see the new log record.
        transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    # Cascade the resolve to attached (dependent) alert groups.
    for dependent_alert_group in self.dependent_alert_groups.all():
        dependent_alert_group.resolve_by_disable_maintenance()
def un_resolve_by_user_or_backsync(
    self,
    user: typing.Optional[User] = None,
    source_channel: typing.Optional["AlertReceiveChannel"] = None,
    action_source: typing.Optional[ActionSource] = None,
) -> None:
    """Revert a resolve on this alert group (and its dependents).

    No-op for wiped alert groups. When ``user`` is None the action is
    attributed to a backsync signal. Restarts escalation for root groups.
    """
    from apps.alerts.models import AlertGroupLogRecord

    # Wiped groups stay resolved — nothing to undo.
    if self.wiped_at is None:
        initial_state = self.state
        step_specific_info = (
            {"source_integration_name": source_channel.verbal_name}
            if action_source == ActionSource.BACKSYNC
            else None
        )
        organization_id = user.organization_id if user else self.channel.organization_id
        self.unresolve()
        # Update alert group state metric cache
        self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)
        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
            # Only root groups escalate; dependents follow their root.
            if self.is_root_alert_group:
                self.start_escalation_if_needed()
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: {action_source}"
            )
            # Defer the signal until commit so the task can see the new log record.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
        # Cascade the un-resolve to attached (dependent) alert groups.
        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.un_resolve_by_user_or_backsync(
                user, source_channel=source_channel, action_source=action_source
            )
    def attach_by_user(
        self, user: User, root_alert_group: "AlertGroup", action_source: typing.Optional[ActionSource] = None
    ) -> None:
        """Attach this alert group to ``root_alert_group`` as a dependent.

        The attachment only succeeds when the target is itself a root group and
        is not resolved; otherwise a TYPE_FAILED_ATTACHMENT log record is
        created. On success the dependent's ack/silence state is synced with
        the root's and a log record is written on both alert groups.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # The target must itself be a root group (no chains) and still unresolved.
        if root_alert_group.root_alert_group is None and not root_alert_group.resolved:
            self.root_alert_group = root_alert_group
            self.save(update_fields=["root_alert_group"])
            # Dependents do not escalate on their own.
            self.stop_escalation()
            # Mirror the root's acknowledge state on the newly attached dependent.
            if root_alert_group.acknowledged and not self.acknowledged:
                self.acknowledge_by_user_or_backsync(user, action_source=action_source)
            elif not root_alert_group.acknowledged and self.acknowledged:
                self.un_acknowledge_by_user_or_backsync(user, action_source=action_source)
            # Mirror the root's silence state as well.
            if root_alert_group.silenced and not self.silenced:
                self.silence_by_user_or_backsync(user, action_source=action_source, silence_delay=None)
            if not root_alert_group.silenced and self.silenced:
                self.un_silence_by_user_or_backsync(user, action_source=action_source)
            with transaction.atomic():
                # Log the attachment on the dependent (this) alert group...
                log_record = self.log_records.create(
                    type=AlertGroupLogRecord.TYPE_ATTACHED,
                    author=user,
                    root_alert_group=root_alert_group,
                    reason="Attach dropdown",
                    action_source=action_source,
                )
                logger.debug(
                    f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                    f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                    f"action source: {action_source}"
                )
                transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
                # ...and on the root alert group, each with its own deferred signal.
                log_record_for_root_incident = root_alert_group.log_records.create(
                    type=AlertGroupLogRecord.TYPE_ATTACHED,
                    author=user,
                    dependent_alert_group=self,
                    reason="Attach dropdown",
                    action_source=action_source,
                )
                logger.debug(
                    f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, "
                    f"log record {log_record_for_root_incident.pk} with type "
                    f"'{log_record_for_root_incident.get_type_display()}', action source: {action_source}"
                )
                transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk))
        else:
            # Target is not attachable: record the failure on this alert group only.
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_FAILED_ATTACHMENT,
                author=user,
                root_alert_group=root_alert_group,
                reason="Failed to attach dropdown",
                action_source=action_source,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: {action_source}"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    def un_attach_by_user(self, user: User, action_source: typing.Optional[ActionSource] = None) -> None:
        """Detach this alert group from its root on behalf of ``user``.

        Writes an unattached log record on both this group and its former
        root, and restarts escalation for this (now root) group if needed.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # Keep a reference to the former root for logging before clearing the FK.
        root_alert_group: AlertGroup = self.root_alert_group
        self.root_alert_group = None
        self.save(update_fields=["root_alert_group"])
        # Once detached, this group is a root again and may need to escalate.
        self.start_escalation_if_needed()
        with transaction.atomic():
            # Log the detachment on this alert group...
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UNATTACHED,
                author=user,
                root_alert_group=root_alert_group,
                reason="Unattach button",
                action_source=action_source,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: {action_source}"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
            # ...and on the former root alert group.
            log_record_for_root_incident = root_alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UNATTACHED,
                author=user,
                dependent_alert_group=self,
                reason="Unattach dropdown",
                action_source=action_source,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, "
                f"log record {log_record_for_root_incident.pk} "
                f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk))
    def un_attach_by_delete(self) -> None:
        """Detach this alert group because its root alert group is being deleted."""
        from apps.alerts.models import AlertGroupLogRecord
        self.root_alert_group = None
        self.save(update_fields=["root_alert_group"])
        # Once detached, this group is a root again and may need to escalate.
        self.start_escalation_if_needed()
        with transaction.atomic():
            # No author: the detachment is a consequence of the root's deletion.
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UNATTACHED,
                reason="Unattach by deleting root incident",
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: delete"
            )
            # Dispatch the signal only after the log record has been committed.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    def silence_by_user_or_backsync(
        self,
        user: typing.Optional[User] = None,
        source_channel: typing.Optional["AlertReceiveChannel"] = None,
        silence_delay: typing.Optional[int] = None,
        action_source: typing.Optional[ActionSource] = None,
    ) -> None:
        """Silence this alert group on behalf of a user or a backsync event.

        silence_delay is the duration in seconds; None or 0 means silence
        forever. Any prior resolved/acknowledged/silenced state is rolled back
        first (with matching log records), then the group is silenced, metrics
        are updated, and the action is cascaded to dependent alert groups.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # Capture the pre-change state for the metrics transition below.
        initial_state = self.state
        reason = "Silence button" if user else "Backsync signal"
        # For backsync actions, remember which integration triggered the change.
        step_specific_info = (
            {"source_integration_name": source_channel.verbal_name} if action_source == ActionSource.BACKSYNC else None
        )
        organization_id = user.organization_id if user else self.channel.organization_id
        # Roll back any resolved state so the log history reads correctly.
        if self.resolved:
            self.unresolve()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
        # Roll back any acknowledged state as well.
        if self.acknowledged:
            self.unacknowledge()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_ACK,
                author=user,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
        # Re-silencing an already silenced group: clear the old silence first.
        if self.silenced:
            self.un_silence()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                author=user,
                silence_delay=None,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
        now = timezone.now()
        if silence_delay is not None and silence_delay > 0:
            # Temporary silence: schedule the automatic unsilence and push the
            # escalation ETA past the silence window.
            silence_delay_timedelta = datetime.timedelta(seconds=silence_delay)
            silenced_until = now + silence_delay_timedelta
            if self.is_root_alert_group:
                self.update_next_step_eta(datetime.timedelta(seconds=silence_delay + START_ESCALATION_DELAY))
                self.start_unsilence_task(countdown=silence_delay)
        else:
            # Permanent silence: no end time, no unsilence task.
            silence_delay_timedelta = None
            silenced_until = None
        self.silence(
            silenced_at=now,
            silenced_until=silenced_until,
            silenced_by_user=user,
            raw_escalation_snapshot=self.raw_escalation_snapshot,
        )
        # Update alert group state and response time metrics cache
        self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)
        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_SILENCE,
                author=user,
                silence_delay=silence_delay_timedelta,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: {action_source}"
            )
            # Dispatch the signal only after the log record has been committed.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
        # Cascade the silence to dependent alert groups.
        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.silence_by_user_or_backsync(user, source_channel, silence_delay, action_source)
    def un_silence_by_user_or_backsync(
        self,
        user: typing.Optional[User] = None,
        source_channel: typing.Optional["AlertReceiveChannel"] = None,
        action_source: typing.Optional[ActionSource] = None,
    ) -> None:
        """Unsilence this alert group on behalf of a user or a backsync event.

        Clears the silence, updates metrics, restarts escalation for root
        groups, logs the action and cascades it to dependent alert groups.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # Capture the pre-change state for the metrics transition below.
        initial_state = self.state
        reason = "Unsilence button" if user else "Backsync signal"
        # For backsync actions, remember which integration triggered the change.
        step_specific_info = (
            {"source_integration_name": source_channel.verbal_name} if action_source == ActionSource.BACKSYNC else None
        )
        organization_id = user.organization_id if user else self.channel.organization_id
        self.un_silence()
        # Update alert group state metric cache
        self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)
        # Only root alert groups escalate; dependents follow their root.
        if self.is_root_alert_group:
            self.start_escalation_if_needed()
        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                author=user,
                silence_delay=None,
                # the silence duration is irrelevant once the silence is lifted
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: {action_source}"
            )
            # Dispatch the signal only after the log record has been committed.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
        # Cascade the unsilence to dependent alert groups.
        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.un_silence_by_user_or_backsync(
                user, source_channel=source_channel, action_source=action_source
            )
    def wipe_by_user(self, user: User) -> None:
        """Wipe this alert group: resolve it and scrub its alert payloads.

        Clears the grouping distinction and cached title, stamps wiped_at/by,
        wipes every alert, logs the action and cascades to dependents.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # Capture the pre-change state for the metrics transition below.
        initial_state = self.state
        # Only resolve on the first wipe; re-wiping an already wiped group
        # must not flip its resolution state again.
        if not self.wiped_at:
            self.resolve(resolved_by=AlertGroup.WIPED)
        self.stop_escalation()
        self.distinction = ""
        self.web_title_cache = None
        self.wiped_at = timezone.now()
        self.wiped_by = user
        update_fields = ["distinction", "web_title_cache", "wiped_at", "wiped_by"]
        if self.response_time is None:
            self.response_time = self._get_response_time()
            update_fields += ["response_time"]
        # Scrub the payload of every alert in the group.
        for alert in self.alerts.all():
            alert.wipe(wiped_by=self.wiped_by, wiped_at=self.wiped_at)
        self.save(update_fields=update_fields)
        # Update alert group state and response time metrics cache
        self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_WIPED,
                author=user,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: wipe"
            )
            # Dispatch the signal only after the log record has been committed.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
        # Cascade the wipe to dependent alert groups.
        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.wipe_by_user(user)
    def delete_by_user(self, user: User):
        """Start deletion of this alert group on behalf of ``user``.

        Creates (or reuses) a TYPE_DELETED log record and defers the actual
        deletion to the send_alert_group_signal_for_delete task after commit.
        """
        from apps.alerts.models import AlertGroupLogRecord
        self.stop_escalation()
        with transaction.atomic():
            # prevent creating multiple logs
            # filter instead of get_or_create cause it can be multiple logs of this type due deleting error
            log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last()
            if not log_record:
                log_record = self.log_records.create(
                    type=AlertGroupLogRecord.TYPE_DELETED,
                    author=user,
                )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"in channel {self.channel.pk}, in org {self.channel.organization.pk}, by user {user.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: delete"
            )
            # The delete-specific task performs the hard delete once committed.
            transaction.on_commit(partial(send_alert_group_signal_for_delete.delay, self.pk, log_record.pk))
def finish_delete_by_user(self):
dependent_alerts = list(self.dependent_alert_groups.all())
self.hard_delete()
# unattach dependent incidents
for dependent_alert_group in dependent_alerts:
dependent_alert_group.un_attach_by_delete()
    def hard_delete(self):
        """Permanently delete this alert group and every record that references it.

        Related objects are deleted explicitly, one relation at a time, before
        the alert group row itself; preserve this order when modifying.
        """
        from apps.alerts.models import ResolutionNote
        alerts = self.alerts.all()
        alerts.delete()
        self.slack_messages.all().delete()
        self.personal_log_records.all().delete()
        self.log_records.all().delete()
        self.invitations.all().delete()
        # Include soft-deleted resolution notes, not just the default manager's set.
        resolution_notes = ResolutionNote.objects_with_deleted.filter(alert_group=self)
        resolution_notes.delete()
        self.resolution_note_slack_messages.all().delete()
        self.delete()
    @staticmethod
    def _bulk_acknowledge(user: User, alert_groups_to_acknowledge: "QuerySet[AlertGroup]") -> None:
        """Acknowledge every alert group in the queryset on behalf of ``user``.

        Performs a single bulk_update for the state change, then creates the
        un-resolve/un-silence/ack log records, updates metrics and schedules
        ack reminders. Signals are dispatched after the transaction commits.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # it is needed to unresolve those alert_groups which were resolved to build proper log.
        alert_groups_to_unresolve_before_acknowledge = alert_groups_to_acknowledge.filter(resolved=models.Value("1"))
        # it is needed to unsilence those alert_groups which were silenced to build proper log.
        alert_groups_to_unsilence_before_acknowledge = alert_groups_to_acknowledge.filter(silenced=models.Value("1"))
        # convert current qs to list to prevent changes by update
        alert_groups_to_acknowledge_list = list(alert_groups_to_acknowledge)
        alert_groups_to_unresolve_before_acknowledge_list = list(alert_groups_to_unresolve_before_acknowledge)
        alert_groups_to_unsilence_before_acknowledge_list = list(alert_groups_to_unsilence_before_acknowledge)
        previous_states = []
        for alert_group in alert_groups_to_acknowledge_list:
            # Remember the state before mutation for the metrics update below.
            previous_states.append(alert_group.state)
            # Acknowledge, and clear any resolved/silenced state in the same pass.
            alert_group.acknowledged = True
            alert_group.resolved = False
            alert_group.resolved_at = None
            alert_group.resolved_by = AlertGroup.NOT_YET
            alert_group.resolved_by_user = None
            alert_group.silenced_until = None
            alert_group.silenced_by_user = None
            alert_group.silenced_at = None
            alert_group.silenced = False
            alert_group.acknowledged_at = timezone.now()
            alert_group.acknowledged_by_user = user
            alert_group.acknowledged_by = AlertGroup.USER
            alert_group.is_escalation_finished = True
            if alert_group.response_time is None:
                alert_group.response_time = alert_group._get_response_time()
        fields_to_update = [
            "acknowledged",
            "resolved",
            "resolved_at",
            "resolved_by",
            "resolved_by_user",
            "silenced_until",
            "silenced_by_user",
            "silenced_at",
            "silenced",
            "acknowledged_at",
            "acknowledged_by_user",
            "acknowledged_by",
            "is_escalation_finished",
            "response_time",
        ]
        # One bulk write for all state changes instead of N individual saves.
        AlertGroup.objects.bulk_update(alert_groups_to_acknowledge_list, fields=fields_to_update, batch_size=100)
        for alert_group in alert_groups_to_unresolve_before_acknowledge_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                reason="Bulk action acknowledge",
            )
        for alert_group in alert_groups_to_unsilence_before_acknowledge_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE, author=user, reason="Bulk action acknowledge"
            )
        for alert_group, previous_state in zip(alert_groups_to_acknowledge_list, previous_states):
            # update metrics cache
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=previous_state,
                state=AlertGroupState.ACKNOWLEDGED,
            )
            alert_group.start_ack_reminder_if_needed()
            log_record = alert_group.log_records.create(type=AlertGroupLogRecord.TYPE_ACK, author=user)
            # Dispatch the signal only after the surrounding transaction commits.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
@staticmethod
def bulk_acknowledge(user: User, alert_groups: "QuerySet[AlertGroup]") -> None:
root_alert_groups_to_acknowledge = alert_groups.filter(
~Q(acknowledged=True, resolved=False), # don't need to ack acknowledged incidents once again
root_alert_group__isnull=True,
maintenance_uuid__isnull=True, # don't ack maintenance incident
)
# Find all dependent alert_groups to update them in one query
# convert qs to list to prevent changes by update
root_alert_group_pks = list(root_alert_groups_to_acknowledge.values_list("pk", flat=True))
dependent_alert_groups_to_acknowledge = AlertGroup.objects.filter(root_alert_group__pk__in=root_alert_group_pks)
with transaction.atomic():
AlertGroup._bulk_acknowledge(user, root_alert_groups_to_acknowledge)
AlertGroup._bulk_acknowledge(user, dependent_alert_groups_to_acknowledge)
    @staticmethod
    def _bulk_resolve(user: User, alert_groups_to_resolve: "QuerySet[AlertGroup]") -> None:
        """Resolve every alert group in the queryset on behalf of ``user``.

        Performs one bulk_update for the state change, then creates the
        un-silence/resolve log records and updates metrics. Signals are
        dispatched after the surrounding transaction commits.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # it is needed to unsilence those alert_groups which were silenced to build proper log.
        alert_groups_to_unsilence_before_resolve = alert_groups_to_resolve.filter(silenced=models.Value("1"))
        # convert current qs to list to prevent changes by update
        alert_groups_to_resolve_list = list(alert_groups_to_resolve)
        alert_groups_to_unsilence_before_resolve_list = list(alert_groups_to_unsilence_before_resolve)
        previous_states = []
        for alert_group in alert_groups_to_resolve_list:
            # Remember the state before mutation for the metrics update below.
            previous_states.append(alert_group.state)
            # Resolve, and clear any silenced state in the same pass.
            alert_group.resolved = True
            alert_group.resolved_at = timezone.now()
            alert_group.is_open_for_grouping = None
            alert_group.resolved_by_user = user
            alert_group.resolved_by = AlertGroup.USER
            alert_group.is_escalation_finished = True
            alert_group.silenced_until = None
            alert_group.silenced_by_user = None
            alert_group.silenced_at = None
            alert_group.silenced = False
            if alert_group.response_time is None:
                alert_group.response_time = alert_group._get_response_time()
        fields_to_update = [
            "resolved",
            "resolved_at",
            "resolved_by",
            "resolved_by_user",
            "is_open_for_grouping",
            "silenced_until",
            "silenced_by_user",
            "silenced_at",
            "silenced",
            "is_escalation_finished",
            "response_time",
        ]
        # One bulk write for all state changes instead of N individual saves.
        AlertGroup.objects.bulk_update(alert_groups_to_resolve_list, fields=fields_to_update, batch_size=100)
        for alert_group in alert_groups_to_unsilence_before_resolve_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE, author=user, reason="Bulk action resolve"
            )
        for alert_group, previous_state in zip(alert_groups_to_resolve_list, previous_states):
            # update metrics cache
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=previous_state,
                state=AlertGroupState.RESOLVED,
            )
            log_record = alert_group.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED, author=user)
            # Dispatch the signal only after the surrounding transaction commits.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    @staticmethod
    def bulk_resolve(user: User, alert_groups: "QuerySet[AlertGroup]") -> None:
        """Resolve the given alert groups (and their dependents) for ``user``.

        Maintenance incidents are stopped instead of resolved. When the
        organization requires resolution notes, only groups that have one
        are resolved.
        """
        # stop maintenance for maintenance incidents
        alert_groups_to_stop_maintenance = alert_groups.filter(resolved=False, maintenance_uuid__isnull=False)
        for alert_group in alert_groups_to_stop_maintenance:
            alert_group.stop_maintenance(user)
        # Roots only: dependents are resolved together with their root below.
        root_alert_groups_to_resolve = alert_groups.filter(
            resolved=False,
            root_alert_group__isnull=True,
            maintenance_uuid__isnull=True,
        )
        if not root_alert_groups_to_resolve.exists():
            return
        # we know this is an AlertGroup because of the .exists() check just above
        first_alert_group: AlertGroup = root_alert_groups_to_resolve.first()
        organization = first_alert_group.channel.organization
        # Enforce the "resolution note required" org setting.
        if organization.is_resolution_note_required:
            root_alert_groups_to_resolve = root_alert_groups_to_resolve.filter(
                Q(resolution_notes__isnull=False, resolution_notes__deleted_at=None)
            )
        # convert qs to list to prevent changes by update
        root_alert_group_pks = list(root_alert_groups_to_resolve.values_list("pk", flat=True))
        dependent_alert_groups_to_resolve = AlertGroup.objects.filter(root_alert_group__pk__in=root_alert_group_pks)
        with transaction.atomic():
            AlertGroup._bulk_resolve(user, root_alert_groups_to_resolve)
            AlertGroup._bulk_resolve(user, dependent_alert_groups_to_resolve)
    @staticmethod
    def _bulk_restart_unack(user: User, alert_groups_to_restart_unack: "QuerySet[AlertGroup]") -> None:
        """Restart (un-acknowledge) every alert group in the queryset.

        Resets ack/resolve/silence state in one UPDATE, then writes the un-ack
        log records, updates metrics and restarts escalation for root groups.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # convert current qs to list to prevent changes by update
        alert_groups_to_restart_unack_list = list(alert_groups_to_restart_unack)
        # Single UPDATE resetting all state flags back to "firing".
        alert_groups_to_restart_unack.update(
            acknowledged=False,
            acknowledged_at=None,
            acknowledged_by_user=None,
            acknowledged_by=AlertGroup.NOT_YET,
            resolved=False,
            resolved_at=None,
            is_open_for_grouping=None,
            resolved_by_user=None,
            resolved_by=AlertGroup.NOT_YET,
            silenced_until=None,
            silenced_by_user=None,
            silenced_at=None,
            silenced=False,
            restarted_at=timezone.now(),
        )
        # unacknowledge alert groups
        for alert_group in alert_groups_to_restart_unack_list:
            # update metrics cache (note alert_group.state is the original alert group's state)
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=alert_group.state,
                state=AlertGroupState.FIRING,
            )
            log_record = alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_ACK,
                author=user,
                reason="Bulk action restart",
            )
            # Only root alert groups escalate; dependents follow their root.
            if alert_group.is_root_alert_group:
                alert_group.start_escalation_if_needed()
            # Dispatch the signal only after the surrounding transaction commits.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    @staticmethod
    def _bulk_restart_unresolve(user: User, alert_groups_to_restart_unresolve: "QuerySet[AlertGroup]") -> None:
        """Restart (un-resolve) every alert group in the queryset.

        Resets ack/resolve/silence state in one UPDATE, then writes the
        un-resolve log records, updates metrics and restarts escalation for
        root groups.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # convert current qs to list to prevent changes by update
        alert_groups_to_restart_unresolve_list = list(alert_groups_to_restart_unresolve)
        # Single UPDATE resetting all state flags back to "firing".
        alert_groups_to_restart_unresolve.update(
            acknowledged=False,
            acknowledged_at=None,
            acknowledged_by_user=None,
            acknowledged_by=AlertGroup.NOT_YET,
            resolved=False,
            resolved_at=None,
            is_open_for_grouping=None,
            resolved_by_user=None,
            resolved_by=AlertGroup.NOT_YET,
            silenced_until=None,
            silenced_by_user=None,
            silenced_at=None,
            silenced=False,
            restarted_at=timezone.now(),
        )
        # unresolve alert groups
        for alert_group in alert_groups_to_restart_unresolve_list:
            # update metrics cache (note alert_group.state is the original alert group's state)
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=alert_group.state,
                state=AlertGroupState.FIRING,
            )
            log_record = alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                reason="Bulk action restart",
            )
            # Only root alert groups escalate; dependents follow their root.
            if alert_group.is_root_alert_group:
                alert_group.start_escalation_if_needed()
            # Dispatch the signal only after the surrounding transaction commits.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    @staticmethod
    def _bulk_restart_unsilence(user: User, alert_groups_to_restart_unsilence: "QuerySet[AlertGroup]") -> None:
        """Restart (un-silence) every alert group in the queryset.

        Resets ack/resolve/silence state in one UPDATE, then writes the
        un-silence log records, updates metrics and restarts escalation.
        """
        from apps.alerts.models import AlertGroupLogRecord
        # convert current qs to list to prevent changes by update
        alert_groups_to_restart_unsilence_list = list(alert_groups_to_restart_unsilence)
        # Single UPDATE resetting all state flags back to "firing".
        alert_groups_to_restart_unsilence.update(
            acknowledged=False,
            acknowledged_at=None,
            acknowledged_by_user=None,
            acknowledged_by=AlertGroup.NOT_YET,
            resolved=False,
            resolved_at=None,
            is_open_for_grouping=None,
            resolved_by_user=None,
            resolved_by=AlertGroup.NOT_YET,
            silenced_until=None,
            silenced_by_user=None,
            silenced_at=None,
            silenced=False,
            restarted_at=timezone.now(),
        )
        # unsilence alert groups
        for alert_group in alert_groups_to_restart_unsilence_list:
            # update metrics cache (note alert_group.state is the original alert group's state)
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=alert_group.state,
                state=AlertGroupState.FIRING,
            )
            log_record = alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE, author=user, reason="Bulk action restart"
            )
            # NOTE(review): unconditional here (no is_root_alert_group check) —
            # bulk_restart only passes root groups to this helper; verify if reused.
            alert_group.start_escalation_if_needed()
            # Dispatch the signal only after the surrounding transaction commits.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
    @staticmethod
    def bulk_restart(user: User, alert_groups: "QuerySet[AlertGroup]") -> None:
        """Restart the given alert groups on behalf of ``user``.

        Acknowledged groups are un-acknowledged, resolved groups un-resolved
        (each together with their dependents) and silenced-only root groups
        un-silenced.
        """
        # Acknowledged-but-unresolved roots (maintenance incidents excluded).
        root_alert_groups_unack = alert_groups.filter(
            resolved=False,
            acknowledged=True,
            root_alert_group__isnull=True,
            maintenance_uuid__isnull=True,  # don't restart maintenance incident
        )
        # convert qs to list to prevent changes by update
        root_alert_group_pks = list(root_alert_groups_unack.values_list("pk", flat=True))
        dependent_alert_groups_unack = AlertGroup.objects.filter(root_alert_group__pk__in=root_alert_group_pks)
        with transaction.atomic():
            AlertGroup._bulk_restart_unack(user, root_alert_groups_unack)
            AlertGroup._bulk_restart_unack(user, dependent_alert_groups_unack)
        # Resolved roots and their dependents.
        root_alert_groups_unresolve = alert_groups.filter(resolved=True, root_alert_group__isnull=True)
        # convert qs to list to prevent changes by update
        root_alert_group_pks = list(root_alert_groups_unresolve.values_list("pk", flat=True))
        dependent_alert_groups_unresolve = AlertGroup.objects.filter(root_alert_group__pk__in=root_alert_group_pks)
        with transaction.atomic():
            AlertGroup._bulk_restart_unresolve(user, root_alert_groups_unresolve)
            AlertGroup._bulk_restart_unresolve(user, dependent_alert_groups_unresolve)
        # Silenced-only root groups; no dependent cascade for this case.
        alert_groups_to_restart_unsilence = alert_groups.filter(
            resolved=False,
            acknowledged=False,
            silenced=True,
            root_alert_group__isnull=True,
        )
        AlertGroup._bulk_restart_unsilence(user, alert_groups_to_restart_unsilence)
    @staticmethod
    def _bulk_silence(user: User, alert_groups_to_silence: "QuerySet[AlertGroup]", silence_delay: int) -> None:
        """Silence every alert group in the queryset on behalf of ``user``.

        silence_delay is the silence duration in seconds; None or 0 means
        silence forever. Resets any ack/resolve state in one bulk_update,
        writes rollback log records for previously resolved/silenced/acked
        groups, then the silence log record, and schedules unsilence tasks
        for temporary silences.
        """
        from apps.alerts.models import AlertGroupLogRecord
        now = timezone.now()
        # Distinguish "silence for N seconds" from "silence forever".
        silence_for_period = silence_delay is not None and silence_delay > 0
        if silence_for_period:
            silence_delay_timedelta = datetime.timedelta(seconds=silence_delay)
            silenced_until = now + silence_delay_timedelta
        else:
            silence_delay_timedelta = None
            silenced_until = None
        # Groups whose prior state must be rolled back first, to build a proper log.
        alert_groups_to_unsilence_before_silence = alert_groups_to_silence.filter(
            silenced=True, acknowledged=False, resolved=False
        )
        alert_groups_to_unacknowledge_before_silence = alert_groups_to_silence.filter(resolved=False, acknowledged=True)
        alert_groups_to_unresolve_before_silence = alert_groups_to_silence.filter(resolved=True)
        # convert current qs to list to prevent changes by update
        alert_groups_to_silence_list = list(alert_groups_to_silence)
        alert_groups_to_unsilence_before_silence_list = list(alert_groups_to_unsilence_before_silence)
        alert_groups_to_unacknowledge_before_silence_list = list(alert_groups_to_unacknowledge_before_silence)
        alert_groups_to_unresolve_before_silence_list = list(alert_groups_to_unresolve_before_silence)
        previous_states = []
        for alert_group in alert_groups_to_silence_list:
            # Remember the state before mutation for the metrics update below.
            previous_states.append(alert_group.state)
            # Clear ack/resolve state and apply the silence in the same pass.
            alert_group.acknowledged = False
            alert_group.acknowledged_at = None
            alert_group.acknowledged_by_user = None
            alert_group.acknowledged_by = AlertGroup.NOT_YET
            alert_group.resolved = False
            alert_group.resolved_at = None
            alert_group.resolved_by_user = None
            alert_group.resolved_by = AlertGroup.NOT_YET
            alert_group.silenced = True
            alert_group.silenced_at = now
            alert_group.silenced_until = silenced_until
            alert_group.silenced_by_user = user
            if not silence_for_period:
                # Silenced forever: escalation is over for this group.
                alert_group.is_escalation_finished = True
            else:
                # Temporary silence: push the escalation ETA past the silence window.
                alert_group.update_next_step_eta(datetime.timedelta(seconds=silence_delay + START_ESCALATION_DELAY))
            if alert_group.response_time is None:
                alert_group.response_time = alert_group._get_response_time()
        fields_to_update = [
            "acknowledged",
            "acknowledged_at",
            "acknowledged_by_user",
            "acknowledged_by",
            "resolved",
            "resolved_at",
            "resolved_by_user",
            "resolved_by",
            "silenced",
            "silenced_at",
            "silenced_until",
            "silenced_by_user",
            "is_escalation_finished",
            "raw_escalation_snapshot",
            "response_time",
        ]
        # One bulk write for all state changes instead of N individual saves.
        AlertGroup.objects.bulk_update(alert_groups_to_silence_list, fields=fields_to_update, batch_size=100)
        # create log records
        for alert_group in alert_groups_to_unresolve_before_silence_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                reason="Bulk action silence",
            )
        for alert_group in alert_groups_to_unsilence_before_silence_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                author=user,
                reason="Bulk action silence",
            )
        for alert_group in alert_groups_to_unacknowledge_before_silence_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_ACK,
                author=user,
                reason="Bulk action silence",
            )
        for alert_group, previous_state in zip(alert_groups_to_silence_list, previous_states):
            # update metrics cache
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=previous_state,
                state=AlertGroupState.SILENCED,
            )
            log_record = alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_SILENCE,
                author=user,
                silence_delay=silence_delay_timedelta,
                reason="Bulk action silence",
            )
            # Dispatch the signal only after the surrounding transaction commits.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
            # Only root alert groups get an automatic unsilence task.
            if silence_for_period and alert_group.is_root_alert_group:
                alert_group.start_unsilence_task(countdown=silence_delay)
@staticmethod
def bulk_silence(user: User, alert_groups: "QuerySet[AlertGroup]", silence_delay: int) -> None:
root_alert_groups_to_silence = alert_groups.filter(
root_alert_group__isnull=True,
maintenance_uuid__isnull=True, # don't silence maintenance incident
)
# convert qs to list to prevent changes by update
root_alert_group_pks = list(root_alert_groups_to_silence.values_list("pk", flat=True))
dependent_alert_groups_to_silence = alert_groups.filter(root_alert_group__pk__in=root_alert_group_pks)
with transaction.atomic():
AlertGroup._bulk_silence(user, root_alert_groups_to_silence, silence_delay)
AlertGroup._bulk_silence(user, dependent_alert_groups_to_silence, silence_delay)
    def start_ack_reminder_if_needed(self) -> None:
        """Schedule a periodic "remind to acknowledge" task when the org enables it."""
        from apps.user_management.models import Organization
        # Dependent alert groups never get their own reminders.
        if not self.is_root_alert_group:
            return
        # Check if the "Remind every N hours" setting is enabled
        countdown = Organization.ACKNOWLEDGE_REMIND_DELAY[self.channel.organization.acknowledge_remind_timeout]
        if not countdown:
            return
        # A fresh uuid invalidates any previously scheduled reminder task.
        self.last_unique_unacknowledge_process_id = celery_uuid()
        self.save(update_fields=["last_unique_unacknowledge_process_id"])
        acknowledge_reminder_task.apply_async((self.pk, self.last_unique_unacknowledge_process_id), countdown=countdown)
def start_unsilence_task(self, countdown):
task_id = celery_uuid()
self.unsilence_task_uuid = task_id
self.save(update_fields=["unsilence_task_uuid"])
unsilence_task.apply_async((self.pk,), task_id=task_id, countdown=countdown)
@property
def is_root_alert_group(self):
return self.root_alert_group is None
def acknowledge(self, **kwargs):
if not self.acknowledged:
self.acknowledged = True
self.acknowledged_at = timezone.now()
for k, v in kwargs.items():
setattr(self, k, v)
update_fields = ["acknowledged", "acknowledged_at", *kwargs.keys()]
if self.response_time is None:
self.response_time = self._get_response_time()
update_fields += ["response_time"]
self.save(update_fields=update_fields)
def unacknowledge(self):
self.un_silence()
if self.acknowledged:
self.acknowledged = False
self.acknowledged_at = None
self.acknowledged_by_user = None
self.acknowledged_by = AlertGroup.NOT_YET
self.save(update_fields=["acknowledged", "acknowledged_at", "acknowledged_by_user", "acknowledged_by"])
def resolve(self, **kwargs):
if not self.resolved:
self.resolved = True
self.resolved_at = timezone.now()
self.is_open_for_grouping = None
for k, v in kwargs.items():
setattr(self, k, v)
update_fields = ["resolved", "resolved_at", "is_open_for_grouping", *kwargs.keys()]
if self.response_time is None:
self.response_time = self._get_response_time()
update_fields += ["response_time"]
self.save(update_fields=update_fields)
def unresolve(self):
self.unacknowledge()
if self.resolved:
self.resolved = False
self.resolved_at = None
self.resolved_by = AlertGroup.NOT_YET
self.resolved_by_user = None
self.save(update_fields=["resolved", "resolved_at", "resolved_by", "resolved_by_user"])
def silence(self, **kwargs):
if not self.silenced:
self.silenced = True
if "silenced_at" not in kwargs:
kwargs["silenced_at"] = timezone.now()
for k, v in kwargs.items():
setattr(self, k, v)
update_fields = ["silenced", *kwargs.keys()]
if self.response_time is None:
self.response_time = self._get_response_time()
update_fields += ["response_time"]
self.save(update_fields=update_fields)
def un_silence(self):
self.silenced_until = None
self.silenced_by_user = None
self.silenced_at = None
self.silenced = False
self.unsilence_task_uuid = None
self.restarted_at = timezone.now()
self.save(
update_fields=[
"silenced_until",
"silenced",
"silenced_by_user",
"silenced_at",
"unsilence_task_uuid",
"restarted_at",
]
)
@property
def long_verbose_name(self):
title = str_or_backup(self.slack_templated_first_alert.title, DEFAULT_BACKUP_TITLE)
return title
@property
def long_verbose_name_without_formatting(self):
sf = SlackFormatter(self.channel.organization)
title = self.long_verbose_name
title = sf.format(title)
title = clean_markup(title)
return title
def get_resolve_text(self, mention_user=False):
if self.resolved_by == AlertGroup.SOURCE:
return "Resolved by alert source"
elif self.resolved_by == AlertGroup.LAST_STEP:
return "Resolved automatically"
elif self.resolved_by == AlertGroup.WIPED:
return "Resolved by wipe"
elif self.resolved_by == AlertGroup.DISABLE_MAINTENANCE:
return "Resolved by stop maintenance"
else:
if self.resolved_by_user is not None:
user_text = self.resolved_by_user.get_username_with_slack_verbal(mention=mention_user)
return f"Resolved by {user_text}"
else:
return "Resolved"
def get_acknowledge_text(self, mention_user=False):
if self.acknowledged_by == AlertGroup.SOURCE:
return "Acknowledged by alert source"
elif self.acknowledged_by == AlertGroup.USER and self.acknowledged_by_user is not None:
user_text = self.acknowledged_by_user.get_username_with_slack_verbal(mention=mention_user)
return f"Acknowledged by {user_text}"
else:
return "Acknowledged"
def render_after_resolve_report_json(self) -> list[LogRecords]:
from apps.alerts.models import AlertGroupLogRecord, ResolutionNote
from apps.base.models import UserNotificationPolicyLogRecord
log_builder = IncidentLogBuilder(self)
log_records_list = log_builder.get_log_records_list(with_resolution_notes=True)
result_log_report = list()
for log_record in log_records_list:
if type(log_record) is AlertGroupLogRecord:
result_log_report.append(log_record.render_log_line_json())
elif type(log_record) is UserNotificationPolicyLogRecord:
result_log_report.append(log_record.rendered_notification_log_line_json)
elif type(log_record) is ResolutionNote:
result_log_report.append(log_record.render_log_line_json())
return result_log_report
@property
def has_resolution_notes(self):
return self.resolution_notes.exists()
@property
def state(self):
if self.resolved:
return AlertGroupState.RESOLVED
elif self.acknowledged:
return AlertGroupState.ACKNOWLEDGED
elif self.silenced:
return AlertGroupState.SILENCED
else:
return AlertGroupState.FIRING
@property
def notify_in_slack_enabled(self):
channel_filter = self.channel_filter_with_respect_to_escalation_snapshot
if channel_filter is not None:
return channel_filter.notify_in_slack
else:
return True
@property
def is_presented_in_slack(self):
return self.slack_message and self.channel.organization.slack_team_identity
@property
def slack_channel_id(self) -> str | None:
if not self.channel.organization.slack_team_identity:
return None
elif self.slack_message:
return self.slack_message.channel_id
elif self.channel_filter:
return self.channel_filter.slack_channel_id_or_org_default_id
return None
@property
def slack_message(self) -> typing.Optional["SlackMessage"]:
try:
# prefetched_slack_messages could be set in apps.api.serializers.alert_group.AlertGroupListSerializer
return self.prefetched_slack_messages[0] if self.prefetched_slack_messages else None
except AttributeError:
return self.slack_messages.order_by("created_at").first()
@cached_property
def last_stop_escalation_log(self):
from apps.alerts.models import AlertGroupLogRecord
stop_escalation_log = (
self.log_records.filter(
type__in=[
AlertGroupLogRecord.TYPE_RESOLVED,
AlertGroupLogRecord.TYPE_ACK,
AlertGroupLogRecord.TYPE_SILENCE,
]
)
.order_by("pk")
.last()
)
return stop_escalation_log
def alerts_count_gt(self, max_alerts) -> bool:
"""
alerts_count_gt checks if there are more than max_alerts alerts in given alert group.
It's optimized for alert groups with big number of alerts and relatively small max_alerts.
"""
count = self.alerts.all()[: max_alerts + 1].count()
return count > max_alerts