import datetime import logging import typing import urllib from collections import namedtuple from functools import partial from celery import uuid as celery_uuid from django.conf import settings from django.core.validators import MinLengthValidator from django.db import IntegrityError, models, transaction from django.db.models import JSONField, Q, QuerySet from django.utils import timezone from django.utils.functional import cached_property from apps.alerts.constants import ActionSource, AlertGroupState from apps.alerts.escalation_snapshot import EscalationSnapshotMixin from apps.alerts.escalation_snapshot.escalation_snapshot_mixin import START_ESCALATION_DELAY from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE from apps.alerts.incident_appearance.renderers.slack_renderer import AlertGroupSlackRenderer from apps.alerts.incident_log_builder import IncidentLogBuilder from apps.alerts.signals import alert_group_created_signal from apps.alerts.tasks import ( acknowledge_reminder_task, send_alert_group_signal, send_alert_group_signal_for_delete, unsilence_task, ) from apps.grafana_plugin.ui_url_builder import UIURLBuilder from apps.metrics_exporter.tasks import update_metrics_for_alert_group from apps.slack.slack_formatter import SlackFormatter from apps.user_management.models import User from common.public_primary_keys import generate_public_primary_key, increase_public_primary_key_length from common.utils import clean_markup, str_or_backup from .alert_group_counter import AlertGroupCounter if typing.TYPE_CHECKING: from django.db.models.manager import RelatedManager from apps.alerts.models import ( Alert, AlertGroupLogRecord, AlertReceiveChannel, BundledNotification, Invitation, RelatedIncident, ResolutionNote, ResolutionNoteSlackMessage, ) from apps.base.models import UserNotificationPolicyLogRecord from apps.labels.models import AlertGroupAssociatedLabel from apps.slack.models import SlackMessage logger = logging.getLogger(__name__) 
logger.setLevel(logging.DEBUG)


def generate_public_primary_key_for_alert_group():
    """Generate a unique public primary key (prefix "I") for a new AlertGroup.

    On collision with an existing AlertGroup, retries with a progressively
    longer key (tracked by failure_counter) until a free key is found.
    """
    prefix = "I"
    new_public_primary_key = generate_public_primary_key(prefix)
    failure_counter = 0
    while AlertGroup.objects.filter(public_primary_key=new_public_primary_key).exists():
        new_public_primary_key = increase_public_primary_key_length(
            failure_counter=failure_counter, prefix=prefix, model_name="AlertGroup"
        )
        failure_counter += 1
    return new_public_primary_key


class LogRecordUser(typing.TypedDict):
    username: str
    pk: str
    avatar: str
    avatar_full: str


class PagedUser(typing.TypedDict):
    # NOTE(review): AlertGroup.get_paged_users also populates a "teams" key not
    # declared here — consider adding it to this TypedDict; verify against callers.
    id: int
    username: str
    name: str
    pk: str
    avatar: str
    avatar_full: str
    important: bool


class LogRecords(typing.TypedDict):
    time: str  # humanized delta relative to now
    action: str  # human-friendly description
    realm: typing.Literal["user_notification", "alert_group", "resolution_note"]
    type: int  # depending on realm, check type choices
    created_at: str  # timestamp
    author: LogRecordUser


class Permalinks(typing.TypedDict):
    slack: typing.Optional[str]
    slack_app: typing.Optional[str]
    telegram: typing.Optional[str]
    web: str


class AlertGroupQuerySet(models.QuerySet):
    def create(self, **kwargs):
        # Assign the next per-organization sequence number at creation time.
        organization = kwargs["channel"].organization
        inside_organization_number = AlertGroupCounter.objects.get_value(organization=organization) + 1
        return super().create(**kwargs, inside_organization_number=inside_organization_number)

    def get_or_create_grouping(self, channel, channel_filter, group_data, received_at=None):
        """
        This method is similar to default Django QuerySet.get_or_create(), please see the original get_or_create method.
        The difference is that this method is trying to get an object using multiple queries with different filters.
        Also, "create" is invoked without transaction.atomic to reduce number of ConcurrentUpdateError's which can be
        raised in AlertGroupQuerySet.create() due to optimistic locking of AlertGroupCounter model.
        """
        search_params = {
            "channel": channel,
            "channel_filter": channel_filter,
            "distinction": group_data.group_distinction,
        }

        # Try to return the last open group
        # Note that (channel, channel_filter, distinction, is_open_for_grouping) is in unique_together
        try:
            return self.get(**search_params, is_open_for_grouping__isnull=False), False
        except self.model.DoesNotExist:
            pass

        # If it's an "OK" alert, try to return the latest resolved group
        # (only if the channel allows source base resolving and the alert is a resolve signal)
        if channel.allow_source_based_resolving and group_data.is_resolve_signal:
            try:
                return self.filter(**search_params, resolved=True).latest(), False
            except self.model.DoesNotExist:
                pass

        # Create a new group if we couldn't group it to any existing ones
        try:
            alert_group = self.create(
                **search_params,
                is_open_for_grouping=True,
                web_title_cache=group_data.web_title_cache,
                received_at=received_at,
            )
            alert_group_created_signal.send(sender=self.__class__, alert_group=alert_group)
            return (alert_group, True)
        except IntegrityError:
            # A concurrent request created the open group first — fetch it; if it is
            # still not found, surface the original IntegrityError.
            try:
                return self.get(**search_params, is_open_for_grouping__isnull=False), False
            except self.model.DoesNotExist:
                pass
            raise

    def filter_active(self, *args, **kwargs):
        # filter alert groups with active escalation
        return super().filter(
            *args,
            ~Q(silenced=True, silenced_until__isnull=True),  # filter silenced forever alert_groups
            **kwargs,
            maintenance_uuid__isnull=True,
            is_escalation_finished=False,
            resolved=False,
            acknowledged=False,
            root_alert_group=None,
        )


class AlertGroupSlackRenderingMixin:
    """
    Ideally this mixin should not exist. Instead of this instance of AlertGroupSlackRenderer should be created
    and used but slack rendering is distributed throughout the codebase.
    """

    @cached_property
    def slack_renderer(self):
        # Cached: one renderer instance per alert group, reused by all render calls below.
        return AlertGroupSlackRenderer(self)

    def render_slack_attachments(self):
        return self.slack_renderer.render_alert_group_attachments()

    def render_slack_blocks(self):
        return self.slack_renderer.render_alert_group_blocks()

    @property
    def slack_templated_first_alert(self):
        return self.slack_renderer.alert_renderer.templated_alert


class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.Model):
    # Type hints for reverse/related managers (populated by Django at runtime).
    acknowledged_by_user: typing.Optional["User"]
    alerts: "RelatedManager['Alert']"
    bundled_notifications: "RelatedManager['BundledNotification']"
    channel: "AlertReceiveChannel"
    dependent_alert_groups: "RelatedManager['AlertGroup']"
    invitations: "RelatedManager['Invitation']"
    labels: "RelatedManager['AlertGroupAssociatedLabel']"
    log_records: "RelatedManager['AlertGroupLogRecord']"
    personal_log_records: "RelatedManager['UserNotificationPolicyLogRecord']"
    related_incidents: "RelatedManager['RelatedIncident']"
    resolution_notes: "RelatedManager['ResolutionNote']"
    resolution_note_slack_messages: "RelatedManager['ResolutionNoteSlackMessage']"
    resolved_by_user: typing.Optional["User"]
    root_alert_group: typing.Optional["AlertGroup"]
    silenced_by_user: typing.Optional["User"]
    slack_messages: "RelatedManager['SlackMessage']"
    users: "RelatedManager['User']"

    objects: models.Manager["AlertGroup"] = AlertGroupQuerySet.as_manager()

    (
        NEW,
        ACKNOWLEDGED,
        RESOLVED,
        SILENCED,
    ) = range(4)

    # exists for status filter in API
    STATUS_CHOICES = ((NEW, "New"), (ACKNOWLEDGED, "Acknowledged"), (RESOLVED, "Resolved"), (SILENCED, "Silenced"))

    GroupData = namedtuple(
        "GroupData", ["is_resolve_signal", "group_distinction", "web_title_cache", "is_acknowledge_signal"]
    )

    # "Who/what performed the action" choices, used for resolved_by / acknowledged_by.
    SOURCE, USER, NOT_YET, LAST_STEP, ARCHIVED, WIPED, DISABLE_MAINTENANCE, NOT_YET_STOP_AUTORESOLVE = range(8)
    SOURCE_CHOICES = (
        (SOURCE, "source"),
        (USER, "user"),
        (NOT_YET, "not yet"),
        (LAST_STEP, "last escalation step"),
        (ARCHIVED, "archived"),  # deprecated. don't use
        (WIPED, "wiped"),
        (DISABLE_MAINTENANCE, "stop maintenance"),
        (NOT_YET_STOP_AUTORESOLVE, "not yet, autoresolve disabled"),
    )

    ACKNOWLEDGE = "acknowledge"
    RESOLVE = "resolve"
    SILENCE = "silence"
    RESTART = "restart"

    BULK_ACTIONS = [
        ACKNOWLEDGE,
        RESOLVE,
        SILENCE,
        RESTART,
    ]

    public_primary_key = models.CharField(
        max_length=20,
        validators=[MinLengthValidator(settings.PUBLIC_PRIMARY_KEY_MIN_LENGTH + 1)],
        unique=True,
        default=generate_public_primary_key_for_alert_group,
    )

    channel = models.ForeignKey(
        "alerts.AlertReceiveChannel",
        on_delete=models.CASCADE,
        related_name="alert_groups",
    )

    # Distinction is a difference between groups inside the same channel.
    # For example different types of alerts from the same channel should go to different groups.
    # Distinction is what describes their difference.
    distinction = models.CharField(max_length=100, null=True, default=None, db_index=True)
    web_title_cache = models.TextField(null=True, default=None)

    # Per-organization sequence number, assigned in AlertGroupQuerySet.create().
    inside_organization_number = models.IntegerField(default=0)

    channel_filter = models.ForeignKey(
        "alerts.ChannelFilter",
        on_delete=models.SET_DEFAULT,
        related_name="alert_groups",
        null=True,
        default=None,
    )

    resolved = models.BooleanField(default=False)
    resolved_by = models.IntegerField(choices=SOURCE_CHOICES, default=NOT_YET)
    resolved_by_user = models.ForeignKey(
        "user_management.User",
        on_delete=models.SET_NULL,
        null=True,
        default=None,
        related_name="resolved_alert_groups",
    )
    resolved_by_alert = models.ForeignKey(
        "alerts.Alert",
        on_delete=models.DO_NOTHING,
        db_constraint=False,
        null=True,
        default=None,
        related_name="resolved_alert_groups",
    )
    """
    ⚠️ This field is no longer being set/read anywhere, DON'T USE IT! ⚠️
    TODO: We still need to figure out how to remove it safely.
    See [this conversation](https://raintank-corp.slack.com/archives/C07RGREUH4Z/p1728494111646319) for more context
    """

    resolved_at = models.DateTimeField(blank=True, null=True)
    acknowledged = models.BooleanField(default=False)
    acknowledged_on_source = models.BooleanField(default=False)
    acknowledged_at = models.DateTimeField(blank=True, null=True)
    acknowledged_by = models.IntegerField(choices=SOURCE_CHOICES, default=NOT_YET)
    acknowledged_by_user = models.ForeignKey(
        "user_management.User",
        on_delete=models.SET_NULL,
        null=True,
        default=None,
        related_name="acknowledged_alert_groups",
    )
    acknowledged_by_confirmed = models.DateTimeField(null=True, default=None)

    is_escalation_finished = models.BooleanField(default=False)
    started_at = models.DateTimeField(auto_now_add=True, db_index=True)

    slack_message_sent = models.BooleanField(default=False)

    active_escalation_id = models.CharField(max_length=100, null=True, default=None)  # ID generated by celery
    active_resolve_calculation_id = models.CharField(max_length=100, null=True, default=None)  # ID generated by celery

    # (seconds, label); -1 means silenced forever (silenced_until stays NULL).
    SILENCE_DELAY_OPTIONS = (
        (1800, "30 minutes"),
        (3600, "1 hour"),
        (7200, "2 hours"),
        (10800, "3 hours"),
        (14400, "4 hours"),
        (21600, "6 hours"),
        (43200, "12 hours"),
        (57600, "16 hours"),
        (72000, "20 hours"),
        (86400, "24 hours"),
        (-1, "Forever"),
    )
    silenced = models.BooleanField(default=False)
    silenced_at = models.DateTimeField(null=True)
    silenced_by_user = models.ForeignKey(
        "user_management.User",
        on_delete=models.SET_NULL,
        null=True,
        default=None,
        related_name="silenced_alert_groups",
    )
    silenced_until = models.DateTimeField(blank=True, null=True)
    unsilence_task_uuid = models.CharField(max_length=100, null=True, default=None)

    restarted_at = models.DateTimeField(blank=True, null=True, default=None)

    response_time = models.DurationField(null=True, default=None)

    received_at = models.DateTimeField(blank=True, null=True, default=None)

    @property
    def is_silenced_forever(self):
        # silenced with no expiry timestamp == silenced forever
        return self.silenced and self.silenced_until is None

    @property
    def is_silenced_for_period(self):
        return self.silenced and self.silenced_until is not None

    @property
    def status(self) -> int:
        """Collapse boolean flags into a single status; resolved > acknowledged > silenced > new."""
        if self.resolved:
            return AlertGroup.RESOLVED
        elif self.acknowledged:
            return AlertGroup.ACKNOWLEDGED
        elif self.silenced:
            return AlertGroup.SILENCED
        else:
            return AlertGroup.NEW

    (
        ACCOUNT_INACTIVE,
        CHANNEL_ARCHIVED,
        NO_REASON,
        RATE_LIMITED,
        CHANNEL_NOT_SPECIFIED,
        RESTRICTED_ACTION,
        INVALID_AUTH,
    ) = range(7)
    REASONS_TO_SKIP_ESCALATIONS = (
        (ACCOUNT_INACTIVE, "account_inactive"),
        (CHANNEL_ARCHIVED, "is_archived"),
        (NO_REASON, "no_reason"),
        (RATE_LIMITED, "rate_limited"),
        (CHANNEL_NOT_SPECIFIED, "channel_not_specified"),
        (RESTRICTED_ACTION, "restricted_action"),
        (INVALID_AUTH, "invalid_auth"),
    )
    reason_to_skip_escalation = models.IntegerField(choices=REASONS_TO_SKIP_ESCALATIONS, default=NO_REASON)

    root_alert_group = models.ForeignKey(
        "alerts.AlertGroup",
        on_delete=models.SET_NULL,
        null=True,
        default=None,
        related_name="dependent_alert_groups",
    )

    last_unique_unacknowledge_process_id = models.CharField(max_length=100, null=True, default=None)

    wiped_at = models.DateTimeField(null=True, default=None)
    wiped_by = models.ForeignKey(
        "user_management.User",
        on_delete=models.SET_NULL,
        null=True,
        default=None,
        related_name="wiped_alert_groups",
    )

    prevent_posting_alerts = models.BooleanField(default=False)
    """
    TODO: this column is no longer used, drop it in a subsequent PR/release
    """
    maintenance_uuid = models.CharField(max_length=100, unique=True, null=True, default=None)

    raw_escalation_snapshot = JSONField(null=True, default=None)

    # This field is used for constraints so we can use get_or_create() in concurrent calls
    # https://docs.djangoproject.com/en/3.2/ref/models/querysets/#get-or-create
    # Combined with unique_together below, it allows only one alert group with
    # the combination (alert_receive_channel_id, channel_filter_id, distinction, is_open_for_grouping=True)
    # If is_open_for_grouping=None, then we can have as many combinations of
    # (alert_receive_channel_id, channel_filter_id, distinction, is_open_for_grouping=None) as we want
    # We just don't care about that because we'll use only get_or_create(...is_open_for_grouping=True...)
    # https://code.djangoproject.com/ticket/28545
    is_open_for_grouping = models.BooleanField(default=None, null=True, blank=True)

    grafana_incident_id = models.CharField(max_length=100, null=True, default=None)

    @staticmethod
    def get_silenced_state_filter():
        """
        models.Value(0/1) is used instead of True/False because django translates that into
        WHERE bool_field=0/1 instead of WHERE bool_field/NOT bool_field which works much faster in mysql
        """
        return Q(silenced=models.Value("1")) & Q(acknowledged=models.Value("0")) & Q(resolved=models.Value("0"))

    @staticmethod
    def get_new_state_filter():
        """
        models.Value(0/1) is used instead of True/False because django translates that into
        WHERE bool_field=0/1 instead of WHERE bool_field/NOT bool_field which works much faster in mysql
        """
        return Q(silenced=models.Value("0")) & Q(acknowledged=models.Value("0")) & Q(resolved=models.Value("0"))

    @staticmethod
    def get_acknowledged_state_filter():
        """
        models.Value(0/1) is used instead of True/False because django translates that into
        WHERE bool_field=0/1 instead of WHERE bool_field/NOT bool_field which works much faster in mysql
        """
        return Q(acknowledged=models.Value("1")) & Q(resolved=models.Value("0"))

    @staticmethod
    def get_resolved_state_filter():
        """
        models.Value(0/1) is used instead of True/False because django translates that into
        WHERE bool_field=0/1 instead of WHERE bool_field/NOT bool_field which works much faster in mysql
        """
        return Q(resolved=models.Value("1"))

    class Meta:
        get_latest_by = "pk"
        unique_together = [
            "channel_id",
            "channel_filter_id",
            "distinction",
            "is_open_for_grouping",
        ]
        indexes = [
            models.Index(
                fields=["channel_id", "resolved", "acknowledged", "silenced", "root_alert_group_id", "started_at"],
                name="alert_group_list_index",
            ),
        ]

    def __str__(self):
        return f"{self.pk}: {self.web_title_cache}"

    @property
    def is_maintenance_incident(self):
        return self.maintenance_uuid is not None

    def stop_maintenance(self, user: User) -> None:
        """Disable maintenance on the integration owning this maintenance alert group.

        Falls back to resolving this alert group directly when the integration no longer exists.
        """
        from apps.alerts.models import AlertReceiveChannel

        try:
            integration_on_maintenance = AlertReceiveChannel.objects.get(maintenance_uuid=self.maintenance_uuid)
            integration_on_maintenance.force_disable_maintenance(user)
            return
        except AlertReceiveChannel.DoesNotExist:
            pass

        self.resolve_by_disable_maintenance()

    @property
    def skip_escalation_in_slack(self):
        return self.reason_to_skip_escalation in (
            AlertGroup.CHANNEL_ARCHIVED,
            AlertGroup.ACCOUNT_INACTIVE,
            AlertGroup.RATE_LIMITED,
            AlertGroup.CHANNEL_NOT_SPECIFIED,
            AlertGroup.RESTRICTED_ACTION,
        )

    def is_alert_a_resolve_signal(self, alert):
        raise NotImplementedError

    @property
    def slack_permalink(self) -> typing.Optional[str]:
        return None if self.slack_message is None else self.slack_message.permalink

    @property
    def slack_app_link(self) -> typing.Optional[str]:
        return None if self.slack_message is None else self.slack_message.deep_link

    @property
    def telegram_permalink(self) -> typing.Optional[str]:
        from apps.telegram.models.message import TelegramMessage

        try:
            # prefetched_telegram_messages could be set in apps.api.serializers.alert_group.AlertGroupListSerializer
            main_telegram_message = self.prefetched_telegram_messages[0] if self.prefetched_telegram_messages else None
        except AttributeError:
            main_telegram_message = (
                self.telegram_messages.filter(chat_id__startswith="-", message_type=TelegramMessage.ALERT_GROUP_MESSAGE)
                .order_by("id")
                .first()
            )

        return main_telegram_message.link if main_telegram_message else None

    @property
    def permalinks(self) -> Permalinks:
        return {
            "slack": self.slack_permalink,
            "slack_app": self.slack_app_link,
            "telegram": self.telegram_permalink,
            "web": self.web_link,
        }

    @property
    def web_link(self) -> str:
        return UIURLBuilder(self.channel.organization).alert_group_detail(self.public_primary_key)

    @property
    def declare_incident_link(self) -> str:
        """
Generate a link for AlertGroup to declare Grafana Incident by click """ caption = urllib.parse.quote_plus("OnCall Alert Group") title = urllib.parse.quote_plus(self.web_title_cache) if self.web_title_cache else DEFAULT_BACKUP_TITLE title = title[:2000] # set max title length to avoid exceptions with too long declare incident link link = urllib.parse.quote_plus(self.web_link) return UIURLBuilder(self.channel.organization).declare_incident(f"?caption={caption}&url={link}&title={title}") @property def happened_while_maintenance(self): return self.root_alert_group is not None and self.root_alert_group.maintenance_uuid is not None def get_paged_users(self) -> typing.List[PagedUser]: from apps.alerts.models import AlertGroupLogRecord user_ids: typing.Set[str] = set() users: typing.Dict[str, PagedUser] = {} organization = self.channel.organization log_records = self.log_records.filter( type__in=(AlertGroupLogRecord.TYPE_DIRECT_PAGING, AlertGroupLogRecord.TYPE_UNPAGE_USER) ).order_by("created_at") for log_record in log_records: # filter paging events, track still active escalations info = log_record.get_step_specific_info() user_id = info.get("user") if info else None important = info.get("important") if info else None if user_id is not None: user_ids.add( user_id ) if log_record.type == AlertGroupLogRecord.TYPE_DIRECT_PAGING else user_ids.discard(user_id) user_instances = User.objects.filter(public_primary_key__in=user_ids) user_map = {u.public_primary_key: u for u in user_instances} # mostly doing this second loop to avoid having to query each user individually in the first loop for log_record in log_records: # filter paging events, track still active escalations info = log_record.get_step_specific_info() user_id = info.get("user") if info else None important = info.get("important") if info else False if user_id is not None and (user := user_map.get(user_id)) is not None: if log_record.type == AlertGroupLogRecord.TYPE_DIRECT_PAGING: # add the user users[user_id] = { 
"id": user.pk, "pk": user.public_primary_key, "name": user.name, "username": user.username, "avatar": user.avatar_url, "avatar_full": user.avatar_full_url(organization), "important": important, "teams": [{"pk": t.public_primary_key, "name": t.name} for t in user.teams.all()], } else: # user was unpaged at some point, remove them # there could be multiple unpage log records if API was hit several times if user_id in users: del users[user_id] return list(users.values()) def _get_response_time(self): """Return response_time based on current alert group status.""" response_time = None timestamps = (self.acknowledged_at, self.resolved_at, self.silenced_at, self.wiped_at) min_timestamp = min((ts for ts in timestamps if ts), default=None) if min_timestamp: response_time = min_timestamp - self.started_at return response_time def _update_metrics(self, organization_id, previous_state, state): """Update metrics cache for response time and state as needed.""" update_metrics_for_alert_group.apply_async((self.id, organization_id, previous_state, state)) def update_state_by_backsync(self, new_state: AlertGroupState, source_channel: "AlertReceiveChannel") -> None: if self.state == new_state: return logger.debug(f"Update state {self.state} -> {new_state} for alert_group {self.pk}") kwargs = { "source_channel": source_channel, "action_source": ActionSource.BACKSYNC, } if new_state == AlertGroupState.FIRING: if self.state == AlertGroupState.ACKNOWLEDGED: self.un_acknowledge_by_user_or_backsync(**kwargs) elif self.state == AlertGroupState.RESOLVED: self.un_resolve_by_user_or_backsync(**kwargs) elif self.state == AlertGroupState.SILENCED: self.un_silence_by_user_or_backsync(**kwargs) elif new_state == AlertGroupState.ACKNOWLEDGED: self.acknowledge_by_user_or_backsync(**kwargs) elif new_state == AlertGroupState.RESOLVED: self.resolve_by_user_or_backsync(**kwargs) elif new_state == AlertGroupState.SILENCED: self.silence_by_user_or_backsync(**kwargs) def acknowledge_by_user_or_backsync( 
        self,
        user: typing.Optional[User] = None,
        source_channel: typing.Optional["AlertReceiveChannel"] = None,
        action_source: typing.Optional[ActionSource] = None,
    ) -> None:
        # Acknowledge on behalf of a user (button) or of a source integration (backsync).
        from apps.alerts.models import AlertGroupLogRecord

        initial_state = self.state
        reason = "Acknowledge button" if user else "Backsync signal"
        acknowledged_by = AlertGroup.USER if user else AlertGroup.SOURCE
        step_specific_info = (
            {"source_integration_name": source_channel.verbal_name} if action_source == ActionSource.BACKSYNC else None
        )
        organization_id = user.organization_id if user else self.channel.organization_id
        logger.debug(f"Started acknowledge_by_user_or_backsync for alert_group {self.pk}")

        # if alert group was silenced or resolved, unsilence/unresolve it without starting escalation
        if self.silenced:
            self.un_silence()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                author=user,
                silence_delay=None,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
        if self.resolved:
            self.unresolve()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )

        self.acknowledge(acknowledged_by_user=user, acknowledged_by=acknowledged_by)
        # Update alert group state and response time metrics cache
        self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)
        self.stop_escalation()

        if user:  # ack reminder works only for actions performed by user
            self.start_ack_reminder_if_needed()

        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_ACK,
                author=user,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
            )
            # Signal is only dispatched once the log record is durably committed.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        # Propagate the acknowledge to every attached (dependent) alert group.
        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.acknowledge_by_user_or_backsync(
                user, source_channel=source_channel, action_source=action_source
            )

        logger.debug(f"Finished acknowledge_by_user_or_backsync for alert_group {self.pk}")

    def acknowledge_by_source(self):
        # Acknowledge triggered by the monitoring source itself (no user involved).
        from apps.alerts.models import AlertGroupLogRecord

        initial_state = self.state

        # if incident was silenced, unsilence it without starting escalation
        if self.silenced:
            self.un_silence()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                silence_delay=None,
                reason="Acknowledge by source",
            )
        self.acknowledge(acknowledged_by=AlertGroup.SOURCE)
        # Update alert group state and response time metrics cache
        self._update_metrics(
            organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
        )
        self.stop_escalation()

        with transaction.atomic():
            log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK)
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.acknowledge_by_source()

    def un_acknowledge_by_user_or_backsync(
        self,
        user: typing.Optional[User] = None,
        source_channel: typing.Optional["AlertReceiveChannel"] = None,
        action_source: typing.Optional[ActionSource] = None,
    ) -> None:
        # Revert an acknowledge; restarts escalation for root alert groups below.
        from apps.alerts.models import AlertGroupLogRecord

        initial_state = self.state
        step_specific_info = (
            {"source_integration_name": source_channel.verbal_name} if action_source == ActionSource.BACKSYNC else None
        )
        organization_id = user.organization_id if user else self.channel.organization_id
        logger.debug(f"Started un_acknowledge_by_user_or_backsync for alert_group {self.pk}")

        self.unacknowledge()
        # Update alert group state metric cache
        self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)
        if self.is_root_alert_group:
            self.start_escalation_if_needed()

        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_ACK,
                author=user,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
            )
            # Signal is only dispatched once the log record is durably committed.
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        # Propagate the un-acknowledge to every attached (dependent) alert group.
        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.un_acknowledge_by_user_or_backsync(
                user, source_channel=source_channel, action_source=action_source
            )
        logger.debug(f"Finished un_acknowledge_by_user_or_backsync for alert_group {self.pk}")

    def resolve_by_user_or_backsync(
        self,
        user: typing.Optional[User] = None,
        source_channel: typing.Optional["AlertReceiveChannel"] = None,
        action_source: typing.Optional[ActionSource] = None,
    ) -> None:
        # Resolve on behalf of a user (button) or of a source integration (backsync).
        from apps.alerts.models import AlertGroupLogRecord

        initial_state = self.state
        reason = "Resolve button" if user else "Backsync signal"
        resolved_by = AlertGroup.USER if user else AlertGroup.SOURCE
        step_specific_info = (
            {"source_integration_name": source_channel.verbal_name} if action_source == ActionSource.BACKSYNC else None
        )
        organization_id = user.organization_id if user else self.channel.organization_id

        # if incident was silenced, unsilence it without starting escalation
        if self.silenced:
            self.un_silence()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                author=user,
                silence_delay=None,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )

        self.resolve(resolved_by=resolved_by, resolved_by_user=user)
        # Update alert group state and response time metrics cache
        self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)
        self.stop_escalation()

        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_RESOLVED,
                author=user,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        # Propagate the resolve to every attached (dependent) alert group.
        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.resolve_by_user_or_backsync(
                user, source_channel=source_channel, action_source=action_source
            )

    def resolve_by_source(self):
        # Resolve triggered by the monitoring source itself (no user involved).
        from apps.alerts.models import AlertGroupLogRecord

        initial_state = self.state

        # if incident was silenced, unsilence it without starting escalation
        if self.silenced:
            self.un_silence()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                silence_delay=None,
                reason="Resolve by source",
            )

        self.resolve(resolved_by=AlertGroup.SOURCE)
        # Update alert group state and response time metrics cache
        self._update_metrics(
            organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
        )
        self.stop_escalation()

        with transaction.atomic():
            log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.resolve_by_source()

    def resolve_by_last_step(self):
        # Resolve performed by the final escalation step.
        from apps.alerts.models import AlertGroupLogRecord

        initial_state = self.state

        self.resolve(resolved_by=AlertGroup.LAST_STEP)
        # Update alert group state and response time metrics cache
        self._update_metrics(
            organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
        )
        self.stop_escalation()

        with transaction.atomic():
            log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.resolve_by_last_step()

    def resolve_by_disable_maintenance(self):
        # Resolve triggered when maintenance mode is stopped; no metrics update here.
        from apps.alerts.models import AlertGroupLogRecord

        self.resolve(resolved_by=AlertGroup.DISABLE_MAINTENANCE)
        self.stop_escalation()

        with transaction.atomic():
            log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: disable maintenance"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.resolve_by_disable_maintenance()

    def un_resolve_by_user_or_backsync(
        self,
        user: typing.Optional[User] = None,
        source_channel: typing.Optional["AlertReceiveChannel"] = None,
        action_source: typing.Optional[ActionSource] = None,
    ) -> None:
        # Revert a resolve; wiped alert groups (wiped_at set) are never un-resolved.
        from apps.alerts.models import AlertGroupLogRecord

        if self.wiped_at is None:
            initial_state = self.state
            step_specific_info = (
                {"source_integration_name": source_channel.verbal_name}
                if action_source == ActionSource.BACKSYNC
                else None
            )
            organization_id = user.organization_id if user else self.channel.organization_id

            self.unresolve()
            # Update alert group state metric cache
            self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)

            with transaction.atomic():
                log_record =
self.log_records.create( type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source, step_specific_info=step_specific_info, ) if self.is_root_alert_group: self.start_escalation_if_needed() logger.debug( f"send alert_group_action_triggered_signal for alert_group {self.pk}, " f"log record {log_record.pk} with type '{log_record.get_type_display()}', " f"action source: {action_source}" ) transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_resolve_by_user_or_backsync( user, source_channel=source_channel, action_source=action_source ) def attach_by_user( self, user: User, root_alert_group: "AlertGroup", action_source: typing.Optional[ActionSource] = None ) -> None: from apps.alerts.models import AlertGroupLogRecord if root_alert_group.root_alert_group is None and not root_alert_group.resolved: self.root_alert_group = root_alert_group self.save(update_fields=["root_alert_group"]) self.stop_escalation() if root_alert_group.acknowledged and not self.acknowledged: self.acknowledge_by_user_or_backsync(user, action_source=action_source) elif not root_alert_group.acknowledged and self.acknowledged: self.un_acknowledge_by_user_or_backsync(user, action_source=action_source) if root_alert_group.silenced and not self.silenced: self.silence_by_user_or_backsync(user, action_source=action_source, silence_delay=None) if not root_alert_group.silenced and self.silenced: self.un_silence_by_user_or_backsync(user, action_source=action_source) with transaction.atomic(): log_record = self.log_records.create( type=AlertGroupLogRecord.TYPE_ATTACHED, author=user, root_alert_group=root_alert_group, reason="Attach dropdown", action_source=action_source, ) logger.debug( f"send alert_group_action_triggered_signal for alert_group {self.pk}, " f"log record {log_record.pk} with type '{log_record.get_type_display()}', " f"action source: {action_source}" ) 
                transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

                # Also record the attachment on the root alert group's own timeline.
                log_record_for_root_incident = root_alert_group.log_records.create(
                    type=AlertGroupLogRecord.TYPE_ATTACHED,
                    author=user,
                    dependent_alert_group=self,
                    reason="Attach dropdown",
                    action_source=action_source,
                )
                logger.debug(
                    f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, "
                    f"log record {log_record_for_root_incident.pk} with type "
                    f"'{log_record_for_root_incident.get_type_display()}', action source: {action_source}"
                )
                transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk))
        else:
            # Attachment not allowed (target is not a root, or is resolved): log the failure instead.
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_FAILED_ATTACHMENT,
                author=user,
                root_alert_group=root_alert_group,
                reason="Failed to attach dropdown",
                action_source=action_source,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: {action_source}"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

    def un_attach_by_user(self, user: User, action_source: typing.Optional[ActionSource] = None) -> None:
        """Detach this alert group from its root on behalf of ``user``; restarts escalation if needed."""
        from apps.alerts.models import AlertGroupLogRecord

        root_alert_group: AlertGroup = self.root_alert_group
        self.root_alert_group = None
        self.save(update_fields=["root_alert_group"])

        self.start_escalation_if_needed()

        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UNATTACHED,
                author=user,
                root_alert_group=root_alert_group,
                reason="Unattach button",
                action_source=action_source,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: {action_source}"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

            # Mirror the detachment on the former root's timeline.
            log_record_for_root_incident = root_alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UNATTACHED,
                author=user,
                dependent_alert_group=self,
                reason="Unattach dropdown",
                action_source=action_source,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, "
                f"log record {log_record_for_root_incident.pk} "
                f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk))

    def un_attach_by_delete(self):
        """Detach this alert group because its root alert group is being deleted."""
        from apps.alerts.models import AlertGroupLogRecord

        self.root_alert_group = None
        self.save(update_fields=["root_alert_group"])

        self.start_escalation_if_needed()

        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UNATTACHED,
                reason="Unattach by deleting root incident",
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: delete"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

    def silence_by_user_or_backsync(
        self,
        user: typing.Optional[User] = None,
        source_channel: typing.Optional["AlertReceiveChannel"] = None,
        silence_delay: typing.Optional[int] = None,
        action_source: typing.Optional[ActionSource] = None,
    ) -> None:
        """Silence the alert group (for ``silence_delay`` seconds, or forever when not positive).

        Any resolved/acknowledged/silenced state is rolled back first so the log record
        sequence reads correctly. Cascades to dependent alert groups.
        """
        from apps.alerts.models import AlertGroupLogRecord

        initial_state = self.state

        reason = "Silence button" if user else "Backsync signal"
        step_specific_info = (
            {"source_integration_name": source_channel.verbal_name}
            if action_source == ActionSource.BACKSYNC
            else None
        )
        organization_id = user.organization_id if user else self.channel.organization_id

        if self.resolved:
            self.unresolve()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )

        if self.acknowledged:
            self.unacknowledge()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_ACK,
                author=user,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )

        if self.silenced:
            self.un_silence()
            self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                author=user,
                silence_delay=None,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )

        now = timezone.now()

        if silence_delay is not None and silence_delay > 0:
            silence_delay_timedelta = datetime.timedelta(seconds=silence_delay)
            silenced_until = now + silence_delay_timedelta
            if self.is_root_alert_group:
                # Postpone escalation while silenced, then schedule the automatic unsilence.
                self.update_next_step_eta(datetime.timedelta(seconds=silence_delay + START_ESCALATION_DELAY))
                self.start_unsilence_task(countdown=silence_delay)
        else:
            # No delay: silenced indefinitely (until manually unsilenced).
            silence_delay_timedelta = None
            silenced_until = None

        self.silence(
            silenced_at=now,
            silenced_until=silenced_until,
            silenced_by_user=user,
            raw_escalation_snapshot=self.raw_escalation_snapshot,
        )

        # Update alert group state and response time metrics cache
        self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)

        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_SILENCE,
                author=user,
                silence_delay=silence_delay_timedelta,
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: {action_source}"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.silence_by_user_or_backsync(user, source_channel, silence_delay, action_source)

    def un_silence_by_user_or_backsync(
        self,
        user: typing.Optional[User] = None,
        source_channel: typing.Optional["AlertReceiveChannel"] = None,
        action_source: typing.Optional[ActionSource] = None,
    ) -> None:
        """Remove silence from the alert group, by a user or a backsync; cascades to dependents."""
        from apps.alerts.models import AlertGroupLogRecord

        initial_state = self.state
        reason = "Unsilence button" if user else "Backsync signal"
        step_specific_info = (
            {"source_integration_name": source_channel.verbal_name}
            if action_source == ActionSource.BACKSYNC
            else None
        )
        organization_id = user.organization_id if user else self.channel.organization_id

        self.un_silence()
        # Update alert group state metric cache
        self._update_metrics(organization_id=organization_id, previous_state=initial_state, state=self.state)

        if self.is_root_alert_group:
            self.start_escalation_if_needed()

        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                author=user,
                silence_delay=None,
                # 2.Look like some time ago there was no TYPE_UN_SILENCE
                reason=reason,
                action_source=action_source,
                step_specific_info=step_specific_info,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: {action_source}"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.un_silence_by_user_or_backsync(
                user, source_channel=source_channel, action_source=action_source
            )

    def wipe_by_user(self, user: User) -> None:
        """Wipe the alert group: resolve it, clear payload-derived caches, and wipe all of its alerts."""
        from apps.alerts.models import AlertGroupLogRecord

        initial_state = self.state

        if not self.wiped_at:
            self.resolve(resolved_by=AlertGroup.WIPED)
            self.stop_escalation()
            self.distinction = ""
            self.web_title_cache = None
            self.wiped_at = timezone.now()
            self.wiped_by = user
            update_fields = ["distinction", "web_title_cache", "wiped_at", "wiped_by"]
            if self.response_time is None:
                self.response_time = self._get_response_time()
                update_fields += ["response_time"]
            for alert in self.alerts.all():
                alert.wipe(wiped_by=self.wiped_by, wiped_at=self.wiped_at)
            self.save(update_fields=update_fields)

        # Update alert group state and response time metrics cache
        self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)

        with transaction.atomic():
            log_record = self.log_records.create(
                type=AlertGroupLogRecord.TYPE_WIPED,
                author=user,
            )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: wipe"
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

        for dependent_alert_group in self.dependent_alert_groups.all():
            dependent_alert_group.wipe_by_user(user)

    def delete_by_user(self, user: User):
        """Start deletion of the alert group: stop escalation and schedule the delete signal."""
        from apps.alerts.models import AlertGroupLogRecord

        self.stop_escalation()
        with transaction.atomic():
            # prevent creating multiple logs
            # filter instead of get_or_create cause it can be multiple logs of this type due deleting error
            log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last()
            if not log_record:
                log_record = self.log_records.create(
                    type=AlertGroupLogRecord.TYPE_DELETED,
                    author=user,
                )
            logger.debug(
                f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
                f"in channel {self.channel.pk}, in org {self.channel.organization.pk}, by user {user.pk}, "
                f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
                f"action source: delete"
            )
            transaction.on_commit(partial(send_alert_group_signal_for_delete.delay, self.pk, log_record.pk))

    def finish_delete_by_user(self):
        """Finish the deletion started by ``delete_by_user``: hard-delete and detach dependents."""
        dependent_alerts = list(self.dependent_alert_groups.all())

        self.hard_delete()

        # unattach dependent incidents
        for dependent_alert_group in dependent_alerts:
            dependent_alert_group.un_attach_by_delete()

    def hard_delete(self):
        """Permanently delete the alert group and all of its related records."""
        from apps.alerts.models import ResolutionNote

        alerts = self.alerts.all()
        alerts.delete()

        self.slack_messages.all().delete()
        self.personal_log_records.all().delete()
        self.log_records.all().delete()
        self.invitations.all().delete()
        resolution_notes = ResolutionNote.objects_with_deleted.filter(alert_group=self)
        resolution_notes.delete()
        self.resolution_note_slack_messages.all().delete()

        self.delete()

    @staticmethod
    def _bulk_acknowledge(user: User, alert_groups_to_acknowledge: "QuerySet[AlertGroup]") -> None:
        """Acknowledge ``alert_groups_to_acknowledge`` in bulk on behalf of ``user``.

        Applies all state changes with one ``bulk_update`` and then creates log records
        (including un-resolve/un-silence records for groups that were resolved/silenced).
        """
        from apps.alerts.models import AlertGroupLogRecord

        # it is needed to unresolve those alert_groups which were resolved to build proper log.
        alert_groups_to_unresolve_before_acknowledge = alert_groups_to_acknowledge.filter(resolved=models.Value("1"))
        # it is needed to unsilence those alert_groups which were silenced to build proper log.
        alert_groups_to_unsilence_before_acknowledge = alert_groups_to_acknowledge.filter(silenced=models.Value("1"))

        # convert current qs to list to prevent changes by update
        alert_groups_to_acknowledge_list = list(alert_groups_to_acknowledge)
        alert_groups_to_unresolve_before_acknowledge_list = list(alert_groups_to_unresolve_before_acknowledge)
        alert_groups_to_unsilence_before_acknowledge_list = list(alert_groups_to_unsilence_before_acknowledge)

        previous_states = []
        for alert_group in alert_groups_to_acknowledge_list:
            previous_states.append(alert_group.state)
            alert_group.acknowledged = True
            alert_group.resolved = False
            alert_group.resolved_at = None
            alert_group.resolved_by = AlertGroup.NOT_YET
            alert_group.resolved_by_user = None
            alert_group.silenced_until = None
            alert_group.silenced_by_user = None
            alert_group.silenced_at = None
            alert_group.silenced = False
            alert_group.acknowledged_at = timezone.now()
            alert_group.acknowledged_by_user = user
            alert_group.acknowledged_by = AlertGroup.USER
            alert_group.is_escalation_finished = True
            if alert_group.response_time is None:
                alert_group.response_time = alert_group._get_response_time()

        fields_to_update = [
            "acknowledged",
            "resolved",
            "resolved_at",
            "resolved_by",
            "resolved_by_user",
            "silenced_until",
            "silenced_by_user",
            "silenced_at",
            "silenced",
            "acknowledged_at",
            "acknowledged_by_user",
            "acknowledged_by",
            "is_escalation_finished",
            "response_time",
        ]
        AlertGroup.objects.bulk_update(alert_groups_to_acknowledge_list, fields=fields_to_update, batch_size=100)

        for alert_group in alert_groups_to_unresolve_before_acknowledge_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                reason="Bulk action acknowledge",
            )

        for alert_group in alert_groups_to_unsilence_before_acknowledge_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE, author=user, reason="Bulk action acknowledge"
            )

        for alert_group, previous_state in zip(alert_groups_to_acknowledge_list, previous_states):
            # update metrics cache
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=previous_state,
                state=AlertGroupState.ACKNOWLEDGED,
            )
            alert_group.start_ack_reminder_if_needed()

            log_record = alert_group.log_records.create(type=AlertGroupLogRecord.TYPE_ACK, author=user)
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

    @staticmethod
    def bulk_acknowledge(user: User, alert_groups: "QuerySet[AlertGroup]") -> None:
        """Acknowledge the given root alert groups and their dependents in one transaction."""
        root_alert_groups_to_acknowledge = alert_groups.filter(
            ~Q(acknowledged=True, resolved=False),  # don't need to ack acknowledged incidents once again
            root_alert_group__isnull=True,
            maintenance_uuid__isnull=True,  # don't ack maintenance incident
        )
        # Find all dependent alert_groups to update them in one query
        # convert qs to list to prevent changes by update
        root_alert_group_pks = list(root_alert_groups_to_acknowledge.values_list("pk", flat=True))
        dependent_alert_groups_to_acknowledge = AlertGroup.objects.filter(root_alert_group__pk__in=root_alert_group_pks)
        with transaction.atomic():
            AlertGroup._bulk_acknowledge(user, root_alert_groups_to_acknowledge)
            AlertGroup._bulk_acknowledge(user, dependent_alert_groups_to_acknowledge)

    @staticmethod
    def _bulk_resolve(user: User, alert_groups_to_resolve: "QuerySet[AlertGroup]") -> None:
        """Resolve ``alert_groups_to_resolve`` in bulk on behalf of ``user`` (state + log records)."""
        from apps.alerts.models import AlertGroupLogRecord

        # it is needed to unsilence those alert_groups which were silenced to build proper log.
        alert_groups_to_unsilence_before_resolve = alert_groups_to_resolve.filter(silenced=models.Value("1"))

        # convert current qs to list to prevent changes by update
        alert_groups_to_resolve_list = list(alert_groups_to_resolve)
        alert_groups_to_unsilence_before_resolve_list = list(alert_groups_to_unsilence_before_resolve)

        previous_states = []
        for alert_group in alert_groups_to_resolve_list:
            previous_states.append(alert_group.state)
            alert_group.resolved = True
            alert_group.resolved_at = timezone.now()
            alert_group.is_open_for_grouping = None
            alert_group.resolved_by_user = user
            alert_group.resolved_by = AlertGroup.USER
            alert_group.is_escalation_finished = True
            alert_group.silenced_until = None
            alert_group.silenced_by_user = None
            alert_group.silenced_at = None
            alert_group.silenced = False
            if alert_group.response_time is None:
                alert_group.response_time = alert_group._get_response_time()

        fields_to_update = [
            "resolved",
            "resolved_at",
            "resolved_by",
            "resolved_by_user",
            "is_open_for_grouping",
            "silenced_until",
            "silenced_by_user",
            "silenced_at",
            "silenced",
            "is_escalation_finished",
            "response_time",
        ]
        AlertGroup.objects.bulk_update(alert_groups_to_resolve_list, fields=fields_to_update, batch_size=100)

        for alert_group in alert_groups_to_unsilence_before_resolve_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE, author=user, reason="Bulk action resolve"
            )

        for alert_group, previous_state in zip(alert_groups_to_resolve_list, previous_states):
            # update metrics cache
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=previous_state,
                state=AlertGroupState.RESOLVED,
            )
            log_record = alert_group.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED, author=user)
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

    @staticmethod
    def bulk_resolve(user: User, alert_groups: "QuerySet[AlertGroup]") -> None:
        """Resolve the given alert groups (and dependents); honors the org's resolution-note requirement."""
        # stop maintenance for maintenance incidents
        alert_groups_to_stop_maintenance = alert_groups.filter(resolved=False, maintenance_uuid__isnull=False)
        for alert_group in alert_groups_to_stop_maintenance:
            alert_group.stop_maintenance(user)

        root_alert_groups_to_resolve = alert_groups.filter(
            resolved=False,
            root_alert_group__isnull=True,
            maintenance_uuid__isnull=True,
        )

        if not root_alert_groups_to_resolve.exists():
            return

        # we know this is an AlertGroup because of the .exists() check just above
        first_alert_group: AlertGroup = root_alert_groups_to_resolve.first()

        organization = first_alert_group.channel.organization
        if organization.is_resolution_note_required:
            # only resolve alert groups that have a (non-deleted) resolution note
            root_alert_groups_to_resolve = root_alert_groups_to_resolve.filter(
                Q(resolution_notes__isnull=False, resolution_notes__deleted_at=None)
            )
        # convert qs to list to prevent changes by update
        root_alert_group_pks = list(root_alert_groups_to_resolve.values_list("pk", flat=True))
        dependent_alert_groups_to_resolve = AlertGroup.objects.filter(root_alert_group__pk__in=root_alert_group_pks)
        with transaction.atomic():
            AlertGroup._bulk_resolve(user, root_alert_groups_to_resolve)
            AlertGroup._bulk_resolve(user, dependent_alert_groups_to_resolve)

    @staticmethod
    def _bulk_restart_unack(user: User, alert_groups_to_restart_unack: "QuerySet[AlertGroup]") -> None:
        """Un-acknowledge (restart) ``alert_groups_to_restart_unack`` in bulk on behalf of ``user``."""
        from apps.alerts.models import AlertGroupLogRecord

        # convert current qs to list to prevent changes by update
        alert_groups_to_restart_unack_list = list(alert_groups_to_restart_unack)

        alert_groups_to_restart_unack.update(
            acknowledged=False,
            acknowledged_at=None,
            acknowledged_by_user=None,
            acknowledged_by=AlertGroup.NOT_YET,
            resolved=False,
            resolved_at=None,
            is_open_for_grouping=None,
            resolved_by_user=None,
            resolved_by=AlertGroup.NOT_YET,
            silenced_until=None,
            silenced_by_user=None,
            silenced_at=None,
            silenced=False,
            restarted_at=timezone.now(),
        )

        # unacknowledge alert groups
        for alert_group in alert_groups_to_restart_unack_list:
            # update metrics cache (note alert_group.state is the original alert group's state)
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=alert_group.state,
                state=AlertGroupState.FIRING,
            )
            log_record = alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_ACK,
                author=user,
                reason="Bulk action restart",
            )

            if alert_group.is_root_alert_group:
                alert_group.start_escalation_if_needed()

            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

    @staticmethod
    def _bulk_restart_unresolve(user: User, alert_groups_to_restart_unresolve: "QuerySet[AlertGroup]") -> None:
        """Un-resolve (restart) ``alert_groups_to_restart_unresolve`` in bulk on behalf of ``user``."""
        from apps.alerts.models import AlertGroupLogRecord

        # convert current qs to list to prevent changes by update
        alert_groups_to_restart_unresolve_list = list(alert_groups_to_restart_unresolve)

        alert_groups_to_restart_unresolve.update(
            acknowledged=False,
            acknowledged_at=None,
            acknowledged_by_user=None,
            acknowledged_by=AlertGroup.NOT_YET,
            resolved=False,
            resolved_at=None,
            is_open_for_grouping=None,
            resolved_by_user=None,
            resolved_by=AlertGroup.NOT_YET,
            silenced_until=None,
            silenced_by_user=None,
            silenced_at=None,
            silenced=False,
            restarted_at=timezone.now(),
        )

        # unresolve alert groups
        for alert_group in alert_groups_to_restart_unresolve_list:
            # update metrics cache (note alert_group.state is the original alert group's state)
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=alert_group.state,
                state=AlertGroupState.FIRING,
            )
            log_record = alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                reason="Bulk action restart",
            )

            if alert_group.is_root_alert_group:
                alert_group.start_escalation_if_needed()

            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

    @staticmethod
    def _bulk_restart_unsilence(user: User, alert_groups_to_restart_unsilence: "QuerySet[AlertGroup]") -> None:
        """Un-silence (restart) ``alert_groups_to_restart_unsilence`` in bulk on behalf of ``user``."""
        from apps.alerts.models import AlertGroupLogRecord

        # convert current qs to list to prevent changes by update
        alert_groups_to_restart_unsilence_list = list(alert_groups_to_restart_unsilence)
        alert_groups_to_restart_unsilence.update(
            acknowledged=False,
            acknowledged_at=None,
            acknowledged_by_user=None,
            acknowledged_by=AlertGroup.NOT_YET,
            resolved=False,
            resolved_at=None,
            is_open_for_grouping=None,
            resolved_by_user=None,
            resolved_by=AlertGroup.NOT_YET,
            silenced_until=None,
            silenced_by_user=None,
            silenced_at=None,
            silenced=False,
            restarted_at=timezone.now(),
        )

        # unsilence alert groups
        for alert_group in alert_groups_to_restart_unsilence_list:
            # update metrics cache (note alert_group.state is the original alert group's state)
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=alert_group.state,
                state=AlertGroupState.FIRING,
            )
            log_record = alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE, author=user, reason="Bulk action restart"
            )
            alert_group.start_escalation_if_needed()
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

    @staticmethod
    def bulk_restart(user: User, alert_groups: "QuerySet[AlertGroup]") -> None:
        """Restart the given alert groups: un-ack acknowledged ones, un-resolve resolved ones, un-silence silenced ones."""
        root_alert_groups_unack = alert_groups.filter(
            resolved=False,
            acknowledged=True,
            root_alert_group__isnull=True,
            maintenance_uuid__isnull=True,  # don't restart maintenance incident
        )
        # convert qs to list to prevent changes by update
        root_alert_group_pks = list(root_alert_groups_unack.values_list("pk", flat=True))
        dependent_alert_groups_unack = AlertGroup.objects.filter(root_alert_group__pk__in=root_alert_group_pks)
        with transaction.atomic():
            AlertGroup._bulk_restart_unack(user, root_alert_groups_unack)
            AlertGroup._bulk_restart_unack(user, dependent_alert_groups_unack)

        root_alert_groups_unresolve = alert_groups.filter(resolved=True, root_alert_group__isnull=True)
        # convert qs to list to prevent changes by update
        root_alert_group_pks = list(root_alert_groups_unresolve.values_list("pk", flat=True))
        dependent_alert_groups_unresolve = AlertGroup.objects.filter(root_alert_group__pk__in=root_alert_group_pks)
        with transaction.atomic():
            AlertGroup._bulk_restart_unresolve(user, root_alert_groups_unresolve)
            AlertGroup._bulk_restart_unresolve(user, dependent_alert_groups_unresolve)

        alert_groups_to_restart_unsilence = alert_groups.filter(
            resolved=False,
            acknowledged=False,
            silenced=True,
            root_alert_group__isnull=True,
        )
        AlertGroup._bulk_restart_unsilence(user, alert_groups_to_restart_unsilence)

    @staticmethod
    def _bulk_silence(user: User, alert_groups_to_silence: "QuerySet[AlertGroup]", silence_delay: int) -> None:
        """Silence ``alert_groups_to_silence`` in bulk for ``silence_delay`` seconds (non-positive = forever)."""
        from apps.alerts.models import AlertGroupLogRecord

        now = timezone.now()
        silence_for_period = silence_delay is not None and silence_delay > 0

        if silence_for_period:
            silence_delay_timedelta = datetime.timedelta(seconds=silence_delay)
            silenced_until = now + silence_delay_timedelta
        else:
            silence_delay_timedelta = None
            silenced_until = None

        alert_groups_to_unsilence_before_silence = alert_groups_to_silence.filter(
            silenced=True, acknowledged=False, resolved=False
        )
        alert_groups_to_unacknowledge_before_silence = alert_groups_to_silence.filter(resolved=False, acknowledged=True)
        alert_groups_to_unresolve_before_silence = alert_groups_to_silence.filter(resolved=True)

        # convert current qs to list to prevent changes by update
        alert_groups_to_silence_list = list(alert_groups_to_silence)
        alert_groups_to_unsilence_before_silence_list = list(alert_groups_to_unsilence_before_silence)
        alert_groups_to_unacknowledge_before_silence_list = list(alert_groups_to_unacknowledge_before_silence)
        alert_groups_to_unresolve_before_silence_list = list(alert_groups_to_unresolve_before_silence)

        previous_states = []
        for alert_group in alert_groups_to_silence_list:
            previous_states.append(alert_group.state)
            alert_group.acknowledged = False
            alert_group.acknowledged_at = None
            alert_group.acknowledged_by_user = None
            alert_group.acknowledged_by = AlertGroup.NOT_YET
            alert_group.resolved = False
            alert_group.resolved_at = None
            alert_group.resolved_by_user = None
            alert_group.resolved_by = AlertGroup.NOT_YET
            alert_group.silenced = True
            alert_group.silenced_at = now
            alert_group.silenced_until = silenced_until
            alert_group.silenced_by_user = user
            if not silence_for_period:
                alert_group.is_escalation_finished = True
            else:
                # Push escalation forward past the silence window.
                alert_group.update_next_step_eta(datetime.timedelta(seconds=silence_delay + START_ESCALATION_DELAY))
            if alert_group.response_time is None:
                alert_group.response_time = alert_group._get_response_time()

        fields_to_update = [
            "acknowledged",
            "acknowledged_at",
            "acknowledged_by_user",
            "acknowledged_by",
            "resolved",
            "resolved_at",
            "resolved_by_user",
            "resolved_by",
            "silenced",
            "silenced_at",
            "silenced_until",
            "silenced_by_user",
            "is_escalation_finished",
            "raw_escalation_snapshot",
            "response_time",
        ]
        AlertGroup.objects.bulk_update(alert_groups_to_silence_list, fields=fields_to_update, batch_size=100)

        # create log records
        for alert_group in alert_groups_to_unresolve_before_silence_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
                author=user,
                reason="Bulk action silence",
            )

        for alert_group in alert_groups_to_unsilence_before_silence_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_SILENCE,
                author=user,
                reason="Bulk action silence",
            )

        for alert_group in alert_groups_to_unacknowledge_before_silence_list:
            alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_UN_ACK,
                author=user,
                reason="Bulk action silence",
            )

        for alert_group, previous_state in zip(alert_groups_to_silence_list, previous_states):
            # update metrics cache
            alert_group._update_metrics(
                organization_id=user.organization_id,
                previous_state=previous_state,
                state=AlertGroupState.SILENCED,
            )
            log_record = alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_SILENCE,
                author=user,
                silence_delay=silence_delay_timedelta,
                reason="Bulk action silence",
            )
            transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

            if silence_for_period and alert_group.is_root_alert_group:
                alert_group.start_unsilence_task(countdown=silence_delay)

    @staticmethod
    def bulk_silence(user: User, alert_groups: "QuerySet[AlertGroup]", silence_delay: int) -> None:
        """Silence the given root alert groups and their dependents in one transaction."""
        root_alert_groups_to_silence = alert_groups.filter(
            root_alert_group__isnull=True,
            maintenance_uuid__isnull=True,  # don't silence maintenance incident
        )
        # convert qs to list to prevent changes by update
        root_alert_group_pks = list(root_alert_groups_to_silence.values_list("pk", flat=True))
        dependent_alert_groups_to_silence = alert_groups.filter(root_alert_group__pk__in=root_alert_group_pks)
        with transaction.atomic():
            AlertGroup._bulk_silence(user, root_alert_groups_to_silence, silence_delay)
            AlertGroup._bulk_silence(user, dependent_alert_groups_to_silence, silence_delay)

    def start_ack_reminder_if_needed(self) -> None:
        """Schedule the acknowledge-reminder task if the org's "remind every N hours" setting is enabled."""
        from apps.user_management.models import Organization

        if not self.is_root_alert_group:
            return

        # Check if the "Remind every N hours" setting is enabled
        countdown = Organization.ACKNOWLEDGE_REMIND_DELAY[self.channel.organization.acknowledge_remind_timeout]
        if not countdown:
            return

        # A fresh process id invalidates any previously scheduled reminder for this group.
        self.last_unique_unacknowledge_process_id = celery_uuid()
        self.save(update_fields=["last_unique_unacknowledge_process_id"])
        acknowledge_reminder_task.apply_async((self.pk, self.last_unique_unacknowledge_process_id), countdown=countdown)

    def start_unsilence_task(self, countdown):
        """Schedule the task that automatically unsilences this alert group after ``countdown`` seconds."""
        task_id = celery_uuid()
        self.unsilence_task_uuid = task_id
        self.save(update_fields=["unsilence_task_uuid"])
        unsilence_task.apply_async((self.pk,), task_id=task_id, countdown=countdown)

    @property
    def is_root_alert_group(self):
        # A root alert group is one not attached to another alert group.
        return self.root_alert_group is None

    def acknowledge(self, **kwargs):
        """Mark the alert group acknowledged; extra model fields may be passed as kwargs."""
        if not self.acknowledged:
            self.acknowledged = True
            self.acknowledged_at = timezone.now()
            for k, v in kwargs.items():
                setattr(self, k, v)
            update_fields = ["acknowledged", "acknowledged_at", *kwargs.keys()]
            if self.response_time is None:
                self.response_time = self._get_response_time()
                update_fields += ["response_time"]
            self.save(update_fields=update_fields)

    def unacknowledge(self):
        """Clear the acknowledged state (silence is cleared first)."""
        self.un_silence()
        if self.acknowledged:
            self.acknowledged = False
        self.acknowledged_at = None
        self.acknowledged_by_user = None
        self.acknowledged_by = AlertGroup.NOT_YET
        self.save(update_fields=["acknowledged", "acknowledged_at", "acknowledged_by_user", "acknowledged_by"])

    def resolve(self, **kwargs):
        """Mark the alert group resolved; extra model fields may be passed as kwargs."""
        if not self.resolved:
            self.resolved = True
            self.resolved_at = timezone.now()
            self.is_open_for_grouping = None
            for k, v in kwargs.items():
                setattr(self, k, v)
            update_fields = ["resolved", "resolved_at", "is_open_for_grouping", *kwargs.keys()]
            if self.response_time is None:
                self.response_time = self._get_response_time()
                update_fields += ["response_time"]
            self.save(update_fields=update_fields)

    def unresolve(self):
        """Clear the resolved state (acknowledgement is cleared first, which also clears silence)."""
        self.unacknowledge()
        if self.resolved:
            self.resolved = False
            self.resolved_at = None
            self.resolved_by = AlertGroup.NOT_YET
            self.resolved_by_user = None
            self.save(update_fields=["resolved", "resolved_at", "resolved_by", "resolved_by_user"])

    def silence(self, **kwargs):
        """Mark the alert group silenced; extra model fields may be passed as kwargs."""
        if not self.silenced:
            self.silenced = True
            if "silenced_at" not in kwargs:
                kwargs["silenced_at"] = timezone.now()
            for k, v in kwargs.items():
                setattr(self, k, v)
            update_fields = ["silenced", *kwargs.keys()]
            if self.response_time is None:
                self.response_time = self._get_response_time()
                update_fields += ["response_time"]
            self.save(update_fields=update_fields)

    def un_silence(self):
        """Clear all silence-related fields and drop the pending unsilence-task reference."""
        self.silenced_until = None
        self.silenced_by_user = None
        self.silenced_at = None
        self.silenced = False
        self.unsilence_task_uuid = None
        self.restarted_at = timezone.now()
        self.save(
            update_fields=[
                "silenced_until",
                "silenced",
                "silenced_by_user",
                "silenced_at",
                "unsilence_task_uuid",
                "restarted_at",
            ]
        )

    @property
    def long_verbose_name(self):
        # Slack-rendered title of the first alert, falling back to the default backup title.
        title = str_or_backup(self.slack_templated_first_alert.title, DEFAULT_BACKUP_TITLE)
        return title

    @property
    def long_verbose_name_without_formatting(self):
        # long_verbose_name with Slack formatting applied and markup stripped.
        sf = SlackFormatter(self.channel.organization)
        title = self.long_verbose_name
        title = sf.format(title)
        title = clean_markup(title)
        return title

    def get_resolve_text(self, mention_user=False):
        """Human-readable description of how/by whom this alert group was resolved."""
        if self.resolved_by == AlertGroup.SOURCE:
            return "Resolved by alert source"
        elif self.resolved_by == AlertGroup.LAST_STEP:
            return "Resolved automatically"
        elif self.resolved_by == AlertGroup.WIPED:
            return "Resolved by wipe"
        elif self.resolved_by == AlertGroup.DISABLE_MAINTENANCE:
            return "Resolved by stop maintenance"
        else:
            if self.resolved_by_user is not None:
                user_text = self.resolved_by_user.get_username_with_slack_verbal(mention=mention_user)
                return f"Resolved by {user_text}"
            else:
                return "Resolved"

    def get_acknowledge_text(self, mention_user=False):
        """Human-readable description of how/by whom this alert group was acknowledged."""
        if self.acknowledged_by == AlertGroup.SOURCE:
            return "Acknowledged by alert source"
        elif self.acknowledged_by == AlertGroup.USER and self.acknowledged_by_user is not None:
            user_text = self.acknowledged_by_user.get_username_with_slack_verbal(mention=mention_user)
            return f"Acknowledged by {user_text}"
        else:
            return "Acknowledged"

    def render_after_resolve_report_json(self) -> list[LogRecords]:
        """Render the post-resolve report: each log record (incl. resolution notes) as a JSON line."""
        from apps.alerts.models import AlertGroupLogRecord, ResolutionNote
        from apps.base.models import UserNotificationPolicyLogRecord

        log_builder = IncidentLogBuilder(self)
        log_records_list = log_builder.get_log_records_list(with_resolution_notes=True)
        result_log_report = list()

        for log_record in log_records_list:
            if type(log_record) is AlertGroupLogRecord:
                result_log_report.append(log_record.render_log_line_json())
            elif type(log_record) is UserNotificationPolicyLogRecord:
                result_log_report.append(log_record.rendered_notification_log_line_json)
            elif type(log_record) is ResolutionNote:
                result_log_report.append(log_record.render_log_line_json())
        return result_log_report

    @property
    def has_resolution_notes(self):
        return self.resolution_notes.exists()

    @property
    def state(self):
        # Precedence: resolved > acknowledged > silenced > firing.
        if self.resolved:
            return AlertGroupState.RESOLVED
        elif self.acknowledged:
            return AlertGroupState.ACKNOWLEDGED
        elif self.silenced:
            return AlertGroupState.SILENCED
        else:
            return AlertGroupState.FIRING

    @property
    def notify_in_slack_enabled(self):
        # Defaults to True when there is no channel filter to consult.
        channel_filter = self.channel_filter_with_respect_to_escalation_snapshot
        if channel_filter is not None:
            return channel_filter.notify_in_slack
        else:
            return True

    @property
    def is_presented_in_slack(self):
        return self.slack_message and self.channel.organization.slack_team_identity

    @property
    def slack_channel_id(self) -> str | None:
        # Prefer the channel the group was actually posted to; fall back to the route's channel.
        channel_filter = self.channel_filter
        if self.slack_message:
            return self.slack_message.channel.slack_id
        elif channel_filter and channel_filter.slack_channel_or_org_default:
            return channel_filter.slack_channel_or_org_default.slack_id
        return None

    @property
    def slack_message(self) -> typing.Optional["SlackMessage"]:
        """
        `slack_message` property returns the first `SlackMessage` for the `AlertGroup`. This corresponds to the
        Slack message representing the main message in Slack (ie. not a message in a thread).

        This should not be confused with `slack_messages`, which is a `RelatedManager` that returns all
        `SlackMessage` instances for the `AlertGroup`.
        """
        try:
            # prefetched_slack_messages could be set in apps.api.serializers.alert_group.AlertGroupListSerializer
            return self.prefetched_slack_messages[0] if self.prefetched_slack_messages else None
        except AttributeError:
            return self.slack_messages.order_by("created_at").first()

    @cached_property
    def last_stop_escalation_log(self):
        # Most recent log record that stopped escalation (resolve / ack / silence), if any.
        from apps.alerts.models import AlertGroupLogRecord

        stop_escalation_log = (
            self.log_records.filter(
                type__in=[
                    AlertGroupLogRecord.TYPE_RESOLVED,
                    AlertGroupLogRecord.TYPE_ACK,
                    AlertGroupLogRecord.TYPE_SILENCE,
                ]
            )
            .order_by("pk")
            .last()
        )
        return stop_escalation_log

    def alerts_count_gt(self, max_alerts) -> bool:
        """
        alerts_count_gt checks if there are more than max_alerts alerts in given alert group.
        It's optimized for alert groups with big number of alerts and relatively small max_alerts.
        """
        # Slice before counting so the DB never scans more than max_alerts + 1 rows.
        count = self.alerts.all()[: max_alerts + 1].count()
        return count > max_alerts