diff --git a/CHANGELOG.md b/CHANGELOG.md index 666b54a1..77134072 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Changed + +- Ensure alert group log records are committed to DB before signalling about them @mderynck([#3731](https://github.com/grafana/oncall/pull/3731)) + ### Fixed - Address `SlackAPIRatelimitError` exceptions in `apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel` task diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index d3d2479d..a663b571 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -20,8 +20,13 @@ from apps.alerts.escalation_snapshot.escalation_snapshot_mixin import START_ESCA from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE from apps.alerts.incident_appearance.renderers.slack_renderer import AlertGroupSlackRenderer from apps.alerts.incident_log_builder import IncidentLogBuilder -from apps.alerts.signals import alert_group_action_triggered_signal, alert_group_created_signal -from apps.alerts.tasks import acknowledge_reminder_task, send_alert_group_signal, unsilence_task +from apps.alerts.signals import alert_group_created_signal +from apps.alerts.tasks import ( + acknowledge_reminder_task, + send_alert_group_signal, + send_alert_group_signal_for_delete, + unsilence_task, +) from apps.metrics_exporter.tasks import update_metrics_for_alert_group from apps.slack.slack_formatter import SlackFormatter from apps.user_management.models import User @@ -639,20 +644,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.stop_escalation() self.start_ack_reminder_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.acknowledge_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.acknowledge_by_user(user, action_source=action_source) @@ -679,18 +681,15 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" + ) - alert_group_action_triggered_signal.send( - sender=self.acknowledge_by_source, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.acknowledge_by_source() @@ -707,20 +706,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. if self.is_root_alert_group: self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_acknowledge_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_acknowledge_by_user(user, action_source=action_source) @@ -745,20 +741,18 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) self.stop_escalation() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source - ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source + ) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_user(user, action_source=action_source) @@ -782,18 +776,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_source, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_source() @@ -809,18 +801,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_last_step, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_last_step() @@ -830,19 +820,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.resolve(resolved_by=AlertGroup.DISABLE_MAINTENANCE) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: disable maintenance" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_disable_maintenance, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: disable maintenance" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_disable_maintenance() @@ -856,24 +844,21 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state metric cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source + ) - if self.is_root_alert_group: - self.start_escalation_if_needed() + if self.is_root_alert_group: + self.start_escalation_if_needed() - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_resolve_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_resolve_by_user(user, action_source=action_source) @@ -898,25 +883,22 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. if not root_alert_group.silenced and self.silenced: self.un_silence_by_user(user, action_source=action_source) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_ATTACHED, - author=user, - root_alert_group=root_alert_group, - reason="Attach dropdown", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_ATTACHED, + author=user, + root_alert_group=root_alert_group, + reason="Attach dropdown", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) log_record_for_root_incident = root_alert_group.log_records.create( type=AlertGroupLogRecord.TYPE_ATTACHED, @@ -932,11 +914,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. f"'{log_record_for_root_incident.get_type_display()}', action source: {action_source}" ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record_for_root_incident.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk)) else: log_record = self.log_records.create( @@ -953,11 +931,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. f"action source: {action_source}" ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) def un_attach_by_user(self, user: User, action_source: typing.Optional[ActionSource] = None) -> None: from apps.alerts.models import AlertGroupLogRecord @@ -968,45 +942,38 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - author=user, - root_alert_group=root_alert_group, - reason="Unattach button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + author=user, + root_alert_group=root_alert_group, + reason="Unattach button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - log_record_for_root_incident = root_alert_group.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - author=user, - dependent_alert_group=self, - reason="Unattach dropdown", - action_source=action_source, - ) + log_record_for_root_incident = root_alert_group.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + author=user, + dependent_alert_group=self, + reason="Unattach dropdown", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, " - f"log record {log_record_for_root_incident.pk} " - f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, " + f"log record {log_record_for_root_incident.pk} " + f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_user, - log_record=log_record_for_root_incident.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk)) def un_attach_by_delete(self): from apps.alerts.models import AlertGroupLogRecord @@ -1016,22 +983,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - reason="Unattach by deleting root incident", - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + reason="Unattach by deleting root incident", + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: delete" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: delete" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_delete, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) def silence_by_user( self, user: User, silence_delay: typing.Optional[int], action_source: typing.Optional[ActionSource] = None @@ -1086,25 +1050,23 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_SILENCE, - author=user, - silence_delay=silence_delay_timedelta, - reason="Silence button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_SILENCE, + author=user, + silence_delay=silence_delay_timedelta, + reason="Silence button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - alert_group_action_triggered_signal.send( - sender=self.silence_by_user, - log_record=log_record.pk, - action_source=action_source, - ) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.silence_by_user(user, silence_delay, action_source) @@ -1120,26 +1082,24 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. if self.is_root_alert_group: self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_SILENCE, - author=user, - silence_delay=None, - # 2.Look like some time ago there was no TYPE_UN_SILENCE - reason="Unsilence button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_SILENCE, + author=user, + silence_delay=None, + # 2.Look like some time ago there was no TYPE_UN_SILENCE + reason="Unsilence button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - alert_group_action_triggered_signal.send( - sender=self.un_silence_by_user, - log_record=log_record.pk, - action_source=action_source, - ) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_silence_by_user(user, action_source=action_source) @@ -1169,22 +1129,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_WIPED, - author=user, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_WIPED, + author=user, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: wipe" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: wipe" + ) - alert_group_action_triggered_signal.send( - sender=self.wipe_by_user, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.wipe_by_user(user) @@ -1193,31 +1150,27 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. from apps.alerts.models import AlertGroupLogRecord self.stop_escalation() - # prevent creating multiple logs - # filter instead of get_or_create cause it can be multiple logs of this type due deleting error - log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last() - if not log_record: - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_DELETED, - author=user, + with transaction.atomic(): + # prevent creating multiple logs + # filter instead of get_or_create cause it can be multiple logs of this type due deleting error + log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last() + + if not log_record: + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_DELETED, + author=user, + ) + + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: delete" ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: delete" - ) - - alert_group_action_triggered_signal.send( - sender=self.delete_by_user, - log_record=log_record.pk, - action_source=None, # TODO: Action source is none - it is suspicious - # this flag forces synchrony call for action handler in representatives - # (for now it is actual only for Slack representative) - force_sync=True, - ) + transaction.on_commit(partial(send_alert_group_signal_for_delete.delay, self.pk, log_record.pk)) + def finish_delete_by_user(self): dependent_alerts = list(self.dependent_alert_groups.all()) self.hard_delete() diff --git a/engine/apps/alerts/models/alert_group_log_record.py b/engine/apps/alerts/models/alert_group_log_record.py index dfcabfa7..3187ebb7 100644 --- a/engine/apps/alerts/models/alert_group_log_record.py +++ b/engine/apps/alerts/models/alert_group_log_record.py @@ -594,6 +594,12 @@ class AlertGroupLogRecord(models.Model): step_specific_info = json.loads(self.step_specific_info) return step_specific_info + def delete(self): + logger.debug( + f"alert_group_log_record for alert_group deleted" f"alert_group={self.alert_group.pk} log_id={self.pk}" + ) + super().delete() + @receiver(post_save, sender=AlertGroupLogRecord) def listen_for_alertgrouplogrecord(sender, instance, created, *args, **kwargs): diff --git a/engine/apps/alerts/tasks/__init__.py b/engine/apps/alerts/tasks/__init__.py index 0dce9b3a..a736c921 100644 --- a/engine/apps/alerts/tasks/__init__.py +++ b/engine/apps/alerts/tasks/__init__.py @@ -7,6 +7,8 @@ from .check_escalation_finished import check_escalation_finished_task # noqa: F from .custom_button_result import custom_button_result # noqa: F401 from .custom_webhook_result import custom_webhook_result # noqa: F401 from .delete_alert_group import delete_alert_group # noqa: F401 +from .delete_alert_group import finish_delete_alert_group # noqa: F401 +from .delete_alert_group import send_alert_group_signal_for_delete # noqa: F401 from .distribute_alert import distribute_alert # noqa: F401 from .escalate_alert_group import escalate_alert_group # noqa: F401 from .invite_user_to_join_incident import invite_user_to_join_incident # noqa: F401 diff --git a/engine/apps/alerts/tasks/acknowledge_reminder.py b/engine/apps/alerts/tasks/acknowledge_reminder.py index e0e41500..73d86491 100644 --- a/engine/apps/alerts/tasks/acknowledge_reminder.py +++ b/engine/apps/alerts/tasks/acknowledge_reminder.py @@ -76,6 +76,7 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str log_record = alert_group.log_records.create( type=AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED, author=alert_group.acknowledged_by_user ) + task_logger.info(f"created log record {log_record.pk}, sending signal...") transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) diff --git a/engine/apps/alerts/tasks/delete_alert_group.py b/engine/apps/alerts/tasks/delete_alert_group.py index e97ce515..cd4d5807 100644 --- a/engine/apps/alerts/tasks/delete_alert_group.py +++ b/engine/apps/alerts/tasks/delete_alert_group.py @@ -1,6 +1,7 @@ from celery.utils.log import get_task_logger from django.conf import settings +from apps.alerts.signals import alert_group_action_triggered_signal from apps.slack.errors import SlackAPIRatelimitError from common.custom_celery_tasks import shared_dedicated_queue_retry_task @@ -10,7 +11,7 @@ logger = get_task_logger(__name__) @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None ) -def delete_alert_group(alert_group_pk, user_pk): +def delete_alert_group(alert_group_pk: int, user_pk: int) -> None: from apps.alerts.models import AlertGroup from apps.user_management.models import User @@ -25,9 +26,35 @@ def delete_alert_group(alert_group_pk, user_pk): return logger.debug(f"User {user} is deleting alert group {alert_group} (channel: {alert_group.channel})") + alert_group.delete_by_user(user) + +@shared_dedicated_queue_retry_task( + autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None +) +def send_alert_group_signal_for_delete(alert_group_pk: int, log_record_pk: int) -> None: try: - alert_group.delete_by_user(user) + alert_group_action_triggered_signal.send( + sender=send_alert_group_signal_for_delete, + log_record=log_record_pk, + force_sync=True, + ) except SlackAPIRatelimitError as e: # Handle Slack API ratelimit raised in apps.slack.scenarios.distribute_alerts.DeleteGroupStep.process_signal - delete_alert_group.apply_async((alert_group_pk, user_pk), countdown=e.retry_after) + send_alert_group_signal_for_delete.apply_async((alert_group_pk, log_record_pk), countdown=e.retry_after) + return + + finish_delete_alert_group.apply_async((alert_group_pk,)) + + +@shared_dedicated_queue_retry_task( + autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None +) +def finish_delete_alert_group(alert_group_pk: int) -> None: + from apps.alerts.models import AlertGroup + + alert_group = AlertGroup.objects.filter(pk=alert_group_pk).first() + if not alert_group: + logger.debug(f"Alert group id={alert_group_pk} not found, already deleted") + return + alert_group.finish_delete_by_user() diff --git a/engine/apps/alerts/tasks/send_alert_group_signal.py b/engine/apps/alerts/tasks/send_alert_group_signal.py index be632a2b..65f0b8f2 100644 --- a/engine/apps/alerts/tasks/send_alert_group_signal.py +++ b/engine/apps/alerts/tasks/send_alert_group_signal.py @@ -5,13 +5,15 @@ from django.conf import settings from apps.alerts.signals import alert_group_action_triggered_signal from common.custom_celery_tasks import shared_dedicated_queue_retry_task +from .task_logger import task_logger + @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=0 if settings.DEBUG else None ) def send_alert_group_signal(log_record_id): start_time = time.time() - + task_logger.info(f"sending signal for log record {log_record_id}") alert_group_action_triggered_signal.send(sender=send_alert_group_signal, log_record=log_record_id) print("--- %s seconds ---" % (time.time() - start_time)) diff --git a/engine/apps/alerts/tests/test_alert_group.py b/engine/apps/alerts/tests/test_alert_group.py index d2ba04d2..0d7166c6 100644 --- a/engine/apps/alerts/tests/test_alert_group.py +++ b/engine/apps/alerts/tests/test_alert_group.py @@ -6,7 +6,11 @@ from apps.alerts.constants import ActionSource from apps.alerts.incident_appearance.renderers.phone_call_renderer import AlertGroupPhoneCallRenderer from apps.alerts.models import AlertGroup, AlertGroupLogRecord from apps.alerts.tasks import wipe -from apps.alerts.tasks.delete_alert_group import delete_alert_group +from apps.alerts.tasks.delete_alert_group import ( + delete_alert_group, + finish_delete_alert_group, + send_alert_group_signal_for_delete, +) from apps.slack.client import SlackClient from apps.slack.errors import SlackAPIMessageNotFoundError, SlackAPIRatelimitError from apps.slack.models import SlackMessage @@ -85,9 +89,9 @@ def test_delete( make_alert, make_slack_message, make_resolution_note_slack_message, + django_capture_on_commit_callbacks, ): """test alert group deleting""" - organization, slack_team_identity = make_organization_with_slack_team_identity() user = make_user(organization=organization) @@ -119,7 +123,20 @@ def test_delete( assert alert_group.slack_messages.count() == 1 assert alert_group.resolution_note_slack_messages.count() == 2 - delete_alert_group(alert_group.pk, user.pk) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 + + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + assert mock_finish_delete_alert_group.call_count == 1 + + finish_delete_alert_group(alert_group.pk) assert not alert_group.alerts.exists() assert not alert_group.slack_messages.exists() @@ -140,10 +157,10 @@ def test_delete( @pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"]) -@patch.object(delete_alert_group, "apply_async") +@patch.object(send_alert_group_signal_for_delete, "apply_async") @pytest.mark.django_db def test_delete_slack_ratelimit( - mock_delete_alert_group, + mock_send_alert_group_signal_for_delete, api_method, make_organization_with_slack_team_identity, make_user, @@ -152,6 +169,7 @@ def test_delete_slack_ratelimit( make_alert, make_slack_message, make_resolution_note_slack_message, + django_capture_on_commit_callbacks, ): organization, slack_team_identity = make_organization_with_slack_team_identity() user = make_user(organization=organization) @@ -180,17 +198,31 @@ def test_delete_slack_ratelimit( ts="test2_ts", ) - with patch.object( - SlackClient, - api_method, - side_effect=SlackAPIRatelimitError( - response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42}) - ), - ): - delete_alert_group(alert_group.pk, user.pk) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 + + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + with patch.object( + SlackClient, + api_method, + side_effect=SlackAPIRatelimitError( + response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42}) + ), + ): + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + + assert mock_finish_delete_alert_group.call_count == 0 # Check task is retried gracefully - mock_delete_alert_group.assert_called_once_with((alert_group.pk, user.pk), countdown=42) + mock_send_alert_group_signal_for_delete.assert_called_once_with( + mock_send_alert_group_signal.call_args.args, countdown=42 + ) @pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"]) @@ -582,7 +614,7 @@ def test_filter_active_alert_groups( @patch("apps.alerts.models.AlertGroup.hard_delete") @patch("apps.alerts.models.AlertGroup.un_attach_by_delete") @patch("apps.alerts.models.AlertGroup.stop_escalation") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal") +@patch("apps.alerts.tasks.delete_alert_group.alert_group_action_triggered_signal") @pytest.mark.django_db def test_delete_by_user( mock_alert_group_action_triggered_signal, @@ -592,6 +624,7 @@ def test_delete_by_user( make_organization_and_user, make_alert_receive_channel, make_alert_group, + django_capture_on_commit_callbacks, ): organization, user = make_organization_and_user() alert_receive_channel = make_alert_receive_channel(organization) @@ -603,20 +636,31 @@ def test_delete_by_user( assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 0 - alert_group.delete_by_user(user) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 1 deleted_log_record = alert_group.log_records.get(type=AlertGroupLogRecord.TYPE_DELETED) - alert_group.stop_escalation.assert_called_once_with() + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + assert mock_finish_delete_alert_group.call_count == 1 + mock_alert_group_action_triggered_signal.send.assert_called_once_with( - sender=alert_group.delete_by_user, + sender=send_alert_group_signal_for_delete, log_record=deleted_log_record.pk, - action_source=None, force_sync=True, ) + finish_delete_alert_group(alert_group.pk) + alert_group.hard_delete.assert_called_once_with() for dependent_alert_group in dependent_alert_groups: diff --git a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py index 29fb52f7..fa05949b 100644 --- a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py +++ b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py @@ -31,7 +31,7 @@ def mock_apply_async(monkeypatch): @patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send") +@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") @pytest.mark.django_db @override_settings(CELERY_TASK_ALWAYS_EAGER=True) def test_update_metric_alert_groups_total_cache_on_action( @@ -142,7 +142,7 @@ def test_update_metric_alert_groups_total_cache_on_action( @patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send") +@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") @pytest.mark.django_db @override_settings(CELERY_TASK_ALWAYS_EAGER=True) def test_update_metric_alert_groups_response_time_cache_on_action( diff --git a/engine/apps/slack/representatives/alert_group_representative.py b/engine/apps/slack/representatives/alert_group_representative.py index 1ef8b5c5..67fd4ce6 100644 --- a/engine/apps/slack/representatives/alert_group_representative.py +++ b/engine/apps/slack/representatives/alert_group_representative.py @@ -2,6 +2,7 @@ import logging from celery.utils.log import get_task_logger from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from apps.alerts.constants import ActionSource from apps.alerts.representative import AlertGroupAbstractRepresentative @@ -49,14 +50,20 @@ def on_create_alert_slack_representative_async(alert_pk): @shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None + autoretry_for=(Exception,), + retry_backoff=True, + dont_autoretry_for=(ObjectDoesNotExist,), + max_retries=1 if settings.DEBUG else None, ) def on_alert_group_action_triggered_async(log_record_id): from apps.alerts.models import AlertGroupLogRecord - logger.debug(f"SLACK representative: get log record {log_record_id}") + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) + except AlertGroupLogRecord.DoesNotExist as e: + logger.warning(f"SLACK representative: log record {log_record_id} never created or has been deleted") + raise e - log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) alert_group_id = log_record.alert_group_id logger.debug(f"Start on_alert_group_action_triggered for alert_group {alert_group_id}, log record {log_record_id}") instance = AlertGroupSlackRepresentative(log_record) @@ -145,16 +152,25 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative): from apps.alerts.models import AlertGroupLogRecord log_record = kwargs["log_record"] - action_source = kwargs.get("action_source") force_sync = kwargs.get("force_sync", False) if isinstance(log_record, AlertGroupLogRecord): log_record_id = log_record.pk else: log_record_id = log_record - if action_source == ActionSource.SLACK or force_sync: + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) + except AlertGroupLogRecord.DoesNotExist: + logger.warning( + f"on_alert_group_action_triggered: log record {log_record_id} never created or has been deleted" + ) + return + + if log_record.action_source == ActionSource.SLACK or force_sync: + logger.debug(f"SLACK on_alert_group_action_triggered: sync {log_record_id} {force_sync}") on_alert_group_action_triggered_async(log_record_id) else: + logger.debug(f"SLACK on_alert_group_action_triggered: async {log_record_id} {force_sync}") on_alert_group_action_triggered_async.apply_async((log_record_id,)) @classmethod @@ -167,7 +183,11 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative): alert_group_id = alert_group.pk else: alert_group_id = alert_group - alert_group = AlertGroup.objects.get(pk=alert_group_id) + try: + alert_group = AlertGroup.objects.get(pk=alert_group_id) + except AlertGroup.DoesNotExist as e: + logger.warning(f"SLACK update log report: alert group {alert_group_id} has been deleted") + raise e logger.debug( f"Received alert_group_update_log_report signal in SLACK representative for alert_group {alert_group_id}" diff --git a/engine/apps/telegram/alert_group_representative.py b/engine/apps/telegram/alert_group_representative.py index 635420c6..ef3ae992 100644 --- a/engine/apps/telegram/alert_group_representative.py +++ b/engine/apps/telegram/alert_group_representative.py @@ -64,8 +64,13 @@ class AlertGroupTelegramRepresentative(AlertGroupAbstractRepresentative): def on_alert_group_update_log_report(cls, **kwargs): logger.info("AlertGroupTelegramRepresentative UPDATE LOG REPORT SIGNAL") alert_group = kwargs["alert_group"] + if not isinstance(alert_group, AlertGroup): - alert_group = AlertGroup.objects.get(pk=alert_group) + try: + alert_group = AlertGroup.objects.get(pk=alert_group) + except AlertGroup.DoesNotExist as e: + logger.warning(f"Telegram update log report: alert group {alert_group} has been deleted") + raise e messages_to_edit = alert_group.telegram_messages.filter( message_type__in=( diff --git a/engine/apps/telegram/tasks.py b/engine/apps/telegram/tasks.py index 41073848..5aeb9948 100644 --- a/engine/apps/telegram/tasks.py +++ b/engine/apps/telegram/tasks.py @@ -3,6 +3,7 @@ import logging from celery import uuid as celery_uuid from celery.utils.log import get_task_logger from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from telegram import error from apps.alerts.models import Alert, AlertGroup @@ -230,7 +231,10 @@ def on_create_alert_telegram_representative_async(self, alert_pk): @shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None + autoretry_for=(Exception,), + retry_backoff=True, + dont_autoretry_for=(ObjectDoesNotExist,), + max_retries=1 if settings.DEBUG else None, ) def on_alert_group_action_triggered_async(log_record_id): from apps.alerts.models import AlertGroupLogRecord @@ -239,18 +243,14 @@ def on_alert_group_action_triggered_async(log_record_id): logger.info(f"AlertGroupTelegramRepresentative ACTION SIGNAL, log record {log_record_id}") # temporary solution to handle cases when alert group and related log records were deleted + try: log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) except AlertGroupLogRecord.DoesNotExist as e: - retries_count = on_alert_group_action_triggered_async.request.retries - if retries_count >= 10: - logger.error( - f"AlertGroupTelegramRepresentative: was not able to get AlertGroupLogRecord, probably alert group " - f"was deleted. log record {log_record_id}, retries: {retries_count}" - ) - return - else: - raise e + logger.warning( + f"AlertGroupTelegramRepresentative: log record {log_record_id} never created or has been deleted" + ) + raise e instance = AlertGroupTelegramRepresentative(log_record) if instance.is_applicable(): diff --git a/engine/apps/webhooks/listeners.py b/engine/apps/webhooks/listeners.py index 5ff9b192..ebbed55b 100644 --- a/engine/apps/webhooks/listeners.py +++ b/engine/apps/webhooks/listeners.py @@ -15,5 +15,9 @@ def on_action_triggered(**kwargs): log_record = kwargs["log_record"] if not isinstance(log_record, AlertGroupLogRecord): - log_record = AlertGroupLogRecord.objects.get(pk=log_record) + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record) + except AlertGroupLogRecord.DoesNotExist as e: + logger.warning(f"Webhook action triggered: log record {log_record} never created or has been deleted") + raise e alert_group_status_change.apply_async((log_record.type, log_record.alert_group_id, log_record.author_id)) diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index 0a6f319c..73dea53d 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -4,6 +4,8 @@ CELERY_TASK_ROUTES = { "queue": "default" }, "apps.alerts.tasks.delete_alert_group.delete_alert_group": {"queue": "default"}, + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete": {"queue": "default"}, + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group": {"queue": "default"}, "apps.alerts.tasks.invalidate_web_cache_for_alert_group.invalidate_web_cache_for_alert_group": {"queue": "default"}, "apps.alerts.tasks.send_alert_group_signal.send_alert_group_signal": {"queue": "default"}, "apps.alerts.tasks.wipe.wipe": {"queue": "default"},