From 2a466a0c4f7209ffe22b300639b1377cdc84189d Mon Sep 17 00:00:00 2001 From: Michael Derynck Date: Wed, 31 Jan 2024 15:54:50 -0700 Subject: [PATCH] Add transaction on_commit before signals for alert group actions (#3731) # What this PR does Add transactions around log record creation and check transaction on_commit before sending signals passing DB id of alert group log records. In cases for delete we can then assume any missing IDs on tasks are from intentionally deleted alert groups and we can stop tasks from retrying endlessly. ## Which issue(s) this PR fixes ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) --- CHANGELOG.md | 4 + engine/apps/alerts/models/alert_group.py | 431 ++++++++---------- .../alerts/models/alert_group_log_record.py | 6 + engine/apps/alerts/tasks/__init__.py | 2 + .../apps/alerts/tasks/acknowledge_reminder.py | 1 + .../apps/alerts/tasks/delete_alert_group.py | 33 +- .../alerts/tasks/send_alert_group_signal.py | 4 +- engine/apps/alerts/tests/test_alert_group.py | 82 +++- .../tests/test_update_metrics_cache.py | 4 +- .../alert_group_representative.py | 32 +- .../telegram/alert_group_representative.py | 7 +- engine/apps/telegram/tasks.py | 20 +- engine/apps/webhooks/listeners.py | 6 +- engine/settings/celery_task_routes.py | 2 + 14 files changed, 352 insertions(+), 282 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 666b54a1..77134072 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Changed + +- Ensure alert group log records are committed to DB before signalling about them @mderynck([#3731](https://github.com/grafana/oncall/pull/3731)) + ### Fixed - Address `SlackAPIRatelimitError` exceptions in `apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel` task diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index d3d2479d..a663b571 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -20,8 +20,13 @@ from apps.alerts.escalation_snapshot.escalation_snapshot_mixin import START_ESCA from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE from apps.alerts.incident_appearance.renderers.slack_renderer import AlertGroupSlackRenderer from apps.alerts.incident_log_builder import IncidentLogBuilder -from apps.alerts.signals import alert_group_action_triggered_signal, alert_group_created_signal -from apps.alerts.tasks import acknowledge_reminder_task, send_alert_group_signal, unsilence_task +from apps.alerts.signals import alert_group_created_signal +from apps.alerts.tasks import ( + acknowledge_reminder_task, + send_alert_group_signal, + send_alert_group_signal_for_delete, + unsilence_task, +) from apps.metrics_exporter.tasks import update_metrics_for_alert_group from apps.slack.slack_formatter import SlackFormatter from apps.user_management.models import User @@ -639,20 +644,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.stop_escalation() self.start_ack_reminder_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.acknowledge_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.acknowledge_by_user(user, action_source=action_source) @@ -679,18 +681,15 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" + ) - alert_group_action_triggered_signal.send( - sender=self.acknowledge_by_source, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.acknowledge_by_source() @@ -707,20 +706,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. if self.is_root_alert_group: self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_acknowledge_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_acknowledge_by_user(user, action_source=action_source) @@ -745,20 +741,18 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) self.stop_escalation() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source - ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source + ) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_user(user, action_source=action_source) @@ -782,18 +776,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_source, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_source() @@ -809,18 +801,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_last_step, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_last_step() @@ -830,19 +820,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.resolve(resolved_by=AlertGroup.DISABLE_MAINTENANCE) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: disable maintenance" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_disable_maintenance, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: disable maintenance" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_disable_maintenance() @@ -856,24 +844,21 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state metric cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source + ) - if self.is_root_alert_group: - self.start_escalation_if_needed() + if self.is_root_alert_group: + self.start_escalation_if_needed() - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_resolve_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_resolve_by_user(user, action_source=action_source) @@ -898,25 +883,22 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. if not root_alert_group.silenced and self.silenced: self.un_silence_by_user(user, action_source=action_source) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_ATTACHED, - author=user, - root_alert_group=root_alert_group, - reason="Attach dropdown", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_ATTACHED, + author=user, + root_alert_group=root_alert_group, + reason="Attach dropdown", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) log_record_for_root_incident = root_alert_group.log_records.create( type=AlertGroupLogRecord.TYPE_ATTACHED, @@ -932,11 +914,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. f"'{log_record_for_root_incident.get_type_display()}', action source: {action_source}" ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record_for_root_incident.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk)) else: log_record = self.log_records.create( @@ -953,11 +931,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. f"action source: {action_source}" ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) def un_attach_by_user(self, user: User, action_source: typing.Optional[ActionSource] = None) -> None: from apps.alerts.models import AlertGroupLogRecord @@ -968,45 +942,38 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - author=user, - root_alert_group=root_alert_group, - reason="Unattach button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + author=user, + root_alert_group=root_alert_group, + reason="Unattach button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - log_record_for_root_incident = root_alert_group.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - author=user, - dependent_alert_group=self, - reason="Unattach dropdown", - action_source=action_source, - ) + log_record_for_root_incident = root_alert_group.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + author=user, + dependent_alert_group=self, + reason="Unattach dropdown", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, " - f"log record {log_record_for_root_incident.pk} " - f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, " + f"log record {log_record_for_root_incident.pk} " + f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_user, - log_record=log_record_for_root_incident.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk)) def un_attach_by_delete(self): from apps.alerts.models import AlertGroupLogRecord @@ -1016,22 +983,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - reason="Unattach by deleting root incident", - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + reason="Unattach by deleting root incident", + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: delete" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: delete" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_delete, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) def silence_by_user( self, user: User, silence_delay: typing.Optional[int], action_source: typing.Optional[ActionSource] = None @@ -1086,25 +1050,23 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_SILENCE, - author=user, - silence_delay=silence_delay_timedelta, - reason="Silence button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_SILENCE, + author=user, + silence_delay=silence_delay_timedelta, + reason="Silence button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - alert_group_action_triggered_signal.send( - sender=self.silence_by_user, - log_record=log_record.pk, - action_source=action_source, - ) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.silence_by_user(user, silence_delay, action_source) @@ -1120,26 +1082,24 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. if self.is_root_alert_group: self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_SILENCE, - author=user, - silence_delay=None, - # 2.Look like some time ago there was no TYPE_UN_SILENCE - reason="Unsilence button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_SILENCE, + author=user, + silence_delay=None, + # 2.Look like some time ago there was no TYPE_UN_SILENCE + reason="Unsilence button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - alert_group_action_triggered_signal.send( - sender=self.un_silence_by_user, - log_record=log_record.pk, - action_source=action_source, - ) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_silence_by_user(user, action_source=action_source) @@ -1169,22 +1129,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_WIPED, - author=user, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_WIPED, + author=user, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: wipe" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: wipe" + ) - alert_group_action_triggered_signal.send( - sender=self.wipe_by_user, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.wipe_by_user(user) @@ -1193,31 +1150,27 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. from apps.alerts.models import AlertGroupLogRecord self.stop_escalation() - # prevent creating multiple logs - # filter instead of get_or_create cause it can be multiple logs of this type due deleting error - log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last() - if not log_record: - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_DELETED, - author=user, + with transaction.atomic(): + # prevent creating multiple logs + # filter instead of get_or_create cause it can be multiple logs of this type due deleting error + log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last() + + if not log_record: + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_DELETED, + author=user, + ) + + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: delete" ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: delete" - ) - - alert_group_action_triggered_signal.send( - sender=self.delete_by_user, - log_record=log_record.pk, - action_source=None, # TODO: Action source is none - it is suspicious - # this flag forces synchrony call for action handler in representatives - # (for now it is actual only for Slack representative) - force_sync=True, - ) + transaction.on_commit(partial(send_alert_group_signal_for_delete.delay, self.pk, log_record.pk)) + def finish_delete_by_user(self): dependent_alerts = list(self.dependent_alert_groups.all()) self.hard_delete() diff --git a/engine/apps/alerts/models/alert_group_log_record.py b/engine/apps/alerts/models/alert_group_log_record.py index dfcabfa7..3187ebb7 100644 --- a/engine/apps/alerts/models/alert_group_log_record.py +++ b/engine/apps/alerts/models/alert_group_log_record.py @@ -594,6 +594,12 @@ class AlertGroupLogRecord(models.Model): step_specific_info = json.loads(self.step_specific_info) return step_specific_info + def delete(self): + logger.debug( + f"alert_group_log_record for alert_group deleted" f"alert_group={self.alert_group.pk} log_id={self.pk}" + ) + super().delete() + @receiver(post_save, sender=AlertGroupLogRecord) def listen_for_alertgrouplogrecord(sender, instance, created, *args, **kwargs): diff --git a/engine/apps/alerts/tasks/__init__.py b/engine/apps/alerts/tasks/__init__.py index 0dce9b3a..a736c921 100644 --- a/engine/apps/alerts/tasks/__init__.py +++ b/engine/apps/alerts/tasks/__init__.py @@ -7,6 +7,8 @@ from .check_escalation_finished import check_escalation_finished_task # noqa: F from .custom_button_result import custom_button_result # noqa: F401 from .custom_webhook_result import custom_webhook_result # noqa: F401 from .delete_alert_group import delete_alert_group # noqa: F401 +from .delete_alert_group import finish_delete_alert_group # noqa: F401 +from .delete_alert_group import send_alert_group_signal_for_delete # noqa: F401 from .distribute_alert import distribute_alert # noqa: F401 from .escalate_alert_group import escalate_alert_group # noqa: F401 from .invite_user_to_join_incident import invite_user_to_join_incident # noqa: F401 diff --git a/engine/apps/alerts/tasks/acknowledge_reminder.py b/engine/apps/alerts/tasks/acknowledge_reminder.py index e0e41500..73d86491 100644 --- a/engine/apps/alerts/tasks/acknowledge_reminder.py +++ b/engine/apps/alerts/tasks/acknowledge_reminder.py @@ -76,6 +76,7 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str log_record = alert_group.log_records.create( type=AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED, author=alert_group.acknowledged_by_user ) + task_logger.info(f"created log record {log_record.pk}, sending signal...") transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) diff --git a/engine/apps/alerts/tasks/delete_alert_group.py b/engine/apps/alerts/tasks/delete_alert_group.py index e97ce515..cd4d5807 100644 --- a/engine/apps/alerts/tasks/delete_alert_group.py +++ b/engine/apps/alerts/tasks/delete_alert_group.py @@ -1,6 +1,7 @@ from celery.utils.log import get_task_logger from django.conf import settings +from apps.alerts.signals import alert_group_action_triggered_signal from apps.slack.errors import SlackAPIRatelimitError from common.custom_celery_tasks import shared_dedicated_queue_retry_task @@ -10,7 +11,7 @@ logger = get_task_logger(__name__) @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None ) -def delete_alert_group(alert_group_pk, user_pk): +def delete_alert_group(alert_group_pk: int, user_pk: int) -> None: from apps.alerts.models import AlertGroup from apps.user_management.models import User @@ -25,9 +26,35 @@ def delete_alert_group(alert_group_pk, user_pk): return logger.debug(f"User {user} is deleting alert group {alert_group} (channel: {alert_group.channel})") + alert_group.delete_by_user(user) + +@shared_dedicated_queue_retry_task( + autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None +) +def send_alert_group_signal_for_delete(alert_group_pk: int, log_record_pk: int) -> None: try: - alert_group.delete_by_user(user) + alert_group_action_triggered_signal.send( + sender=send_alert_group_signal_for_delete, + log_record=log_record_pk, + force_sync=True, + ) except SlackAPIRatelimitError as e: # Handle Slack API ratelimit raised in apps.slack.scenarios.distribute_alerts.DeleteGroupStep.process_signal - delete_alert_group.apply_async((alert_group_pk, user_pk), countdown=e.retry_after) + send_alert_group_signal_for_delete.apply_async((alert_group_pk, log_record_pk), countdown=e.retry_after) + return + + finish_delete_alert_group.apply_async((alert_group_pk,)) + + +@shared_dedicated_queue_retry_task( + autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None +) +def finish_delete_alert_group(alert_group_pk: int) -> None: + from apps.alerts.models import AlertGroup + + alert_group = AlertGroup.objects.filter(pk=alert_group_pk).first() + if not alert_group: + logger.debug(f"Alert group id={alert_group_pk} not found, already deleted") + return + alert_group.finish_delete_by_user() diff --git a/engine/apps/alerts/tasks/send_alert_group_signal.py b/engine/apps/alerts/tasks/send_alert_group_signal.py index be632a2b..65f0b8f2 100644 --- a/engine/apps/alerts/tasks/send_alert_group_signal.py +++ b/engine/apps/alerts/tasks/send_alert_group_signal.py @@ -5,13 +5,15 @@ from django.conf import settings from apps.alerts.signals import alert_group_action_triggered_signal from common.custom_celery_tasks import shared_dedicated_queue_retry_task +from .task_logger import task_logger + @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=0 if settings.DEBUG else None ) def send_alert_group_signal(log_record_id): start_time = time.time() - + task_logger.info(f"sending signal for log record {log_record_id}") alert_group_action_triggered_signal.send(sender=send_alert_group_signal, log_record=log_record_id) print("--- %s seconds ---" % (time.time() - start_time)) diff --git a/engine/apps/alerts/tests/test_alert_group.py b/engine/apps/alerts/tests/test_alert_group.py index d2ba04d2..0d7166c6 100644 --- a/engine/apps/alerts/tests/test_alert_group.py +++ b/engine/apps/alerts/tests/test_alert_group.py @@ -6,7 +6,11 @@ from apps.alerts.constants import ActionSource from apps.alerts.incident_appearance.renderers.phone_call_renderer import AlertGroupPhoneCallRenderer from apps.alerts.models import AlertGroup, AlertGroupLogRecord from apps.alerts.tasks import wipe -from apps.alerts.tasks.delete_alert_group import delete_alert_group +from apps.alerts.tasks.delete_alert_group import ( + delete_alert_group, + finish_delete_alert_group, + send_alert_group_signal_for_delete, +) from apps.slack.client import SlackClient from apps.slack.errors import SlackAPIMessageNotFoundError, SlackAPIRatelimitError from apps.slack.models import SlackMessage @@ -85,9 +89,9 @@ def test_delete( make_alert, make_slack_message, make_resolution_note_slack_message, + django_capture_on_commit_callbacks, ): """test alert group deleting""" - organization, slack_team_identity = make_organization_with_slack_team_identity() user = make_user(organization=organization) @@ -119,7 +123,20 @@ def test_delete( assert alert_group.slack_messages.count() == 1 assert alert_group.resolution_note_slack_messages.count() == 2 - delete_alert_group(alert_group.pk, user.pk) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 + + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + assert mock_finish_delete_alert_group.call_count == 1 + + finish_delete_alert_group(alert_group.pk) assert not alert_group.alerts.exists() assert not alert_group.slack_messages.exists() @@ -140,10 +157,10 @@ def test_delete( @pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"]) -@patch.object(delete_alert_group, "apply_async") +@patch.object(send_alert_group_signal_for_delete, "apply_async") @pytest.mark.django_db def test_delete_slack_ratelimit( - mock_delete_alert_group, + mock_send_alert_group_signal_for_delete, api_method, make_organization_with_slack_team_identity, make_user, @@ -152,6 +169,7 @@ def test_delete_slack_ratelimit( make_alert, make_slack_message, make_resolution_note_slack_message, + django_capture_on_commit_callbacks, ): organization, slack_team_identity = make_organization_with_slack_team_identity() user = make_user(organization=organization) @@ -180,17 +198,31 @@ def test_delete_slack_ratelimit( ts="test2_ts", ) - with patch.object( - SlackClient, - api_method, - side_effect=SlackAPIRatelimitError( - response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42}) - ), - ): - delete_alert_group(alert_group.pk, user.pk) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 + + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + with patch.object( + SlackClient, + api_method, + side_effect=SlackAPIRatelimitError( + response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42}) + ), + ): + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + + assert mock_finish_delete_alert_group.call_count == 0 # Check task is retried gracefully - mock_delete_alert_group.assert_called_once_with((alert_group.pk, user.pk), countdown=42) + mock_send_alert_group_signal_for_delete.assert_called_once_with( + mock_send_alert_group_signal.call_args.args, countdown=42 + ) @pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"]) @@ -582,7 +614,7 @@ def test_filter_active_alert_groups( @patch("apps.alerts.models.AlertGroup.hard_delete") @patch("apps.alerts.models.AlertGroup.un_attach_by_delete") @patch("apps.alerts.models.AlertGroup.stop_escalation") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal") +@patch("apps.alerts.tasks.delete_alert_group.alert_group_action_triggered_signal") @pytest.mark.django_db def test_delete_by_user( mock_alert_group_action_triggered_signal, @@ -592,6 +624,7 @@ def test_delete_by_user( make_organization_and_user, make_alert_receive_channel, make_alert_group, + django_capture_on_commit_callbacks, ): organization, user = make_organization_and_user() alert_receive_channel = make_alert_receive_channel(organization) @@ -603,20 +636,31 @@ def test_delete_by_user( assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 0 - alert_group.delete_by_user(user) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 1 deleted_log_record = alert_group.log_records.get(type=AlertGroupLogRecord.TYPE_DELETED) - alert_group.stop_escalation.assert_called_once_with() + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + assert mock_finish_delete_alert_group.call_count == 1 + mock_alert_group_action_triggered_signal.send.assert_called_once_with( - sender=alert_group.delete_by_user, + sender=send_alert_group_signal_for_delete, log_record=deleted_log_record.pk, - action_source=None, force_sync=True, ) + finish_delete_alert_group(alert_group.pk) + alert_group.hard_delete.assert_called_once_with() for dependent_alert_group in dependent_alert_groups: diff --git a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py index 29fb52f7..fa05949b 100644 --- a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py +++ b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py @@ -31,7 +31,7 @@ def mock_apply_async(monkeypatch): @patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send") +@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") @pytest.mark.django_db @override_settings(CELERY_TASK_ALWAYS_EAGER=True) def test_update_metric_alert_groups_total_cache_on_action( @@ -142,7 +142,7 @@ def test_update_metric_alert_groups_total_cache_on_action( @patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send") +@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") @pytest.mark.django_db @override_settings(CELERY_TASK_ALWAYS_EAGER=True) def test_update_metric_alert_groups_response_time_cache_on_action( diff --git a/engine/apps/slack/representatives/alert_group_representative.py b/engine/apps/slack/representatives/alert_group_representative.py index 1ef8b5c5..67fd4ce6 100644 --- a/engine/apps/slack/representatives/alert_group_representative.py +++ b/engine/apps/slack/representatives/alert_group_representative.py @@ -2,6 +2,7 @@ import logging from celery.utils.log import get_task_logger from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from apps.alerts.constants import ActionSource from apps.alerts.representative import AlertGroupAbstractRepresentative @@ -49,14 +50,20 @@ def on_create_alert_slack_representative_async(alert_pk): @shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None + autoretry_for=(Exception,), + retry_backoff=True, + dont_autoretry_for=(ObjectDoesNotExist,), + max_retries=1 if settings.DEBUG else None, ) def on_alert_group_action_triggered_async(log_record_id): from apps.alerts.models import AlertGroupLogRecord - logger.debug(f"SLACK representative: get log record {log_record_id}") + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) + except AlertGroupLogRecord.DoesNotExist as e: + logger.warning(f"SLACK representative: log record {log_record_id} never created or has been deleted") + raise e - log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) alert_group_id = log_record.alert_group_id logger.debug(f"Start on_alert_group_action_triggered for alert_group {alert_group_id}, log record {log_record_id}") instance = AlertGroupSlackRepresentative(log_record) @@ -145,16 +152,25 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative): from apps.alerts.models import AlertGroupLogRecord log_record = kwargs["log_record"] - action_source = kwargs.get("action_source") force_sync = kwargs.get("force_sync", False) if isinstance(log_record, AlertGroupLogRecord): log_record_id = log_record.pk else: log_record_id = log_record - if action_source == ActionSource.SLACK or force_sync: + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) + except AlertGroupLogRecord.DoesNotExist: + logger.warning( + f"on_alert_group_action_triggered: log record {log_record_id} never created or has been deleted" + ) + return + + if log_record.action_source == ActionSource.SLACK or force_sync: + logger.debug(f"SLACK on_alert_group_action_triggered: sync {log_record_id} {force_sync}") on_alert_group_action_triggered_async(log_record_id) else: + logger.debug(f"SLACK on_alert_group_action_triggered: async {log_record_id} {force_sync}") on_alert_group_action_triggered_async.apply_async((log_record_id,)) @classmethod @@ -167,7 +183,11 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative): alert_group_id = alert_group.pk else: alert_group_id = alert_group - alert_group = AlertGroup.objects.get(pk=alert_group_id) + try: + alert_group = AlertGroup.objects.get(pk=alert_group_id) + except AlertGroup.DoesNotExist as e: + logger.warning(f"SLACK update log report: alert group {alert_group_id} has been deleted") + raise e logger.debug( f"Received alert_group_update_log_report signal in SLACK representative for alert_group {alert_group_id}" diff --git a/engine/apps/telegram/alert_group_representative.py b/engine/apps/telegram/alert_group_representative.py index 635420c6..ef3ae992 100644 --- a/engine/apps/telegram/alert_group_representative.py +++ b/engine/apps/telegram/alert_group_representative.py @@ -64,8 +64,13 @@ class AlertGroupTelegramRepresentative(AlertGroupAbstractRepresentative): def on_alert_group_update_log_report(cls, **kwargs): logger.info("AlertGroupTelegramRepresentative UPDATE LOG REPORT SIGNAL") alert_group = kwargs["alert_group"] + if not isinstance(alert_group, AlertGroup): - alert_group = AlertGroup.objects.get(pk=alert_group) + try: + alert_group = AlertGroup.objects.get(pk=alert_group) + except AlertGroup.DoesNotExist as e: + logger.warning(f"Telegram update log report: alert group {alert_group} has been deleted") + raise e messages_to_edit = alert_group.telegram_messages.filter( message_type__in=( diff --git a/engine/apps/telegram/tasks.py b/engine/apps/telegram/tasks.py index 41073848..5aeb9948 100644 --- a/engine/apps/telegram/tasks.py +++ b/engine/apps/telegram/tasks.py @@ -3,6 +3,7 @@ import logging from celery import uuid as celery_uuid from celery.utils.log import get_task_logger from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from telegram import error from apps.alerts.models import Alert, AlertGroup @@ -230,7 +231,10 @@ def on_create_alert_telegram_representative_async(self, alert_pk): @shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None + autoretry_for=(Exception,), + retry_backoff=True, + dont_autoretry_for=(ObjectDoesNotExist,), + max_retries=1 if settings.DEBUG else None, ) def on_alert_group_action_triggered_async(log_record_id): from apps.alerts.models import AlertGroupLogRecord @@ -239,18 +243,14 @@ def on_alert_group_action_triggered_async(log_record_id): logger.info(f"AlertGroupTelegramRepresentative ACTION SIGNAL, log record {log_record_id}") # temporary solution to handle cases when alert group and related log records were deleted + try: log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) except AlertGroupLogRecord.DoesNotExist as e: - retries_count = on_alert_group_action_triggered_async.request.retries - if retries_count >= 10: - logger.error( - f"AlertGroupTelegramRepresentative: was not able to get AlertGroupLogRecord, probably alert group " - f"was deleted. log record {log_record_id}, retries: {retries_count}" - ) - return - else: - raise e + logger.warning( + f"AlertGroupTelegramRepresentative: log record {log_record_id} never created or has been deleted" + ) + raise e instance = AlertGroupTelegramRepresentative(log_record) if instance.is_applicable(): diff --git a/engine/apps/webhooks/listeners.py b/engine/apps/webhooks/listeners.py index 5ff9b192..ebbed55b 100644 --- a/engine/apps/webhooks/listeners.py +++ b/engine/apps/webhooks/listeners.py @@ -15,5 +15,9 @@ def on_action_triggered(**kwargs): log_record = kwargs["log_record"] if not isinstance(log_record, AlertGroupLogRecord): - log_record = AlertGroupLogRecord.objects.get(pk=log_record) + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record) + except AlertGroupLogRecord.DoesNotExist as e: + logger.warning(f"Webhook action triggered: log record {log_record} never created or has been deleted") + raise e alert_group_status_change.apply_async((log_record.type, log_record.alert_group_id, log_record.author_id)) diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index 0a6f319c..73dea53d 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -4,6 +4,8 @@ CELERY_TASK_ROUTES = { "queue": "default" }, "apps.alerts.tasks.delete_alert_group.delete_alert_group": {"queue": "default"}, + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete": {"queue": "default"}, + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group": {"queue": "default"}, "apps.alerts.tasks.invalidate_web_cache_for_alert_group.invalidate_web_cache_for_alert_group": {"queue": "default"}, "apps.alerts.tasks.send_alert_group_signal.send_alert_group_signal": {"queue": "default"}, "apps.alerts.tasks.wipe.wipe": {"queue": "default"},