diff --git a/CHANGELOG.md b/CHANGELOG.md index 670760f1..7bab77f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## v1.3.97 (2024-01-31) + +### Changed + +- Ensure alert group log records are committed to DB before signalling about them @mderynck([#3731](https://github.com/grafana/oncall/pull/3731)) + +### Fixed + +- Address `SlackAPIRatelimitError` exceptions in `apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel` task + by @joeyorlando ([#3803](https://github.com/grafana/oncall/pull/3803)) +- Fix exception when parsing incident plugin config @mderynck ([#3802](https://github.com/grafana/oncall/pull/3802)) + ## v1.3.96 (2024-01-31) ### Added diff --git a/docker-compose-developer.yml b/docker-compose-developer.yml index f6d62e8f..e7189dcc 100644 --- a/docker-compose-developer.yml +++ b/docker-compose-developer.yml @@ -9,6 +9,9 @@ x-oncall-build: &oncall-build-args context: ./engine target: ${ONCALL_IMAGE_TARGET:-dev} labels: *oncall-labels + cache_from: + - grafana/oncall:latest + - grafana/oncall:dev x-oncall-volumes: &oncall-volumes - ./engine:/etc/app diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index d3d2479d..a663b571 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -20,8 +20,13 @@ from apps.alerts.escalation_snapshot.escalation_snapshot_mixin import START_ESCA from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE from apps.alerts.incident_appearance.renderers.slack_renderer import AlertGroupSlackRenderer from apps.alerts.incident_log_builder import IncidentLogBuilder -from apps.alerts.signals import alert_group_action_triggered_signal, alert_group_created_signal -from apps.alerts.tasks import acknowledge_reminder_task, send_alert_group_signal, unsilence_task +from apps.alerts.signals import alert_group_created_signal +from apps.alerts.tasks import ( + acknowledge_reminder_task, + send_alert_group_signal, + send_alert_group_signal_for_delete, + unsilence_task, +) from apps.metrics_exporter.tasks import update_metrics_for_alert_group from apps.slack.slack_formatter import SlackFormatter from apps.user_management.models import User @@ -639,20 +644,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.stop_escalation() self.start_ack_reminder_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.acknowledge_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.acknowledge_by_user(user, action_source=action_source) @@ -679,18 +681,15 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" + ) - alert_group_action_triggered_signal.send( - sender=self.acknowledge_by_source, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.acknowledge_by_source() @@ -707,20 +706,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. if self.is_root_alert_group: self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_acknowledge_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_acknowledge_by_user(user, action_source=action_source) @@ -745,20 +741,18 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) self.stop_escalation() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source - ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source + ) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_user(user, action_source=action_source) @@ -782,18 +776,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_source, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_source() @@ -809,18 +801,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_last_step, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_last_step() @@ -830,19 +820,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.resolve(resolved_by=AlertGroup.DISABLE_MAINTENANCE) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: disable maintenance" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_disable_maintenance, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: disable maintenance" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_disable_maintenance() @@ -856,24 +844,21 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state metric cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source + ) - if self.is_root_alert_group: - self.start_escalation_if_needed() + if self.is_root_alert_group: + self.start_escalation_if_needed() - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_resolve_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_resolve_by_user(user, action_source=action_source) @@ -898,25 +883,22 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. if not root_alert_group.silenced and self.silenced: self.un_silence_by_user(user, action_source=action_source) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_ATTACHED, - author=user, - root_alert_group=root_alert_group, - reason="Attach dropdown", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_ATTACHED, + author=user, + root_alert_group=root_alert_group, + reason="Attach dropdown", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) log_record_for_root_incident = root_alert_group.log_records.create( type=AlertGroupLogRecord.TYPE_ATTACHED, @@ -932,11 +914,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. f"'{log_record_for_root_incident.get_type_display()}', action source: {action_source}" ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record_for_root_incident.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk)) else: log_record = self.log_records.create( @@ -953,11 +931,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. f"action source: {action_source}" ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) def un_attach_by_user(self, user: User, action_source: typing.Optional[ActionSource] = None) -> None: from apps.alerts.models import AlertGroupLogRecord @@ -968,45 +942,38 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - author=user, - root_alert_group=root_alert_group, - reason="Unattach button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + author=user, + root_alert_group=root_alert_group, + reason="Unattach button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - log_record_for_root_incident = root_alert_group.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - author=user, - dependent_alert_group=self, - reason="Unattach dropdown", - action_source=action_source, - ) + log_record_for_root_incident = root_alert_group.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + author=user, + dependent_alert_group=self, + reason="Unattach dropdown", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, " - f"log record {log_record_for_root_incident.pk} " - f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, " + f"log record {log_record_for_root_incident.pk} " + f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_user, - log_record=log_record_for_root_incident.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk)) def un_attach_by_delete(self): from apps.alerts.models import AlertGroupLogRecord @@ -1016,22 +983,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - reason="Unattach by deleting root incident", - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + reason="Unattach by deleting root incident", + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: delete" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: delete" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_delete, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) def silence_by_user( self, user: User, silence_delay: typing.Optional[int], action_source: typing.Optional[ActionSource] = None @@ -1086,25 +1050,23 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_SILENCE, - author=user, - silence_delay=silence_delay_timedelta, - reason="Silence button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_SILENCE, + author=user, + silence_delay=silence_delay_timedelta, + reason="Silence button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - alert_group_action_triggered_signal.send( - sender=self.silence_by_user, - log_record=log_record.pk, - action_source=action_source, - ) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.silence_by_user(user, silence_delay, action_source) @@ -1120,26 +1082,24 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. if self.is_root_alert_group: self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_SILENCE, - author=user, - silence_delay=None, - # 2.Look like some time ago there was no TYPE_UN_SILENCE - reason="Unsilence button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_SILENCE, + author=user, + silence_delay=None, + # 2.Look like some time ago there was no TYPE_UN_SILENCE + reason="Unsilence button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - alert_group_action_triggered_signal.send( - sender=self.un_silence_by_user, - log_record=log_record.pk, - action_source=action_source, - ) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_silence_by_user(user, action_source=action_source) @@ -1169,22 +1129,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. # Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_WIPED, - author=user, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_WIPED, + author=user, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: wipe" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: wipe" + ) - alert_group_action_triggered_signal.send( - sender=self.wipe_by_user, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.wipe_by_user(user) @@ -1193,31 +1150,27 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. from apps.alerts.models import AlertGroupLogRecord self.stop_escalation() - # prevent creating multiple logs - # filter instead of get_or_create cause it can be multiple logs of this type due deleting error - log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last() - if not log_record: - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_DELETED, - author=user, + with transaction.atomic(): + # prevent creating multiple logs + # filter instead of get_or_create cause it can be multiple logs of this type due deleting error + log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last() + + if not log_record: + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_DELETED, + author=user, + ) + + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: delete" ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: delete" - ) - - alert_group_action_triggered_signal.send( - sender=self.delete_by_user, - log_record=log_record.pk, - action_source=None, # TODO: Action source is none - it is suspicious - # this flag forces synchrony call for action handler in representatives - # (for now it is actual only for Slack representative) - force_sync=True, - ) + transaction.on_commit(partial(send_alert_group_signal_for_delete.delay, self.pk, log_record.pk)) + def finish_delete_by_user(self): dependent_alerts = list(self.dependent_alert_groups.all()) self.hard_delete() diff --git a/engine/apps/alerts/models/alert_group_log_record.py b/engine/apps/alerts/models/alert_group_log_record.py index dfcabfa7..3187ebb7 100644 --- a/engine/apps/alerts/models/alert_group_log_record.py +++ b/engine/apps/alerts/models/alert_group_log_record.py @@ -594,6 +594,12 @@ class AlertGroupLogRecord(models.Model): step_specific_info = json.loads(self.step_specific_info) return step_specific_info + def delete(self): + logger.debug( + f"alert_group_log_record for alert_group deleted" f"alert_group={self.alert_group.pk} log_id={self.pk}" + ) + super().delete() + @receiver(post_save, sender=AlertGroupLogRecord) def listen_for_alertgrouplogrecord(sender, instance, created, *args, **kwargs): diff --git a/engine/apps/alerts/tasks/__init__.py b/engine/apps/alerts/tasks/__init__.py index 0dce9b3a..a736c921 100644 --- a/engine/apps/alerts/tasks/__init__.py +++ b/engine/apps/alerts/tasks/__init__.py @@ -7,6 +7,8 @@ from .check_escalation_finished import check_escalation_finished_task # noqa: F from .custom_button_result import custom_button_result # noqa: F401 from .custom_webhook_result import custom_webhook_result # noqa: F401 from .delete_alert_group import delete_alert_group # noqa: F401 +from .delete_alert_group import finish_delete_alert_group # noqa: F401 +from .delete_alert_group import send_alert_group_signal_for_delete # noqa: F401 from .distribute_alert import distribute_alert # noqa: F401 from .escalate_alert_group import escalate_alert_group # noqa: F401 from .invite_user_to_join_incident import invite_user_to_join_incident # noqa: F401 diff --git a/engine/apps/alerts/tasks/acknowledge_reminder.py b/engine/apps/alerts/tasks/acknowledge_reminder.py index e0e41500..73d86491 100644 --- a/engine/apps/alerts/tasks/acknowledge_reminder.py +++ b/engine/apps/alerts/tasks/acknowledge_reminder.py @@ -76,6 +76,7 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str log_record = alert_group.log_records.create( type=AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED, author=alert_group.acknowledged_by_user ) + task_logger.info(f"created log record {log_record.pk}, sending signal...") transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) diff --git a/engine/apps/alerts/tasks/delete_alert_group.py b/engine/apps/alerts/tasks/delete_alert_group.py index e97ce515..cd4d5807 100644 --- a/engine/apps/alerts/tasks/delete_alert_group.py +++ b/engine/apps/alerts/tasks/delete_alert_group.py @@ -1,6 +1,7 @@ from celery.utils.log import get_task_logger from django.conf import settings +from apps.alerts.signals import alert_group_action_triggered_signal from apps.slack.errors import SlackAPIRatelimitError from common.custom_celery_tasks import shared_dedicated_queue_retry_task @@ -10,7 +11,7 @@ logger = get_task_logger(__name__) @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None ) -def delete_alert_group(alert_group_pk, user_pk): +def delete_alert_group(alert_group_pk: int, user_pk: int) -> None: from apps.alerts.models import AlertGroup from apps.user_management.models import User @@ -25,9 +26,35 @@ def delete_alert_group(alert_group_pk, user_pk): return logger.debug(f"User {user} is deleting alert group {alert_group} (channel: {alert_group.channel})") + alert_group.delete_by_user(user) + +@shared_dedicated_queue_retry_task( + autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None +) +def send_alert_group_signal_for_delete(alert_group_pk: int, log_record_pk: int) -> None: try: - alert_group.delete_by_user(user) + alert_group_action_triggered_signal.send( + sender=send_alert_group_signal_for_delete, + log_record=log_record_pk, + force_sync=True, + ) except SlackAPIRatelimitError as e: # Handle Slack API ratelimit raised in apps.slack.scenarios.distribute_alerts.DeleteGroupStep.process_signal - delete_alert_group.apply_async((alert_group_pk, user_pk), countdown=e.retry_after) + send_alert_group_signal_for_delete.apply_async((alert_group_pk, log_record_pk), countdown=e.retry_after) + return + + finish_delete_alert_group.apply_async((alert_group_pk,)) + + +@shared_dedicated_queue_retry_task( + autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None +) +def finish_delete_alert_group(alert_group_pk: int) -> None: + from apps.alerts.models import AlertGroup + + alert_group = AlertGroup.objects.filter(pk=alert_group_pk).first() + if not alert_group: + logger.debug(f"Alert group id={alert_group_pk} not found, already deleted") + return + alert_group.finish_delete_by_user() diff --git a/engine/apps/alerts/tasks/send_alert_group_signal.py b/engine/apps/alerts/tasks/send_alert_group_signal.py index be632a2b..65f0b8f2 100644 --- a/engine/apps/alerts/tasks/send_alert_group_signal.py +++ b/engine/apps/alerts/tasks/send_alert_group_signal.py @@ -5,13 +5,15 @@ from django.conf import settings from apps.alerts.signals import alert_group_action_triggered_signal from common.custom_celery_tasks import shared_dedicated_queue_retry_task +from .task_logger import task_logger + @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=0 if settings.DEBUG else None ) def send_alert_group_signal(log_record_id): start_time = time.time() - + task_logger.info(f"sending signal for log record {log_record_id}") alert_group_action_triggered_signal.send(sender=send_alert_group_signal, log_record=log_record_id) print("--- %s seconds ---" % (time.time() - start_time)) diff --git a/engine/apps/alerts/tests/test_alert_group.py b/engine/apps/alerts/tests/test_alert_group.py index d2ba04d2..0d7166c6 100644 --- a/engine/apps/alerts/tests/test_alert_group.py +++ b/engine/apps/alerts/tests/test_alert_group.py @@ -6,7 +6,11 @@ from apps.alerts.constants import ActionSource from apps.alerts.incident_appearance.renderers.phone_call_renderer import AlertGroupPhoneCallRenderer from apps.alerts.models import AlertGroup, AlertGroupLogRecord from apps.alerts.tasks import wipe -from apps.alerts.tasks.delete_alert_group import delete_alert_group +from apps.alerts.tasks.delete_alert_group import ( + delete_alert_group, + finish_delete_alert_group, + send_alert_group_signal_for_delete, +) from apps.slack.client import SlackClient from apps.slack.errors import SlackAPIMessageNotFoundError, SlackAPIRatelimitError from apps.slack.models import SlackMessage @@ -85,9 +89,9 @@ def test_delete( make_alert, make_slack_message, make_resolution_note_slack_message, + django_capture_on_commit_callbacks, ): """test alert group deleting""" - organization, slack_team_identity = make_organization_with_slack_team_identity() user = make_user(organization=organization) @@ -119,7 +123,20 @@ def test_delete( assert alert_group.slack_messages.count() == 1 assert alert_group.resolution_note_slack_messages.count() == 2 - delete_alert_group(alert_group.pk, user.pk) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 + + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + assert mock_finish_delete_alert_group.call_count == 1 + + finish_delete_alert_group(alert_group.pk) assert not alert_group.alerts.exists() assert not alert_group.slack_messages.exists() @@ -140,10 +157,10 @@ def test_delete( @pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"]) -@patch.object(delete_alert_group, "apply_async") +@patch.object(send_alert_group_signal_for_delete, "apply_async") @pytest.mark.django_db def test_delete_slack_ratelimit( - mock_delete_alert_group, + mock_send_alert_group_signal_for_delete, api_method, make_organization_with_slack_team_identity, make_user, @@ -152,6 +169,7 @@ def test_delete_slack_ratelimit( make_alert, make_slack_message, make_resolution_note_slack_message, + django_capture_on_commit_callbacks, ): organization, slack_team_identity = make_organization_with_slack_team_identity() user = make_user(organization=organization) @@ -180,17 +198,31 @@ def test_delete_slack_ratelimit( ts="test2_ts", ) - with patch.object( - SlackClient, - api_method, - side_effect=SlackAPIRatelimitError( - response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42}) - ), - ): - delete_alert_group(alert_group.pk, user.pk) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 + + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + with patch.object( + SlackClient, + api_method, + side_effect=SlackAPIRatelimitError( + response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42}) + ), + ): + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + + assert mock_finish_delete_alert_group.call_count == 0 # Check task is retried gracefully - mock_delete_alert_group.assert_called_once_with((alert_group.pk, user.pk), countdown=42) + mock_send_alert_group_signal_for_delete.assert_called_once_with( + mock_send_alert_group_signal.call_args.args, countdown=42 + ) @pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"]) @@ -582,7 +614,7 @@ def test_filter_active_alert_groups( @patch("apps.alerts.models.AlertGroup.hard_delete") @patch("apps.alerts.models.AlertGroup.un_attach_by_delete") @patch("apps.alerts.models.AlertGroup.stop_escalation") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal") +@patch("apps.alerts.tasks.delete_alert_group.alert_group_action_triggered_signal") @pytest.mark.django_db def test_delete_by_user( mock_alert_group_action_triggered_signal, @@ -592,6 +624,7 @@ def test_delete_by_user( make_organization_and_user, make_alert_receive_channel, make_alert_group, + django_capture_on_commit_callbacks, ): organization, user = make_organization_and_user() alert_receive_channel = make_alert_receive_channel(organization) @@ -603,20 +636,31 @@ def test_delete_by_user( assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 0 - alert_group.delete_by_user(user) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 1 deleted_log_record = alert_group.log_records.get(type=AlertGroupLogRecord.TYPE_DELETED) - alert_group.stop_escalation.assert_called_once_with() + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + assert mock_finish_delete_alert_group.call_count == 1 + mock_alert_group_action_triggered_signal.send.assert_called_once_with( - sender=alert_group.delete_by_user, + sender=send_alert_group_signal_for_delete, log_record=deleted_log_record.pk, - action_source=None, force_sync=True, ) + finish_delete_alert_group(alert_group.pk) + alert_group.hard_delete.assert_called_once_with() for dependent_alert_group in dependent_alert_groups: diff --git a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py index 29fb52f7..fa05949b 100644 --- a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py +++ b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py @@ -31,7 +31,7 @@ def mock_apply_async(monkeypatch): @patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send") +@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") @pytest.mark.django_db @override_settings(CELERY_TASK_ALWAYS_EAGER=True) def test_update_metric_alert_groups_total_cache_on_action( @@ -142,7 +142,7 @@ def test_update_metric_alert_groups_total_cache_on_action( @patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send") +@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") @pytest.mark.django_db @override_settings(CELERY_TASK_ALWAYS_EAGER=True) def test_update_metric_alert_groups_response_time_cache_on_action( diff --git a/engine/apps/mobile_app/tests/tasks/test_going_oncall_notification.py b/engine/apps/mobile_app/tests/tasks/test_going_oncall_notification.py index 67b51628..c01ec261 100644 --- a/engine/apps/mobile_app/tests/tasks/test_going_oncall_notification.py +++ b/engine/apps/mobile_app/tests/tasks/test_going_oncall_notification.py @@ -56,86 +56,60 @@ def test_shift_starts_within_range(timing_window_lower, timing_window_upper, sec assert _shift_starts_within_range(timing_window_lower, timing_window_upper, seconds_until_shift_starts) == expected +@pytest.mark.parametrize( + "seconds_until_going_oncall,humanized_time_until_going_oncall", + [ + (600, "10 minutes"), + # TODO: right now this returns 11 hours but it should probably round up to 12 hours instead.. + # 11 hours and 53 minutes + ((60 * 60 * 11) + (53 * 60), "11 hours"), + # 12 hours and 10 minutes + ((60 * 60 * 12) + (10 * 60), "12 hours"), + ], +) @pytest.mark.django_db -def test_get_notification_title(make_organization_and_user, make_user, make_schedule): - schedule_name = "asdfasdfasdfasdf" - - organization, user = make_organization_and_user() - user2 = make_user(organization=organization) - schedule = make_schedule(organization, name=schedule_name, schedule_class=OnCallScheduleWeb) - shift_pk = "mncvmnvc" - user_pk = user.public_primary_key - user_locale = "fr_CA" - seconds_until_going_oncall = 600 - humanized_time_until_going_oncall = "10 minutes" - - same_day_shift_start = timezone.datetime(2023, 7, 8, 9, 0, 0) - same_day_shift_end = timezone.datetime(2023, 7, 8, 17, 0, 0) - - multiple_day_shift_start = timezone.datetime(2023, 7, 8, 9, 0, 0) - multiple_day_shift_end = timezone.datetime(2023, 7, 12, 17, 0, 0) - - same_day_shift = _create_schedule_event( - same_day_shift_start, - same_day_shift_end, - shift_pk, - [ - { - "pk": user_pk, - }, - ], - ) - - multiple_day_shift = _create_schedule_event( - multiple_day_shift_start, - multiple_day_shift_end, - shift_pk, - [ - { - "pk": user_pk, - }, - ], - ) - - maus = MobileAppUserSettings.objects.create(user=user, locale=user_locale) - maus_no_locale = MobileAppUserSettings.objects.create(user=user2) - - ################## - # same day shift - ################## - same_day_shift_title = _get_notification_title(seconds_until_going_oncall) - same_day_shift_subtitle = _get_notification_subtitle(schedule, same_day_shift, maus) - same_day_shift_no_locale_subtitle = _get_notification_subtitle(schedule, same_day_shift, maus_no_locale) - - assert same_day_shift_title == f"Your on-call shift starts in {humanized_time_until_going_oncall}" - assert same_day_shift_subtitle == f"09 h 00 - 17 h 00\nSchedule {schedule_name}" - assert same_day_shift_no_locale_subtitle == f"9:00\u202fAM - 5:00\u202fPM\nSchedule {schedule_name}" - - ################## - # multiple day shift - ################## - multiple_day_shift_title = _get_notification_title(seconds_until_going_oncall) - multiple_day_shift_subtitle = _get_notification_subtitle(schedule, multiple_day_shift, maus) - multiple_day_shift_no_locale_subtitle = _get_notification_subtitle(schedule, multiple_day_shift, maus_no_locale) - - assert multiple_day_shift_title == f"Your on-call shift starts in {humanized_time_until_going_oncall}" - assert multiple_day_shift_subtitle == f"2023-07-08 09 h 00 - 2023-07-12 17 h 00\nSchedule {schedule_name}" +def test_get_notification_title(seconds_until_going_oncall, humanized_time_until_going_oncall): assert ( - multiple_day_shift_no_locale_subtitle - == f"7/8/23, 9:00\u202fAM - 7/12/23, 5:00\u202fPM\nSchedule {schedule_name}" + _get_notification_title(seconds_until_going_oncall) + == f"Your on-call shift starts in {humanized_time_until_going_oncall}" ) @pytest.mark.parametrize( - "user_timezone,expected_shift_times", + "user_timezone,shift_start_time,shift_end_time,expected", [ - ("UTC", "9:00 AM - 5:00 PM"), - ("Europe/Amsterdam", "11:00 AM - 7:00 PM"), + # same day shift + ("UTC", timezone.datetime(2023, 7, 8, 9, 0, 0), timezone.datetime(2023, 7, 8, 17, 0, 0), "9:00 AM - 5:00 PM"), + ( + "Europe/Amsterdam", + timezone.datetime(2023, 7, 8, 9, 0, 0), + timezone.datetime(2023, 7, 8, 17, 0, 0), + "11:00 AM - 7:00 PM", + ), + # multi-day shift + ( + "UTC", + timezone.datetime(2023, 7, 8, 9, 0, 0), + timezone.datetime(2023, 7, 12, 17, 0, 0), + "7/8/23, 9:00 AM - 7/12/23, 5:00 PM", + ), + ( + "Europe/Amsterdam", + timezone.datetime(2023, 7, 8, 9, 0, 0), + timezone.datetime(2023, 7, 12, 17, 0, 0), + "7/8/23, 11:00 AM - 7/12/23, 7:00 PM", + ), ], ) @pytest.mark.django_db def test_get_notification_subtitle( - make_organization, make_user_for_organization, make_schedule, user_timezone, expected_shift_times + make_organization, + make_user_for_organization, + make_schedule, + user_timezone, + shift_start_time, + shift_end_time, + expected, ): schedule_name = "asdfasdfasdfasdf" @@ -143,15 +117,11 @@ def test_get_notification_subtitle( user = make_user_for_organization(organization) user_pk = user.public_primary_key maus = MobileAppUserSettings.objects.create(user=user, time_zone=user_timezone) - schedule = make_schedule(organization, name=schedule_name, schedule_class=OnCallScheduleWeb) - shift_start = timezone.datetime(2023, 7, 8, 9, 0, 0) - shift_end = timezone.datetime(2023, 7, 8, 17, 0, 0) - shift = _create_schedule_event( - shift_start, - shift_end, + shift_start_time, + shift_end_time, "asdfasdfasdf", [ { @@ -160,7 +130,51 @@ def test_get_notification_subtitle( ], ) - assert _get_notification_subtitle(schedule, shift, maus) == f"{expected_shift_times}\nSchedule {schedule_name}" + assert _get_notification_subtitle(schedule, shift, maus) == f"{expected}\nSchedule {schedule_name}" + + +@pytest.mark.parametrize( + "shift_start_time,shift_end_time,expected", + [ + # same day shift + (timezone.datetime(2023, 7, 8, 9, 0, 0), timezone.datetime(2023, 7, 8, 17, 0, 0), "9:00 AM - 5:00 PM"), + # multi-day shift + ( + timezone.datetime(2023, 7, 8, 9, 0, 0), + timezone.datetime(2023, 7, 12, 17, 0, 0), + "7/8/23, 9:00 AM - 7/12/23, 5:00 PM", + ), + ], +) +@pytest.mark.django_db +def test_get_notification_subtitle_no_locale( + make_organization, + make_user_for_organization, + make_schedule, + shift_start_time, + shift_end_time, + expected, +): + schedule_name = "asdfasdfasdfasdf" + + organization = make_organization() + user = make_user_for_organization(organization) + user_pk = user.public_primary_key + maus = MobileAppUserSettings.objects.create(user=user) + schedule = make_schedule(organization, name=schedule_name, schedule_class=OnCallScheduleWeb) + + shift = _create_schedule_event( + shift_start_time, + shift_end_time, + "asdfasdfasdf", + [ + { + "pk": user_pk, + }, + ], + ) + + assert _get_notification_subtitle(schedule, shift, maus) == f"{expected}\nSchedule {schedule_name}" @mock.patch("apps.mobile_app.tasks.going_oncall_notification._get_notification_subtitle") diff --git a/engine/apps/slack/representatives/alert_group_representative.py b/engine/apps/slack/representatives/alert_group_representative.py index 1ef8b5c5..67fd4ce6 100644 --- a/engine/apps/slack/representatives/alert_group_representative.py +++ b/engine/apps/slack/representatives/alert_group_representative.py @@ -2,6 +2,7 @@ import logging from celery.utils.log import get_task_logger from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from apps.alerts.constants import ActionSource from apps.alerts.representative import AlertGroupAbstractRepresentative @@ -49,14 +50,20 @@ def on_create_alert_slack_representative_async(alert_pk): @shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None + autoretry_for=(Exception,), + retry_backoff=True, + dont_autoretry_for=(ObjectDoesNotExist,), + max_retries=1 if settings.DEBUG else None, ) def on_alert_group_action_triggered_async(log_record_id): from apps.alerts.models import AlertGroupLogRecord - logger.debug(f"SLACK representative: get log record {log_record_id}") + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) + except AlertGroupLogRecord.DoesNotExist as e: + logger.warning(f"SLACK representative: log record {log_record_id} never created or has been deleted") + raise e - log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) alert_group_id = log_record.alert_group_id logger.debug(f"Start on_alert_group_action_triggered for alert_group {alert_group_id}, log record {log_record_id}") instance = AlertGroupSlackRepresentative(log_record) @@ -145,16 +152,25 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative): from apps.alerts.models import AlertGroupLogRecord log_record = kwargs["log_record"] - action_source = kwargs.get("action_source") force_sync = kwargs.get("force_sync", False) if isinstance(log_record, AlertGroupLogRecord): log_record_id = log_record.pk else: log_record_id = log_record - if action_source == ActionSource.SLACK or force_sync: + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) + except AlertGroupLogRecord.DoesNotExist: + logger.warning( + f"on_alert_group_action_triggered: log record {log_record_id} never created or has been deleted" + ) + return + + if log_record.action_source == ActionSource.SLACK or force_sync: + logger.debug(f"SLACK on_alert_group_action_triggered: sync {log_record_id} {force_sync}") on_alert_group_action_triggered_async(log_record_id) else: + logger.debug(f"SLACK on_alert_group_action_triggered: async {log_record_id} {force_sync}") on_alert_group_action_triggered_async.apply_async((log_record_id,)) @classmethod @@ -167,7 +183,11 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative): alert_group_id = alert_group.pk else: alert_group_id = alert_group - alert_group = AlertGroup.objects.get(pk=alert_group_id) + try: + alert_group = AlertGroup.objects.get(pk=alert_group_id) + except AlertGroup.DoesNotExist as e: + logger.warning(f"SLACK update log report: alert group {alert_group_id} has been deleted") + raise e logger.debug( f"Received alert_group_update_log_report signal in SLACK representative for alert_group {alert_group_id}" diff --git a/engine/apps/slack/tasks.py b/engine/apps/slack/tasks.py index 0a81c889..fe87f424 100644 --- a/engine/apps/slack/tasks.py +++ b/engine/apps/slack/tasks.py @@ -3,6 +3,7 @@ import random from typing import Optional from celery import uuid as celery_uuid +from celery.exceptions import Retry from celery.utils.log import get_task_logger from django.conf import settings from django.core.cache import cache @@ -142,8 +143,15 @@ def check_slack_message_exists_before_post_message_to_thread( ).save() -@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=1) -def send_message_to_thread_if_bot_not_in_channel(alert_group_pk, slack_team_identity_pk, channel_id): +@shared_dedicated_queue_retry_task( + autoretry_for=(Exception,), + dont_autoretry_for=(Retry,), + retry_backoff=True, + max_retries=1, +) +def send_message_to_thread_if_bot_not_in_channel( + alert_group_pk: int, slack_team_identity_pk: int, channel_id: int +) -> None: """ Send message to alert group's thread if bot is not in current channel """ @@ -151,6 +159,11 @@ def send_message_to_thread_if_bot_not_in_channel(alert_group_pk, slack_team_iden from apps.alerts.models import AlertGroup from apps.slack.models import SlackTeamIdentity + logger.info( + f"Starting send_message_to_thread_if_bot_not_in_channel alert_group={alert_group_pk} " + f"slack_team_identity={slack_team_identity_pk} channel_id={channel_id}" + ) + slack_team_identity = SlackTeamIdentity.objects.get(pk=slack_team_identity_pk) alert_group = AlertGroup.objects.get(pk=alert_group_pk) @@ -159,8 +172,20 @@ def send_message_to_thread_if_bot_not_in_channel(alert_group_pk, slack_team_iden bot_user_id = slack_team_identity.bot_user_id members = slack_team_identity.get_conversation_members(sc, channel_id) if bot_user_id not in members: - text = f"Please invite <@{bot_user_id}> to this channel to make all features " f"available :wink:" - AlertGroupSlackService(slack_team_identity, sc).publish_message_to_alert_group_thread(alert_group, text=text) + text = f"Please invite <@{bot_user_id}> to this channel to make all features available :wink:" + + try: + logger.info("Attempting to send message to thread in Slack") + + AlertGroupSlackService(slack_team_identity, sc).publish_message_to_alert_group_thread( + alert_group, text=text + ) + except SlackAPIRatelimitError as e: + logger.warning(f"Slack API rate limit error: {e}, retrying task") + + raise send_message_to_thread_if_bot_not_in_channel.retry( + (alert_group_pk, slack_team_identity_pk, channel_id), countdown=e.retry_after, exc=e + ) @shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=0) diff --git a/engine/apps/slack/tests/tasks/__init__.py b/engine/apps/slack/tests/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/engine/apps/slack/tests/tasks/test_send_message_to_thread_if_bot_not_in_channel.py b/engine/apps/slack/tests/tasks/test_send_message_to_thread_if_bot_not_in_channel.py new file mode 100644 index 00000000..ea7b8186 --- /dev/null +++ b/engine/apps/slack/tests/tasks/test_send_message_to_thread_if_bot_not_in_channel.py @@ -0,0 +1,99 @@ +from unittest.mock import ANY, patch + +import pytest +from celery import exceptions + +from apps.slack.errors import SlackAPIRatelimitError +from apps.slack.tasks import send_message_to_thread_if_bot_not_in_channel +from apps.slack.tests.conftest import build_slack_response + +BOT_USER_ID = "U12345678" +TEXT = f"Please invite <@{BOT_USER_ID}> to this channel to make all features available :wink:" + + +@pytest.mark.parametrize("channel_members", [["U0909090"], [BOT_USER_ID]]) +@patch("apps.slack.models.SlackTeamIdentity.get_conversation_members") +@patch("apps.slack.tasks.SlackClient") +@patch("apps.slack.tasks.AlertGroupSlackService") +@pytest.mark.django_db +def test_send_message_to_thread_if_bot_not_in_channel( + MockAlertGroupSlackService, + MockSlackClient, + mock_get_conversation_members, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, + channel_members, +): + mock_get_conversation_members.return_value = channel_members + + slack_team_identity = make_slack_team_identity(bot_user_id=BOT_USER_ID) + slack_channel = make_slack_channel(slack_team_identity) + + organization = make_organization(slack_team_identity=slack_team_identity) + + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + send_message_to_thread_if_bot_not_in_channel(alert_group.pk, slack_team_identity.pk, slack_channel.pk) + + MockSlackClient.assert_called_once_with(slack_team_identity, enable_ratelimit_retry=True) + mock_get_conversation_members.assert_called_once_with(MockSlackClient.return_value, slack_channel.pk) + + if BOT_USER_ID not in channel_members: + MockAlertGroupSlackService.assert_called_once_with(slack_team_identity, MockSlackClient.return_value) + + MockAlertGroupSlackService.return_value.publish_message_to_alert_group_thread.assert_called_once_with( + alert_group, text=TEXT + ) + else: + MockAlertGroupSlackService.return_value.publish_message_to_alert_group_thread.assert_not_called() + + +@patch("apps.slack.models.SlackTeamIdentity.get_conversation_members") +@patch("apps.slack.tasks.SlackClient") +@patch("apps.slack.tasks.AlertGroupSlackService") +@patch("apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel.retry") +@pytest.mark.django_db +def test_send_message_to_thread_if_bot_not_in_channel_slack_api_rate_limit_error( + mock_send_message_to_thread_if_bot_not_in_channel_retry, + MockAlertGroupSlackService, + MockSlackClient, + mock_get_conversation_members, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, +): + mock_send_message_to_thread_if_bot_not_in_channel_retry.side_effect = exceptions.Retry() + mock_get_conversation_members.return_value = ["U0909090"] + + RETRY_AFTER = 42 + + MockAlertGroupSlackService.return_value.publish_message_to_alert_group_thread.side_effect = SlackAPIRatelimitError( + response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": RETRY_AFTER}) + ) + + slack_team_identity = make_slack_team_identity(bot_user_id=BOT_USER_ID) + slack_channel = make_slack_channel(slack_team_identity) + + organization = make_organization(slack_team_identity=slack_team_identity) + + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + with pytest.raises(exceptions.Retry): + send_message_to_thread_if_bot_not_in_channel(alert_group.pk, slack_team_identity.pk, slack_channel.pk) + + MockAlertGroupSlackService.assert_called_once_with(slack_team_identity, MockSlackClient.return_value) + + MockAlertGroupSlackService.return_value.publish_message_to_alert_group_thread.assert_called_once_with( + alert_group, text=TEXT + ) + + mock_send_message_to_thread_if_bot_not_in_channel_retry.assert_called_once_with( + (alert_group.pk, slack_team_identity.pk, slack_channel.pk), countdown=RETRY_AFTER, exc=ANY + ) diff --git a/engine/apps/telegram/alert_group_representative.py b/engine/apps/telegram/alert_group_representative.py index 635420c6..ef3ae992 100644 --- a/engine/apps/telegram/alert_group_representative.py +++ b/engine/apps/telegram/alert_group_representative.py @@ -64,8 +64,13 @@ class AlertGroupTelegramRepresentative(AlertGroupAbstractRepresentative): def on_alert_group_update_log_report(cls, **kwargs): logger.info("AlertGroupTelegramRepresentative UPDATE LOG REPORT SIGNAL") alert_group = kwargs["alert_group"] + if not isinstance(alert_group, AlertGroup): - alert_group = AlertGroup.objects.get(pk=alert_group) + try: + alert_group = AlertGroup.objects.get(pk=alert_group) + except AlertGroup.DoesNotExist as e: + logger.warning(f"Telegram update log report: alert group {alert_group} has been deleted") + raise e messages_to_edit = alert_group.telegram_messages.filter( message_type__in=( diff --git a/engine/apps/telegram/tasks.py b/engine/apps/telegram/tasks.py index 41073848..5aeb9948 100644 --- a/engine/apps/telegram/tasks.py +++ b/engine/apps/telegram/tasks.py @@ -3,6 +3,7 @@ import logging from celery import uuid as celery_uuid from celery.utils.log import get_task_logger from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from telegram import error from apps.alerts.models import Alert, AlertGroup @@ -230,7 +231,10 @@ def on_create_alert_telegram_representative_async(self, alert_pk): @shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None + autoretry_for=(Exception,), + retry_backoff=True, + dont_autoretry_for=(ObjectDoesNotExist,), + max_retries=1 if settings.DEBUG else None, ) def on_alert_group_action_triggered_async(log_record_id): from apps.alerts.models import AlertGroupLogRecord @@ -239,18 +243,14 @@ def on_alert_group_action_triggered_async(log_record_id): logger.info(f"AlertGroupTelegramRepresentative ACTION SIGNAL, log record {log_record_id}") # temporary solution to handle cases when alert group and related log records were deleted + try: log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) except AlertGroupLogRecord.DoesNotExist as e: - retries_count = on_alert_group_action_triggered_async.request.retries - if retries_count >= 10: - logger.error( - f"AlertGroupTelegramRepresentative: was not able to get AlertGroupLogRecord, probably alert group " - f"was deleted. log record {log_record_id}, retries: {retries_count}" - ) - return - else: - raise e + logger.warning( + f"AlertGroupTelegramRepresentative: log record {log_record_id} never created or has been deleted" + ) + raise e instance = AlertGroupTelegramRepresentative(log_record) if instance.is_applicable(): diff --git a/engine/apps/user_management/sync.py b/engine/apps/user_management/sync.py index fd8256e1..df0ce19b 100644 --- a/engine/apps/user_management/sync.py +++ b/engine/apps/user_management/sync.py @@ -111,9 +111,12 @@ def _sync_grafana_incident_plugin(organization: Organization, grafana_api_client It intended to use only inside _sync_organization. It mutates, but not saves org, it's saved in _sync_organization. """ grafana_incident_settings, _ = grafana_api_client.get_grafana_incident_plugin_settings() + organization.is_grafana_incident_enabled = False + organization.grafana_incident_backend_url = None + if grafana_incident_settings is not None: organization.is_grafana_incident_enabled = grafana_incident_settings["enabled"] - organization.grafana_incident_backend_url = grafana_incident_settings.get("jsonData", {}).get( + organization.grafana_incident_backend_url = (grafana_incident_settings.get("jsonData") or {}).get( GrafanaAPIClient.GRAFANA_INCIDENT_PLUGIN_BACKEND_URL_KEY ) diff --git a/engine/apps/user_management/tests/test_sync.py b/engine/apps/user_management/tests/test_sync.py index 158f1378..96a86bae 100644 --- a/engine/apps/user_management/tests/test_sync.py +++ b/engine/apps/user_management/tests/test_sync.py @@ -541,6 +541,7 @@ class TestSyncGrafanaIncidentParams: MOCK_GRAFANA_INCIDENT_BACKEND_URL, ), TestSyncGrafanaIncidentParams(({"enabled": True}, None), True, None), + TestSyncGrafanaIncidentParams(({"enabled": True, "jsonData": None}, None), True, None), # missing jsonData (sometimes this is what we get back from the Grafana API) TestSyncGrafanaIncidentParams(({"enabled": False}, None), False, None), # plugin is disabled for some reason ], diff --git a/engine/apps/webhooks/listeners.py b/engine/apps/webhooks/listeners.py index 5ff9b192..ebbed55b 100644 --- a/engine/apps/webhooks/listeners.py +++ b/engine/apps/webhooks/listeners.py @@ -15,5 +15,9 @@ def on_action_triggered(**kwargs): log_record = kwargs["log_record"] if not isinstance(log_record, AlertGroupLogRecord): - log_record = AlertGroupLogRecord.objects.get(pk=log_record) + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record) + except AlertGroupLogRecord.DoesNotExist as e: + logger.warning(f"Webhook action triggered: log record {log_record} never created or has been deleted") + raise e alert_group_status_change.apply_async((log_record.type, log_record.alert_group_id, log_record.author_id)) diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index 0a6f319c..73dea53d 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -4,6 +4,8 @@ CELERY_TASK_ROUTES = { "queue": "default" }, "apps.alerts.tasks.delete_alert_group.delete_alert_group": {"queue": "default"}, + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete": {"queue": "default"}, + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group": {"queue": "default"}, "apps.alerts.tasks.invalidate_web_cache_for_alert_group.invalidate_web_cache_for_alert_group": {"queue": "default"}, "apps.alerts.tasks.send_alert_group_signal.send_alert_group_signal": {"queue": "default"}, "apps.alerts.tasks.wipe.wipe": {"queue": "default"},