Add transaction on_commit before signals for alert group actions (#3731)

# What this PR does
Add transactions around log record creation and check transaction
on_commit before sending signals passing DB id of alert group log
records. In cases for delete we can then assume any missing IDs on tasks
are from intentionally deleted alert groups and we can stop tasks from
retrying endlessly.

## Which issue(s) this PR fixes

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not
required)
This commit is contained in:
Michael Derynck 2024-01-31 15:54:50 -07:00 committed by GitHub
parent 14feaba3d1
commit 2a466a0c4f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 352 additions and 282 deletions

View file

@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased
### Changed
- Ensure alert group log records are committed to DB before signalling about them @mderynck([#3731](https://github.com/grafana/oncall/pull/3731))
### Fixed
- Address `SlackAPIRatelimitError` exceptions in `apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel` task

View file

@ -20,8 +20,13 @@ from apps.alerts.escalation_snapshot.escalation_snapshot_mixin import START_ESCA
from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
from apps.alerts.incident_appearance.renderers.slack_renderer import AlertGroupSlackRenderer
from apps.alerts.incident_log_builder import IncidentLogBuilder
from apps.alerts.signals import alert_group_action_triggered_signal, alert_group_created_signal
from apps.alerts.tasks import acknowledge_reminder_task, send_alert_group_signal, unsilence_task
from apps.alerts.signals import alert_group_created_signal
from apps.alerts.tasks import (
acknowledge_reminder_task,
send_alert_group_signal,
send_alert_group_signal_for_delete,
unsilence_task,
)
from apps.metrics_exporter.tasks import update_metrics_for_alert_group
from apps.slack.slack_formatter import SlackFormatter
from apps.user_management.models import User
@ -639,20 +644,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
self.stop_escalation()
self.start_ack_reminder_if_needed()
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
)
alert_group_action_triggered_signal.send(
sender=self.acknowledge_by_user,
log_record=log_record.pk,
action_source=action_source,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.acknowledge_by_user(user, action_source=action_source)
@ -679,18 +681,15 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
)
self.stop_escalation()
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK)
with transaction.atomic():
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert"
)
alert_group_action_triggered_signal.send(
sender=self.acknowledge_by_source,
log_record=log_record.pk,
action_source=None,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.acknowledge_by_source()
@ -707,20 +706,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
if self.is_root_alert_group:
self.start_escalation_if_needed()
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
)
alert_group_action_triggered_signal.send(
sender=self.un_acknowledge_by_user,
log_record=log_record.pk,
action_source=action_source,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.un_acknowledge_by_user(user, action_source=action_source)
@ -745,20 +741,18 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
# Update alert group state and response time metrics cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
self.stop_escalation()
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source
)
alert_group_action_triggered_signal.send(
sender=self.resolve_by_user,
log_record=log_record.pk,
action_source=action_source,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}"
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.resolve_by_user(user, action_source=action_source)
@ -782,18 +776,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
)
self.stop_escalation()
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert"
)
with transaction.atomic():
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
alert_group_action_triggered_signal.send(
sender=self.resolve_by_source,
log_record=log_record.pk,
action_source=None,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert"
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.resolve_by_source()
@ -809,18 +801,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
)
self.stop_escalation()
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step"
)
with transaction.atomic():
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
alert_group_action_triggered_signal.send(
sender=self.resolve_by_last_step,
log_record=log_record.pk,
action_source=None,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step"
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.resolve_by_last_step()
@ -830,19 +820,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
self.resolve(resolved_by=AlertGroup.DISABLE_MAINTENANCE)
self.stop_escalation()
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: disable maintenance"
)
with transaction.atomic():
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
alert_group_action_triggered_signal.send(
sender=self.resolve_by_disable_maintenance,
log_record=log_record.pk,
action_source=None,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: disable maintenance"
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.resolve_by_disable_maintenance()
@ -856,24 +844,21 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
# Update alert group state metric cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source
)
if self.is_root_alert_group:
self.start_escalation_if_needed()
if self.is_root_alert_group:
self.start_escalation_if_needed()
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
alert_group_action_triggered_signal.send(
sender=self.un_resolve_by_user,
log_record=log_record.pk,
action_source=action_source,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.un_resolve_by_user(user, action_source=action_source)
@ -898,25 +883,22 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
if not root_alert_group.silenced and self.silenced:
self.un_silence_by_user(user, action_source=action_source)
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_ATTACHED,
author=user,
root_alert_group=root_alert_group,
reason="Attach dropdown",
action_source=action_source,
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_ATTACHED,
author=user,
root_alert_group=root_alert_group,
reason="Attach dropdown",
action_source=action_source,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
alert_group_action_triggered_signal.send(
sender=self.attach_by_user,
log_record=log_record.pk,
action_source=action_source,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
log_record_for_root_incident = root_alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_ATTACHED,
@ -932,11 +914,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
f"'{log_record_for_root_incident.get_type_display()}', action source: {action_source}"
)
alert_group_action_triggered_signal.send(
sender=self.attach_by_user,
log_record=log_record_for_root_incident.pk,
action_source=action_source,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk))
else:
log_record = self.log_records.create(
@ -953,11 +931,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
f"action source: {action_source}"
)
alert_group_action_triggered_signal.send(
sender=self.attach_by_user,
log_record=log_record.pk,
action_source=action_source,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
def un_attach_by_user(self, user: User, action_source: typing.Optional[ActionSource] = None) -> None:
from apps.alerts.models import AlertGroupLogRecord
@ -968,45 +942,38 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
self.start_escalation_if_needed()
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UNATTACHED,
author=user,
root_alert_group=root_alert_group,
reason="Unattach button",
action_source=action_source,
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UNATTACHED,
author=user,
root_alert_group=root_alert_group,
reason="Unattach button",
action_source=action_source,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
alert_group_action_triggered_signal.send(
sender=self.un_attach_by_user,
log_record=log_record.pk,
action_source=action_source,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
log_record_for_root_incident = root_alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_UNATTACHED,
author=user,
dependent_alert_group=self,
reason="Unattach dropdown",
action_source=action_source,
)
log_record_for_root_incident = root_alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_UNATTACHED,
author=user,
dependent_alert_group=self,
reason="Unattach dropdown",
action_source=action_source,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, "
f"log record {log_record_for_root_incident.pk} "
f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, "
f"log record {log_record_for_root_incident.pk} "
f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}"
)
alert_group_action_triggered_signal.send(
sender=self.un_attach_by_user,
log_record=log_record_for_root_incident.pk,
action_source=action_source,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk))
def un_attach_by_delete(self):
from apps.alerts.models import AlertGroupLogRecord
@ -1016,22 +983,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
self.start_escalation_if_needed()
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UNATTACHED,
reason="Unattach by deleting root incident",
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UNATTACHED,
reason="Unattach by deleting root incident",
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: delete"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: delete"
)
alert_group_action_triggered_signal.send(
sender=self.un_attach_by_delete,
log_record=log_record.pk,
action_source=None,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
def silence_by_user(
self, user: User, silence_delay: typing.Optional[int], action_source: typing.Optional[ActionSource] = None
@ -1086,25 +1050,23 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
# Update alert group state and response time metrics cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_SILENCE,
author=user,
silence_delay=silence_delay_timedelta,
reason="Silence button",
action_source=action_source,
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_SILENCE,
author=user,
silence_delay=silence_delay_timedelta,
reason="Silence button",
action_source=action_source,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
alert_group_action_triggered_signal.send(
sender=self.silence_by_user,
log_record=log_record.pk,
action_source=action_source,
)
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.silence_by_user(user, silence_delay, action_source)
@ -1120,26 +1082,24 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
if self.is_root_alert_group:
self.start_escalation_if_needed()
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_SILENCE,
author=user,
silence_delay=None,
# 2.Look like some time ago there was no TYPE_UN_SILENCE
reason="Unsilence button",
action_source=action_source,
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_SILENCE,
author=user,
silence_delay=None,
# 2.Look like some time ago there was no TYPE_UN_SILENCE
reason="Unsilence button",
action_source=action_source,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: {action_source}"
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
alert_group_action_triggered_signal.send(
sender=self.un_silence_by_user,
log_record=log_record.pk,
action_source=action_source,
)
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.un_silence_by_user(user, action_source=action_source)
@ -1169,22 +1129,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
# Update alert group state and response time metrics cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_WIPED,
author=user,
)
with transaction.atomic():
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_WIPED,
author=user,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: wipe"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: wipe"
)
alert_group_action_triggered_signal.send(
sender=self.wipe_by_user,
log_record=log_record.pk,
action_source=None,
)
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.wipe_by_user(user)
@ -1193,31 +1150,27 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
from apps.alerts.models import AlertGroupLogRecord
self.stop_escalation()
# prevent creating multiple logs
# filter instead of get_or_create cause it can be multiple logs of this type due deleting error
log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last()
if not log_record:
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_DELETED,
author=user,
with transaction.atomic():
# prevent creating multiple logs
# filter instead of get_or_create cause it can be multiple logs of this type due deleting error
log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last()
if not log_record:
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_DELETED,
author=user,
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: delete"
)
logger.debug(
f"send alert_group_action_triggered_signal for alert_group {self.pk}, "
f"log record {log_record.pk} with type '{log_record.get_type_display()}', "
f"action source: delete"
)
alert_group_action_triggered_signal.send(
sender=self.delete_by_user,
log_record=log_record.pk,
action_source=None, # TODO: Action source is none - it is suspicious
# this flag forces synchrony call for action handler in representatives
# (for now it is actual only for Slack representative)
force_sync=True,
)
transaction.on_commit(partial(send_alert_group_signal_for_delete.delay, self.pk, log_record.pk))
def finish_delete_by_user(self):
dependent_alerts = list(self.dependent_alert_groups.all())
self.hard_delete()

View file

@ -594,6 +594,12 @@ class AlertGroupLogRecord(models.Model):
step_specific_info = json.loads(self.step_specific_info)
return step_specific_info
def delete(self):
logger.debug(
f"alert_group_log_record for alert_group deleted" f"alert_group={self.alert_group.pk} log_id={self.pk}"
)
super().delete()
@receiver(post_save, sender=AlertGroupLogRecord)
def listen_for_alertgrouplogrecord(sender, instance, created, *args, **kwargs):

View file

@ -7,6 +7,8 @@ from .check_escalation_finished import check_escalation_finished_task # noqa: F
from .custom_button_result import custom_button_result # noqa: F401
from .custom_webhook_result import custom_webhook_result # noqa: F401
from .delete_alert_group import delete_alert_group # noqa: F401
from .delete_alert_group import finish_delete_alert_group # noqa: F401
from .delete_alert_group import send_alert_group_signal_for_delete # noqa: F401
from .distribute_alert import distribute_alert # noqa: F401
from .escalate_alert_group import escalate_alert_group # noqa: F401
from .invite_user_to_join_incident import invite_user_to_join_incident # noqa: F401

View file

@ -76,6 +76,7 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str
log_record = alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED, author=alert_group.acknowledged_by_user
)
task_logger.info(f"created log record {log_record.pk}, sending signal...")
transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))

View file

@ -1,6 +1,7 @@
from celery.utils.log import get_task_logger
from django.conf import settings
from apps.alerts.signals import alert_group_action_triggered_signal
from apps.slack.errors import SlackAPIRatelimitError
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
@ -10,7 +11,7 @@ logger = get_task_logger(__name__)
@shared_dedicated_queue_retry_task(
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
def delete_alert_group(alert_group_pk, user_pk):
def delete_alert_group(alert_group_pk: int, user_pk: int) -> None:
from apps.alerts.models import AlertGroup
from apps.user_management.models import User
@ -25,9 +26,35 @@ def delete_alert_group(alert_group_pk, user_pk):
return
logger.debug(f"User {user} is deleting alert group {alert_group} (channel: {alert_group.channel})")
alert_group.delete_by_user(user)
@shared_dedicated_queue_retry_task(
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
def send_alert_group_signal_for_delete(alert_group_pk: int, log_record_pk: int) -> None:
try:
alert_group.delete_by_user(user)
alert_group_action_triggered_signal.send(
sender=send_alert_group_signal_for_delete,
log_record=log_record_pk,
force_sync=True,
)
except SlackAPIRatelimitError as e:
# Handle Slack API ratelimit raised in apps.slack.scenarios.distribute_alerts.DeleteGroupStep.process_signal
delete_alert_group.apply_async((alert_group_pk, user_pk), countdown=e.retry_after)
send_alert_group_signal_for_delete.apply_async((alert_group_pk, log_record_pk), countdown=e.retry_after)
return
finish_delete_alert_group.apply_async((alert_group_pk,))
@shared_dedicated_queue_retry_task(
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
def finish_delete_alert_group(alert_group_pk: int) -> None:
from apps.alerts.models import AlertGroup
alert_group = AlertGroup.objects.filter(pk=alert_group_pk).first()
if not alert_group:
logger.debug(f"Alert group id={alert_group_pk} not found, already deleted")
return
alert_group.finish_delete_by_user()

View file

@ -5,13 +5,15 @@ from django.conf import settings
from apps.alerts.signals import alert_group_action_triggered_signal
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
from .task_logger import task_logger
@shared_dedicated_queue_retry_task(
autoretry_for=(Exception,), retry_backoff=True, max_retries=0 if settings.DEBUG else None
)
def send_alert_group_signal(log_record_id):
start_time = time.time()
task_logger.info(f"sending signal for log record {log_record_id}")
alert_group_action_triggered_signal.send(sender=send_alert_group_signal, log_record=log_record_id)
print("--- %s seconds ---" % (time.time() - start_time))

View file

@ -6,7 +6,11 @@ from apps.alerts.constants import ActionSource
from apps.alerts.incident_appearance.renderers.phone_call_renderer import AlertGroupPhoneCallRenderer
from apps.alerts.models import AlertGroup, AlertGroupLogRecord
from apps.alerts.tasks import wipe
from apps.alerts.tasks.delete_alert_group import delete_alert_group
from apps.alerts.tasks.delete_alert_group import (
delete_alert_group,
finish_delete_alert_group,
send_alert_group_signal_for_delete,
)
from apps.slack.client import SlackClient
from apps.slack.errors import SlackAPIMessageNotFoundError, SlackAPIRatelimitError
from apps.slack.models import SlackMessage
@ -85,9 +89,9 @@ def test_delete(
make_alert,
make_slack_message,
make_resolution_note_slack_message,
django_capture_on_commit_callbacks,
):
"""test alert group deleting"""
organization, slack_team_identity = make_organization_with_slack_team_identity()
user = make_user(organization=organization)
@ -119,7 +123,20 @@ def test_delete(
assert alert_group.slack_messages.count() == 1
assert alert_group.resolution_note_slack_messages.count() == 2
delete_alert_group(alert_group.pk, user.pk)
with patch(
"apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None
) as mock_send_alert_group_signal:
with django_capture_on_commit_callbacks(execute=True):
delete_alert_group(alert_group.pk, user.pk)
assert mock_send_alert_group_signal.call_count == 1
with patch(
"apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None
) as mock_finish_delete_alert_group:
send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args)
assert mock_finish_delete_alert_group.call_count == 1
finish_delete_alert_group(alert_group.pk)
assert not alert_group.alerts.exists()
assert not alert_group.slack_messages.exists()
@ -140,10 +157,10 @@ def test_delete(
@pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"])
@patch.object(delete_alert_group, "apply_async")
@patch.object(send_alert_group_signal_for_delete, "apply_async")
@pytest.mark.django_db
def test_delete_slack_ratelimit(
mock_delete_alert_group,
mock_send_alert_group_signal_for_delete,
api_method,
make_organization_with_slack_team_identity,
make_user,
@ -152,6 +169,7 @@ def test_delete_slack_ratelimit(
make_alert,
make_slack_message,
make_resolution_note_slack_message,
django_capture_on_commit_callbacks,
):
organization, slack_team_identity = make_organization_with_slack_team_identity()
user = make_user(organization=organization)
@ -180,17 +198,31 @@ def test_delete_slack_ratelimit(
ts="test2_ts",
)
with patch.object(
SlackClient,
api_method,
side_effect=SlackAPIRatelimitError(
response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42})
),
):
delete_alert_group(alert_group.pk, user.pk)
with patch(
"apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None
) as mock_send_alert_group_signal:
with django_capture_on_commit_callbacks(execute=True):
delete_alert_group(alert_group.pk, user.pk)
assert mock_send_alert_group_signal.call_count == 1
with patch(
"apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None
) as mock_finish_delete_alert_group:
with patch.object(
SlackClient,
api_method,
side_effect=SlackAPIRatelimitError(
response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42})
),
):
send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args)
assert mock_finish_delete_alert_group.call_count == 0
# Check task is retried gracefully
mock_delete_alert_group.assert_called_once_with((alert_group.pk, user.pk), countdown=42)
mock_send_alert_group_signal_for_delete.assert_called_once_with(
mock_send_alert_group_signal.call_args.args, countdown=42
)
@pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"])
@ -582,7 +614,7 @@ def test_filter_active_alert_groups(
@patch("apps.alerts.models.AlertGroup.hard_delete")
@patch("apps.alerts.models.AlertGroup.un_attach_by_delete")
@patch("apps.alerts.models.AlertGroup.stop_escalation")
@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal")
@patch("apps.alerts.tasks.delete_alert_group.alert_group_action_triggered_signal")
@pytest.mark.django_db
def test_delete_by_user(
mock_alert_group_action_triggered_signal,
@ -592,6 +624,7 @@ def test_delete_by_user(
make_organization_and_user,
make_alert_receive_channel,
make_alert_group,
django_capture_on_commit_callbacks,
):
organization, user = make_organization_and_user()
alert_receive_channel = make_alert_receive_channel(organization)
@ -603,20 +636,31 @@ def test_delete_by_user(
assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 0
alert_group.delete_by_user(user)
with patch(
"apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None
) as mock_send_alert_group_signal:
with django_capture_on_commit_callbacks(execute=True):
delete_alert_group(alert_group.pk, user.pk)
assert mock_send_alert_group_signal.call_count == 1
assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 1
deleted_log_record = alert_group.log_records.get(type=AlertGroupLogRecord.TYPE_DELETED)
alert_group.stop_escalation.assert_called_once_with()
with patch(
"apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None
) as mock_finish_delete_alert_group:
send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args)
assert mock_finish_delete_alert_group.call_count == 1
mock_alert_group_action_triggered_signal.send.assert_called_once_with(
sender=alert_group.delete_by_user,
sender=send_alert_group_signal_for_delete,
log_record=deleted_log_record.pk,
action_source=None,
force_sync=True,
)
finish_delete_alert_group(alert_group.pk)
alert_group.hard_delete.assert_called_once_with()
for dependent_alert_group in dependent_alert_groups:

View file

@ -31,7 +31,7 @@ def mock_apply_async(monkeypatch):
@patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async")
@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send")
@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send")
@pytest.mark.django_db
@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
def test_update_metric_alert_groups_total_cache_on_action(
@ -142,7 +142,7 @@ def test_update_metric_alert_groups_total_cache_on_action(
@patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async")
@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send")
@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send")
@pytest.mark.django_db
@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
def test_update_metric_alert_groups_response_time_cache_on_action(

View file

@ -2,6 +2,7 @@ import logging
from celery.utils.log import get_task_logger
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from apps.alerts.constants import ActionSource
from apps.alerts.representative import AlertGroupAbstractRepresentative
@ -49,14 +50,20 @@ def on_create_alert_slack_representative_async(alert_pk):
@shared_dedicated_queue_retry_task(
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
autoretry_for=(Exception,),
retry_backoff=True,
dont_autoretry_for=(ObjectDoesNotExist,),
max_retries=1 if settings.DEBUG else None,
)
def on_alert_group_action_triggered_async(log_record_id):
from apps.alerts.models import AlertGroupLogRecord
logger.debug(f"SLACK representative: get log record {log_record_id}")
try:
log_record = AlertGroupLogRecord.objects.get(pk=log_record_id)
except AlertGroupLogRecord.DoesNotExist as e:
logger.warning(f"SLACK representative: log record {log_record_id} never created or has been deleted")
raise e
log_record = AlertGroupLogRecord.objects.get(pk=log_record_id)
alert_group_id = log_record.alert_group_id
logger.debug(f"Start on_alert_group_action_triggered for alert_group {alert_group_id}, log record {log_record_id}")
instance = AlertGroupSlackRepresentative(log_record)
@ -145,16 +152,25 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative):
from apps.alerts.models import AlertGroupLogRecord
log_record = kwargs["log_record"]
action_source = kwargs.get("action_source")
force_sync = kwargs.get("force_sync", False)
if isinstance(log_record, AlertGroupLogRecord):
log_record_id = log_record.pk
else:
log_record_id = log_record
if action_source == ActionSource.SLACK or force_sync:
try:
log_record = AlertGroupLogRecord.objects.get(pk=log_record_id)
except AlertGroupLogRecord.DoesNotExist:
logger.warning(
f"on_alert_group_action_triggered: log record {log_record_id} never created or has been deleted"
)
return
if log_record.action_source == ActionSource.SLACK or force_sync:
logger.debug(f"SLACK on_alert_group_action_triggered: sync {log_record_id} {force_sync}")
on_alert_group_action_triggered_async(log_record_id)
else:
logger.debug(f"SLACK on_alert_group_action_triggered: async {log_record_id} {force_sync}")
on_alert_group_action_triggered_async.apply_async((log_record_id,))
@classmethod
@ -167,7 +183,11 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative):
alert_group_id = alert_group.pk
else:
alert_group_id = alert_group
alert_group = AlertGroup.objects.get(pk=alert_group_id)
try:
alert_group = AlertGroup.objects.get(pk=alert_group_id)
except AlertGroup.DoesNotExist as e:
logger.warning(f"SLACK update log report: alert group {alert_group_id} has been deleted")
raise e
logger.debug(
f"Received alert_group_update_log_report signal in SLACK representative for alert_group {alert_group_id}"

View file

@ -64,8 +64,13 @@ class AlertGroupTelegramRepresentative(AlertGroupAbstractRepresentative):
def on_alert_group_update_log_report(cls, **kwargs):
logger.info("AlertGroupTelegramRepresentative UPDATE LOG REPORT SIGNAL")
alert_group = kwargs["alert_group"]
if not isinstance(alert_group, AlertGroup):
alert_group = AlertGroup.objects.get(pk=alert_group)
try:
alert_group = AlertGroup.objects.get(pk=alert_group)
except AlertGroup.DoesNotExist as e:
logger.warning(f"Telegram update log report: alert group {alert_group} has been deleted")
raise e
messages_to_edit = alert_group.telegram_messages.filter(
message_type__in=(

View file

@ -3,6 +3,7 @@ import logging
from celery import uuid as celery_uuid
from celery.utils.log import get_task_logger
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from telegram import error
from apps.alerts.models import Alert, AlertGroup
@ -230,7 +231,10 @@ def on_create_alert_telegram_representative_async(self, alert_pk):
@shared_dedicated_queue_retry_task(
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
autoretry_for=(Exception,),
retry_backoff=True,
dont_autoretry_for=(ObjectDoesNotExist,),
max_retries=1 if settings.DEBUG else None,
)
def on_alert_group_action_triggered_async(log_record_id):
from apps.alerts.models import AlertGroupLogRecord
@ -239,18 +243,14 @@ def on_alert_group_action_triggered_async(log_record_id):
logger.info(f"AlertGroupTelegramRepresentative ACTION SIGNAL, log record {log_record_id}")
# temporary solution to handle cases when alert group and related log records were deleted
try:
log_record = AlertGroupLogRecord.objects.get(pk=log_record_id)
except AlertGroupLogRecord.DoesNotExist as e:
retries_count = on_alert_group_action_triggered_async.request.retries
if retries_count >= 10:
logger.error(
f"AlertGroupTelegramRepresentative: was not able to get AlertGroupLogRecord, probably alert group "
f"was deleted. log record {log_record_id}, retries: {retries_count}"
)
return
else:
raise e
logger.warning(
f"AlertGroupTelegramRepresentative: log record {log_record_id} never created or has been deleted"
)
raise e
instance = AlertGroupTelegramRepresentative(log_record)
if instance.is_applicable():

View file

@ -15,5 +15,9 @@ def on_action_triggered(**kwargs):
log_record = kwargs["log_record"]
if not isinstance(log_record, AlertGroupLogRecord):
log_record = AlertGroupLogRecord.objects.get(pk=log_record)
try:
log_record = AlertGroupLogRecord.objects.get(pk=log_record)
except AlertGroupLogRecord.DoesNotExist as e:
logger.warning(f"Webhook action triggered: log record {log_record} never created or has been deleted")
raise e
alert_group_status_change.apply_async((log_record.type, log_record.alert_group_id, log_record.author_id))

View file

@ -4,6 +4,8 @@ CELERY_TASK_ROUTES = {
"queue": "default"
},
"apps.alerts.tasks.delete_alert_group.delete_alert_group": {"queue": "default"},
"apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete": {"queue": "default"},
"apps.alerts.tasks.delete_alert_group.finish_delete_alert_group": {"queue": "default"},
"apps.alerts.tasks.invalidate_web_cache_for_alert_group.invalidate_web_cache_for_alert_group": {"queue": "default"},
"apps.alerts.tasks.send_alert_group_signal.send_alert_group_signal": {"queue": "default"},
"apps.alerts.tasks.wipe.wipe": {"queue": "default"},