diff --git a/CHANGELOG.md b/CHANGELOG.md index 06170595..cc534e85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Improvements for the columns selector ([3668](https://github.com/grafana/oncall/pull/3668)) +### Fixed + +- Address infinite retrying `apps.alerts.tasks.notify_user.perform_notification` task when `UserNotificationPolicyLogRecord` + object cannot be found by @joeyorlando ([#3708](https://github.com/grafana/oncall/pull/3708)) + ## v1.3.89 (2024-01-17) ### Fixed diff --git a/engine/apps/alerts/tasks/notify_user.py b/engine/apps/alerts/tasks/notify_user.py index 7578ba53..3f0fe218 100644 --- a/engine/apps/alerts/tasks/notify_user.py +++ b/engine/apps/alerts/tasks/notify_user.py @@ -196,7 +196,15 @@ def notify_user_task( if not stop_escalation: if notification_policy.step != UserNotificationPolicy.Step.WAIT: - transaction.on_commit(partial(perform_notification.apply_async, (log_record.pk,))) + + def _create_perform_notification_task(): + task = perform_notification.apply_async((log_record.pk,)) + task_logger.info( + f"Created perform_notification task {task} log_record={log_record.pk} " + f"alert_group={alert_group_pk}" + ) + + transaction.on_commit(_create_perform_notification_task) delay = NEXT_ESCALATION_DELAY if countdown is not None: @@ -232,17 +240,18 @@ def perform_notification(log_record_pk): from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord from apps.telegram.models import TelegramToUserConnector - # TODO: remove this log line once done investigation task_logger.info(f"perform_notification: log_record {log_record_pk}") - log_record = UserNotificationPolicyLogRecord.objects.get(pk=log_record_pk) + try: + log_record = UserNotificationPolicyLogRecord.objects.get(pk=log_record_pk) + except UserNotificationPolicyLogRecord.DoesNotExist: + task_logger.warning( + f"perform_notification: log_record {log_record_pk} doesn't exist. Skipping remainder of task. " + "The alert group associated with this log record may have been deleted." + ) + return - # TODO: uncomment this out once done investigation - # try: - # log_record = UserNotificationPolicyLogRecord.objects.get(pk=log_record_pk) - # except UserNotificationPolicyLogRecord.DoesNotExist: - # task_logger.info(f"perform_notification: log_record {log_record_pk} doesn't exist. Skipping remainder of task") - # return + task_logger.info(f"perform_notification: found record for {log_record_pk}") user = log_record.author alert_group = log_record.alert_group diff --git a/engine/apps/alerts/tests/test_notify_user.py b/engine/apps/alerts/tests/test_notify_user.py index 4643f0c2..e06585b1 100644 --- a/engine/apps/alerts/tests/test_notify_user.py +++ b/engine/apps/alerts/tests/test_notify_user.py @@ -285,3 +285,15 @@ def test_perform_notification_slack_prevent_posting( last_log_record.notification_error_code == UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_POSTING_TO_SLACK_IS_DISABLED ) + + +@pytest.mark.django_db +def test_perform_notification_missing_user_notification_policy_log_record(caplog): + invalid_pk = 12345 + perform_notification(invalid_pk) + + assert ( + f"perform_notification: log_record {invalid_pk} doesn't exist. Skipping remainder of task. " + "The alert group associated with this log record may have been deleted." + ) in caplog.text + assert f"perform_notification: found record for {invalid_pk}" not in caplog.text diff --git a/engine/common/custom_celery_tasks/dedicated_queue_retry_task.py b/engine/common/custom_celery_tasks/dedicated_queue_retry_task.py index 966c73d1..35e551a7 100644 --- a/engine/common/custom_celery_tasks/dedicated_queue_retry_task.py +++ b/engine/common/custom_celery_tasks/dedicated_queue_retry_task.py @@ -17,7 +17,7 @@ class DedicatedQueueRetryTask(LogExceptionOnFailureTask): def retry( self, args=None, kwargs=None, exc=None, throw=True, eta=None, countdown=None, max_retries=None, **options ): - logger.warn("Retrying celery task", exc_info=exc) + logger.warning("Retrying celery task", exc_info=exc) # Just call retry with queue argument return super().retry(