fix infinitely retrying apps.alerts.tasks.notify_user.perform_notification task (#3708)

# Which issue(s) this PR fixes

Closes https://github.com/grafana/oncall-private/issues/2318

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not
required)
This commit is contained in:
Joey Orlando 2024-01-18 07:07:01 -05:00 committed by GitHub
parent b312667d31
commit 16b648bd15
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 36 additions and 10 deletions

View file

@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Improvements for the columns selector ([3668](https://github.com/grafana/oncall/pull/3668))
### Fixed
- Address infinite retrying `apps.alerts.tasks.notify_user.perform_notification` task when `UserNotificationPolicyLogRecord`
object cannot be found by @joeyorlando ([#3708](https://github.com/grafana/oncall/pull/3708))
## v1.3.89 (2024-01-17)
### Fixed

View file

@ -196,7 +196,15 @@ def notify_user_task(
if not stop_escalation:
if notification_policy.step != UserNotificationPolicy.Step.WAIT:
transaction.on_commit(partial(perform_notification.apply_async, (log_record.pk,)))
def _create_perform_notification_task():
task = perform_notification.apply_async((log_record.pk,))
task_logger.info(
f"Created perform_notification task {task} log_record={log_record.pk} "
f"alert_group={alert_group_pk}"
)
transaction.on_commit(_create_perform_notification_task)
delay = NEXT_ESCALATION_DELAY
if countdown is not None:
@ -232,17 +240,18 @@ def perform_notification(log_record_pk):
from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord
from apps.telegram.models import TelegramToUserConnector
# TODO: remove this log line once done investigation
task_logger.info(f"perform_notification: log_record {log_record_pk}")
log_record = UserNotificationPolicyLogRecord.objects.get(pk=log_record_pk)
try:
log_record = UserNotificationPolicyLogRecord.objects.get(pk=log_record_pk)
except UserNotificationPolicyLogRecord.DoesNotExist:
task_logger.warning(
f"perform_notification: log_record {log_record_pk} doesn't exist. Skipping remainder of task. "
"The alert group associated with this log record may have been deleted."
)
return
# TODO: uncomment this out once done investigation
# try:
# log_record = UserNotificationPolicyLogRecord.objects.get(pk=log_record_pk)
# except UserNotificationPolicyLogRecord.DoesNotExist:
# task_logger.info(f"perform_notification: log_record {log_record_pk} doesn't exist. Skipping remainder of task")
# return
task_logger.info(f"perform_notification: found record for {log_record_pk}")
user = log_record.author
alert_group = log_record.alert_group

View file

@ -285,3 +285,15 @@ def test_perform_notification_slack_prevent_posting(
last_log_record.notification_error_code
== UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_POSTING_TO_SLACK_IS_DISABLED
)
@pytest.mark.django_db
def test_perform_notification_missing_user_notification_policy_log_record(caplog):
invalid_pk = 12345
perform_notification(invalid_pk)
assert (
f"perform_notification: log_record {invalid_pk} doesn't exist. Skipping remainder of task. "
"The alert group associated with this log record may have been deleted."
) in caplog.text
assert f"perform_notification: found record for {invalid_pk}" not in caplog.text

View file

@ -17,7 +17,7 @@ class DedicatedQueueRetryTask(LogExceptionOnFailureTask):
def retry(
self, args=None, kwargs=None, exc=None, throw=True, eta=None, countdown=None, max_retries=None, **options
):
logger.warn("Retrying celery task", exc_info=exc)
logger.warning("Retrying celery task", exc_info=exc)
# Just call retry with queue argument
return super().retry(