From 72e7224ad35430c18c5324683d907c7ec952de33 Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Tue, 9 Jan 2024 08:14:20 -0500 Subject: [PATCH] do not retry `firebase.messaging.UnregisteredError` exceptions for FCM relay tasks (#3637) # What this PR does _tldr_; we had a lengthy discussion about this [here](https://raintank-corp.slack.com/archives/C04JCU51NF8/p1701893410542629?thread_ts=1701690117.016909&cid=C04JCU51NF8). `firebase.messaging.UnregisteredError` errors occur because of events outside of our control and retrying will never fix them, therefore we should simply skip retrying in this case. We retry these fairly often ([logs](https://ops.grafana-ops.net/explore?schemaVersion=1&panes=%7B%22iWZ%22:%7B%22datasource%22:%22000000193%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%23%20%7Bcluster%3D~%5C%22prod-%28eu-west-0%7Cus-central-0%29%5C%22,%20namespace%3D%5C%22amixr-prod%5C%22%7D%20%7C%3D%20%5C%22task_name%3Dapps.webhooks.tasks.trigger_webhook.execute_webhook%5C%22%20%7C%3D%20%5C%22retry%5C%22%5Cn%7Bcluster%3D~%5C%22prod-%28eu-west-0%7Cus-central-0%29%5C%22,%20namespace%3D%5C%22amixr-prod%5C%22%7D%20%7C%3D%20%5C%22apps.mobile_app.fcm_relay.fcm_relay_async%5C%22%20%7C%3D%20%5C%22UnregisteredError%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22000000193%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-7d%22,%22to%22:%22now%22%7D%7D%7D&orgId=1)) which eats up unnecessary celery worker resources. 
Related to https://github.com/grafana/oncall-private/issues/1820 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) --- CHANGELOG.md | 4 +++ engine/apps/mobile_app/tests/test_utils.py | 35 ++++++++++++++++++++++ engine/apps/mobile_app/utils.py | 21 +++++++++---- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38465b87..81904255 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Changed + +- Do not retry `firebase.messaging.UnregisteredError` exceptions for FCM relay tasks by @joeyorlando ([#3637](https://github.com/grafana/oncall/pull/3637)) + ## v1.3.83 (2024-01-08) ### Changed diff --git a/engine/apps/mobile_app/tests/test_utils.py b/engine/apps/mobile_app/tests/test_utils.py index 4dc7d974..2db4d941 100644 --- a/engine/apps/mobile_app/tests/test_utils.py +++ b/engine/apps/mobile_app/tests/test_utils.py @@ -1,5 +1,6 @@ from unittest.mock import Mock, patch +import firebase_admin.messaging import pytest from firebase_admin.exceptions import FirebaseError from requests import HTTPError @@ -59,6 +60,40 @@ def test_send_push_notification_cloud_firebase_error( mock_send_message.assert_called_once_with(mock_message) +@patch.object(FCMDevice, "send_message") +@pytest.mark.parametrize( + "ExceptionClass,exception_kwargs", + [ + (firebase_admin.messaging.UnregisteredError, {"message": "test_error_message"}), + ], +) +@pytest.mark.django_db +def test_send_push_notification_cloud_ignores_certain_errors( + mock_send_message, + settings, + make_organization_and_user, + ExceptionClass, + exception_kwargs, +): + mock_send_message.return_value = ExceptionClass(**exception_kwargs) + + # create a user and connect a mobile device + _, 
user = make_organization_and_user() + device = FCMDevice.objects.create(user=user, registration_id="test_device_id") + mock_message = {"foo": "bar"} + + # check FCM is contacted directly when using the cloud license + settings.LICENSE = CLOUD_LICENSE_NAME + settings.IS_OPEN_SOURCE = False + + try: + utils.send_push_notification(device, mock_message) + except Exception: + pytest.fail(f"send_push_notification should not raise an exception for {ExceptionClass.__name__} errors") + + mock_send_message.assert_called_once_with(mock_message) + + @patch("apps.mobile_app.utils._send_push_notification_to_fcm_relay", return_value="ok") @pytest.mark.django_db def test_send_push_notification_oss( diff --git a/engine/apps/mobile_app/utils.py b/engine/apps/mobile_app/utils.py index a4637aa6..fbd4db3b 100644 --- a/engine/apps/mobile_app/utils.py +++ b/engine/apps/mobile_app/utils.py @@ -5,7 +5,7 @@ import typing import requests from django.conf import settings from firebase_admin.exceptions import FirebaseError -from firebase_admin.messaging import AndroidConfig, APNSConfig, APNSPayload, Message +from firebase_admin.messaging import AndroidConfig, APNSConfig, APNSPayload, Message, UnregisteredError from requests import HTTPError from rest_framework import status @@ -19,6 +19,13 @@ if typing.TYPE_CHECKING: MAX_RETRIES = 1 if settings.DEBUG else 10 + +# UnregisteredError +# App instance was unregistered from FCM. This usually means that the token used is no longer valid and a +# new one must be used. 
+# +# In other words, this error occurs outside of our control and retrying will never fix it, therefore we should skip retrying it +FIREBASE_ERRORS_TO_NOT_RETRY = (UnregisteredError,) logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -46,13 +53,15 @@ def send_message_to_fcm_device(device: "FCMDevice", message: Message) -> None: if isinstance(response, FirebaseError): logger.exception( - f"FCM error occured in mobile_app.utils.send_message_to_fcm_device\n" - f"FCMDevice info: {device}\n" - f"FirebaseError code: {response._code}\n" - f"FirebaseError cause: {response._cause}\n" - f"FirebaseError http_response: {response._http_response}\n" + f"FCM error occured in mobile_app.utils.send_message_to_fcm_device fcm_device_info={device} " + f"firebase_error_code={response._code} firebase_error_cause={response._cause} " + f"firebase_error_http_response={response._http_response}" ) + if isinstance(response, FIREBASE_ERRORS_TO_NOT_RETRY): + logger.warning(f"FCM error {response} is not being retried as we explicitly do not want to retry it") + return + raise response