do not retry firebase.messaging.UnregisteredError exceptions for FCM relay tasks (#3637)

# What this PR does

_tldr_; we had a lengthy discussion about this
[here](https://raintank-corp.slack.com/archives/C04JCU51NF8/p1701893410542629?thread_ts=1701690117.016909&cid=C04JCU51NF8).
`firebase.messaging.UnregisteredError` errors occur because of events
outside of our control and retrying will never fix them, therefore we
should simply skip retrying in this case.

We retry these fairly often
([logs](https://ops.grafana-ops.net/explore?schemaVersion=1&panes=%7B%22iWZ%22:%7B%22datasource%22:%22000000193%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%23%20%7Bcluster%3D~%5C%22prod-%28eu-west-0%7Cus-central-0%29%5C%22,%20namespace%3D%5C%22amixr-prod%5C%22%7D%20%7C%3D%20%5C%22task_name%3Dapps.webhooks.tasks.trigger_webhook.execute_webhook%5C%22%20%7C%3D%20%5C%22retry%5C%22%5Cn%7Bcluster%3D~%5C%22prod-%28eu-west-0%7Cus-central-0%29%5C%22,%20namespace%3D%5C%22amixr-prod%5C%22%7D%20%7C%3D%20%5C%22apps.mobile_app.fcm_relay.fcm_relay_async%5C%22%20%7C%3D%20%5C%22UnregisteredError%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22000000193%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-7d%22,%22to%22:%22now%22%7D%7D%7D&orgId=1))
which eats up unnecessary celery worker resources.

Related to https://github.com/grafana/oncall-private/issues/1820

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not
required)
This commit is contained in:
Joey Orlando 2024-01-09 08:14:20 -05:00 committed by GitHub
parent 9fab4a7b06
commit 72e7224ad3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 6 deletions

View file

@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased
### Changed
- Do not retry `firebase.messaging.UnregisteredError` exceptions for FCM relay tasks by @joeyorlando ([#3637](https://github.com/grafana/oncall/pull/3637))
## v1.3.83 (2024-01-08)
### Changed

View file

@ -1,5 +1,6 @@
from unittest.mock import Mock, patch
import firebase_admin.messaging
import pytest
from firebase_admin.exceptions import FirebaseError
from requests import HTTPError
@ -59,6 +60,40 @@ def test_send_push_notification_cloud_firebase_error(
mock_send_message.assert_called_once_with(mock_message)
@patch.object(FCMDevice, "send_message")
@pytest.mark.parametrize(
"ExceptionClass,exception_kwargs",
[
(firebase_admin.messaging.UnregisteredError, {"message": "test_error_message"}),
],
)
@pytest.mark.django_db
def test_send_push_notification_cloud_ignores_certain_errors(
mock_send_message,
settings,
make_organization_and_user,
ExceptionClass,
exception_kwargs,
):
mock_send_message.return_value = ExceptionClass(**exception_kwargs)
# create a user and connect a mobile device
_, user = make_organization_and_user()
device = FCMDevice.objects.create(user=user, registration_id="test_device_id")
mock_message = {"foo": "bar"}
# check FCM is contacted directly when using the cloud license
settings.LICENSE = CLOUD_LICENSE_NAME
settings.IS_OPEN_SOURCE = False
try:
utils.send_push_notification(device, mock_message)
except Exception:
pytest.fail(f"send_push_notification should not raise an exception for {ExceptionClass.__name__} errors")
mock_send_message.assert_called_once_with(mock_message)
@patch("apps.mobile_app.utils._send_push_notification_to_fcm_relay", return_value="ok")
@pytest.mark.django_db
def test_send_push_notification_oss(

View file

@ -5,7 +5,7 @@ import typing
import requests
from django.conf import settings
from firebase_admin.exceptions import FirebaseError
from firebase_admin.messaging import AndroidConfig, APNSConfig, APNSPayload, Message
from firebase_admin.messaging import AndroidConfig, APNSConfig, APNSPayload, Message, UnregisteredError
from requests import HTTPError
from rest_framework import status
@ -19,6 +19,13 @@ if typing.TYPE_CHECKING:
MAX_RETRIES = 1 if settings.DEBUG else 10
# UnregisteredError
# App instance was unregistered from FCM. This usually means that the token used is no longer valid and a
# new one must be used.
#
# In other words, this error occurs outside of our control and retrying will never fix it, therefore we should skip
FIREBASE_ERRORS_TO_NOT_RETRY = (UnregisteredError,)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
@ -46,13 +53,15 @@ def send_message_to_fcm_device(device: "FCMDevice", message: Message) -> None:
if isinstance(response, FirebaseError):
logger.exception(
f"FCM error occured in mobile_app.utils.send_message_to_fcm_device\n"
f"FCMDevice info: {device}\n"
f"FirebaseError code: {response._code}\n"
f"FirebaseError cause: {response._cause}\n"
f"FirebaseError http_response: {response._http_response}\n"
f"FCM error occured in mobile_app.utils.send_message_to_fcm_device fcm_device_info={device} "
f"firebase_error_code={response._code} firebase_error_cause={response._cause} "
f"firebase_error_http_response={response._http_response}"
)
if isinstance(response, FIREBASE_ERRORS_TO_NOT_RETRY):
logger.warning(f"FCM error {response} is not being retried as we explicitly do not want to retry it")
return
raise response