oncall-engine/engine/apps/mobile_app/utils.py
Joey Orlando 72e7224ad3
do not retry firebase.messaging.UnregisteredError exceptions for FCM relay tasks (#3637)
# What this PR does

_tldr_; we had a lengthy discussion about this
[here](https://raintank-corp.slack.com/archives/C04JCU51NF8/p1701893410542629?thread_ts=1701690117.016909&cid=C04JCU51NF8).
`firebase.messaging.UnregisteredError` errors occur because of events
outside of our control and retrying will never fix them, therefore we
should simply skip retrying in this case.

We retry these fairly often
([logs](https://ops.grafana-ops.net/explore?schemaVersion=1&panes=%7B%22iWZ%22:%7B%22datasource%22:%22000000193%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%23%20%7Bcluster%3D~%5C%22prod-%28eu-west-0%7Cus-central-0%29%5C%22,%20namespace%3D%5C%22amixr-prod%5C%22%7D%20%7C%3D%20%5C%22task_name%3Dapps.webhooks.tasks.trigger_webhook.execute_webhook%5C%22%20%7C%3D%20%5C%22retry%5C%22%5Cn%7Bcluster%3D~%5C%22prod-%28eu-west-0%7Cus-central-0%29%5C%22,%20namespace%3D%5C%22amixr-prod%5C%22%7D%20%7C%3D%20%5C%22apps.mobile_app.fcm_relay.fcm_relay_async%5C%22%20%7C%3D%20%5C%22UnregisteredError%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22000000193%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-7d%22,%22to%22:%22now%22%7D%7D%7D&orgId=1))
which eats up unnecessary celery worker resources.

Related to https://github.com/grafana/oncall-private/issues/1820

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not
required)
2024-01-09 08:14:20 -05:00

154 lines
5.6 KiB
Python

import json
import logging
import typing
import requests
from django.conf import settings
from firebase_admin.exceptions import FirebaseError
from firebase_admin.messaging import AndroidConfig, APNSConfig, APNSPayload, Message, UnregisteredError
from requests import HTTPError
from rest_framework import status
from apps.base.utils import live_settings
from apps.mobile_app.types import FCMMessageData, MessageType
from common.api_helpers.utils import create_engine_url
if typing.TYPE_CHECKING:
from apps.mobile_app.models import FCMDevice
from apps.user_management.models import Organization
# Retry budget: a single attempt when settings.DEBUG is on, ten otherwise.
# NOTE(review): not referenced in this module's visible code — presumably imported by the
# push-notification tasks; verify against callers.
MAX_RETRIES = 1 if settings.DEBUG else 10
# UnregisteredError
# App instance was unregistered from FCM. This usually means that the token used is no longer valid and a
# new one must be used.
#
# In other words, this error occurs outside of our control and retrying will never fix it, therefore we should skip
# retrying it.
#
# This tuple is passed directly to isinstance(), so add further exception classes here to exempt them as well.
FIREBASE_ERRORS_TO_NOT_RETRY = (UnregisteredError,)
logger = logging.getLogger(__name__)
# Set this module's logger level to DEBUG so its logger.debug() calls are not dropped by the
# logger's own level (handlers may still apply their own filtering).
logger.setLevel(logging.DEBUG)
def _send_push_notification_to_fcm_relay(message: Message) -> requests.Response:
    """
    Forward ``message`` to the FCM relay on the cloud instance (apps.mobile_app.fcm_relay.FCMRelayView).

    Returns the relay's HTTP response. ``raise_for_status`` is called on it first, so a 4xx/5xx
    answer surfaces as ``requests.HTTPError`` instead of being returned.
    """
    relay_url = create_engine_url("mobile_app/v1/fcm_relay", override_base=settings.GRAFANA_CLOUD_ONCALL_API_URL)
    auth_headers = {"Authorization": live_settings.GRAFANA_CLOUD_ONCALL_TOKEN}
    # The message's string form is parsed back with json.loads so it is posted as a JSON object.
    payload = json.loads(str(message))

    relay_response = requests.post(relay_url, headers=auth_headers, json=payload)
    relay_response.raise_for_status()
    return relay_response
def send_message_to_fcm_device(device: "FCMDevice", message: Message) -> None:
    """
    Send ``message`` to ``device`` and surface any error that FCM reports.

    ``device.send_message`` may hand back a ``FirebaseError`` as its *return value* (it is not
    raised), so the result is inspected here:

    - no error: return normally;
    - an error listed in ``FIREBASE_ERRORS_TO_NOT_RETRY``: log it and return, so callers do not retry;
    - any other ``FirebaseError``: log it and raise it, so callers can retry.

    https://firebase.google.com/docs/cloud-messaging/http-server-ref#interpret-downstream

    Raises:
        FirebaseError: when FCM reports an error that is not in ``FIREBASE_ERRORS_TO_NOT_RETRY``.
    """
    response = device.send_message(message)
    logger.debug(f"FCM response: {response}")

    if not isinstance(response, FirebaseError):
        return

    # No exception is being handled at this point (the error was returned, not raised), so
    # logger.exception() would attach an empty "NoneType: None" traceback. Log at the same ERROR
    # level and pass the error explicitly via exc_info instead. The error details are read through
    # FirebaseError's public properties rather than its private attributes.
    logger.error(
        f"FCM error occured in mobile_app.utils.send_message_to_fcm_device fcm_device_info={device} "
        f"firebase_error_code={response.code} firebase_error_cause={response.cause} "
        f"firebase_error_http_response={response.http_response}",
        exc_info=response,
    )

    if isinstance(response, FIREBASE_ERRORS_TO_NOT_RETRY):
        logger.warning(f"FCM error {response} is not being retried as we explicitly do not want to retry it")
        return

    raise response
def send_push_notification(
    device_to_notify: "FCMDevice", message: Message, error_cb: typing.Optional[typing.Callable[..., None]] = None
) -> bool:
    """
    Deliver ``message`` to ``device_to_notify``, directly via FCM or through the cloud FCM relay.

    When ``settings.IS_OPEN_SOURCE`` is false the message is sent straight to the device. Otherwise
    it is posted to the FCM relay, which requires a cloud connection.

    Returns ``True`` when no error was raised, and ``False`` for the two failures that are not worth
    retrying: no cloud connection, or an HTTP 4xx answer from the relay. In both ``False`` cases
    ``error_cb`` (if given) is invoked with no arguments before the error is logged. Any other
    exception (including relay 5xx errors) propagates to the caller.
    """
    logger.debug(f"Sending push notification to device type {device_to_notify.type} with message: {message}")

    if not settings.IS_OPEN_SOURCE:
        send_message_to_fcm_device(device_to_notify, message)
        # notification succeeded (otherwise raised exception before)
        return True

    # FCM relay uses cloud connection to send push notifications
    from apps.oss_installation.models import CloudConnector

    if not CloudConnector.objects.exists():
        if error_cb:
            error_cb()
        logger.error("Error while sending a mobile push notification: not connected to cloud")
        return False

    try:
        relay_response = _send_push_notification_to_fcm_relay(message)
        logger.debug(f"FCM relay response: {relay_response}")
    except HTTPError as http_error:
        relay_status = http_error.response.status_code
        is_client_error = status.HTTP_400_BAD_REQUEST <= relay_status < status.HTTP_500_INTERNAL_SERVER_ERROR
        if not is_client_error:
            raise

        # do not retry on HTTP client errors (4xx errors)
        if error_cb:
            error_cb()
        logger.error(f"Error while sending a mobile push notification: HTTP client error {relay_status}")
        return False

    # notification succeeded (otherwise raised exception before)
    return True
def construct_fcm_message(
    message_type: MessageType,
    device_to_notify: "FCMDevice",
    thread_id: str,
    data: FCMMessageData,
    apns_payload: typing.Optional[APNSPayload] = None,
) -> Message:
    """
    Build the FCM ``Message`` addressed to ``device_to_notify``.

    The message's data is ``data`` plus the ``"type"`` and ``"thread_id"`` entries (which win over
    same-named keys in ``data``). Android delivery is set to high priority and the APNS
    ``apns-priority`` header to ``"10"``. ``apns_payload`` is attached to the APNS config only when
    it is not ``None``.
    """
    token = device_to_notify.registration_id

    # from the docs..
    # A dictionary of data fields (optional). All keys and values in the dictionary must be strings
    message_data = {**data}
    message_data["type"] = message_type
    message_data["thread_id"] = thread_id

    # from the docs
    # https://firebase.google.com/docs/cloud-messaging/concept-options#setting-the-priority-of-a-message
    #
    # Normal priority.
    # Normal priority messages are delivered immediately when the app is in the foreground.
    # For backgrounded apps, delivery may be delayed. For less time-sensitive messages, such as notifications
    # of new email, keeping your UI in sync, or syncing app data in the background, choose normal delivery
    # priority.
    #
    # High priority.
    # FCM attempts to deliver high priority messages immediately even if the device is in Doze mode.
    # High priority messages are for time-sensitive, user visible content.
    android_config = AndroidConfig(priority="high")

    # From the docs
    # https://firebase.google.com/docs/cloud-messaging/concept-options#setting-the-priority-of-a-message
    apns_options: typing.Dict[str, typing.Any] = {"headers": {"apns-priority": "10"}}
    if apns_payload is not None:
        apns_options["payload"] = apns_payload

    return Message(
        token=token,
        data=message_data,
        android=android_config,
        apns=APNSConfig(**apns_options),
    )
def add_stack_slug_to_message_title(title: str, organization: "Organization") -> str:
    """Return ``title`` prefixed with the organization's stack slug in square brackets."""
    prefix = f"[{organization.stack_slug}]"
    return f"{prefix} {title}"