oncall-engine/engine/apps/telegram/tasks.py
Joey Orlando 0c96427cfc
fix apps.telegram.tasks.send_log_and_actions_message retrying tasks (#4851)
# What this PR does

It _appears_ like Telegram may have changed one of the error messages
they return for `telegram.error.BadRequest`. This _may_ be causing us to
infinitely retry some of these tasks.

Previously we were checking for two variants of the same type of error
message:
- "Message to reply not found"
- "Replied message not found"

_However_, if I search for the following [in the
logs](https://ops.grafana-ops.net/goto/hMgBb8CSR?orgId=1):
```logql
{namespace="amixr-prod"} |~ `(Message to be replied not found|Message to reply not found|Replied message not found)`
```
I _only_ see references to "Message to be replied not found". I have
updated references to the former to this new error log message we are
seeing.

Also:
- deduplicate some of the words we check for in
`telegram.error.BadRequest` and `telegram.error.Unauthorized` into
`apps.telegram.client.TelegramClient.BadRequestMessage` and
`apps.telegram.client.TelegramClient.UnauthorizedMessage` respectively
- deduplicate some of the wording we use in the `reason` arg passed to
`TelegramToUserConnector.create_telegram_notification_error` into
`apps.telegram.models.connectors.personal.TelegramToUserConnector.NotificationErrorReason`
- standardize how we check the `message` attribute of
`telegram.error.TelegramError`s into a new `error_message_is` static
method on `apps.telegram.client.TelegramClient`
- previously we would check these error messages in two different ways:
  ```python3
  # style 1
  if "error message to check" in e.message:
    # do something

  # style 2
  if error.message == "error message to check":
    # do something
  ```

## Which issue(s) this PR closes

Closes https://github.com/grafana/oncall-private/issues/2868

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
    show up in the autogenerated release notes.
2024-08-19 14:05:40 -04:00

257 lines
10 KiB
Python

import logging
from celery import uuid as celery_uuid
from celery.utils.log import get_task_logger
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from telegram import error
from apps.alerts.models import Alert, AlertGroup
from apps.base.models import UserNotificationPolicy
from apps.telegram.client import TelegramClient
from apps.telegram.decorators import (
handle_missing_token,
ignore_bot_deleted,
ignore_message_to_edit_deleted,
ignore_message_unchanged,
ignore_reply_to_message_deleted,
)
from apps.telegram.models import TelegramMessage, TelegramToOrganizationConnector
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
from common.utils import OkToRetry
# Module-wide logger obtained through Celery's get_task_logger helper; used by every task below.
logger = get_task_logger(__name__)
# Let DEBUG records through as well (e.g. the "Dropping the task ..." message logged by edit_message).
logger.setLevel(logging.DEBUG)
@shared_dedicated_queue_retry_task(
    autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
@handle_missing_token
def register_telegram_webhook(token=None):
    """
    Register the Telegram webhook for the bot identified by ``token``.

    Does nothing when ``settings.FEATURE_TELEGRAM_LONG_POLLING_ENABLED`` is set.
    ``error.InvalidToken``, ``error.Unauthorized`` and ``error.BadRequest`` are logged as a warning and
    swallowed; any other exception propagates (the task is declared with ``autoretry_for=(Exception,)``).

    :param token: bot token handed to ``TelegramClient``
        (NOTE(review): presumably the client falls back to the configured token when ``None`` - confirm)
    """
    if settings.FEATURE_TELEGRAM_LONG_POLLING_ENABLED:
        return

    telegram_client = TelegramClient(token=token)

    try:
        telegram_client.register_webhook()
    except (error.InvalidToken, error.Unauthorized, error.BadRequest) as e:
        # The bot token is a credential: never write it to the logs in full. Keep only a short suffix so
        # operators can still tell which token was rejected.
        token_text = str(telegram_client.token or "")
        token_hint = f"***{token_text[-4:]}" if len(token_text) > 8 else "***"
        logger.warning(f"Tried to register Telegram webhook using token: {token_hint}, got error: {e}")
@shared_dedicated_queue_retry_task(
    bind=True, autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
@ignore_message_unchanged
@ignore_message_to_edit_deleted
@ignore_bot_deleted
def edit_message(self, message_pk):
    """
    Edit an already sent Telegram message, letting only the most recently scheduled edit task run.

    ``TelegramMessage.edit_task_id`` holds the id of the Celery task that is allowed to perform the edit:

    - when it is ``None``, a fresh task id is stored and the task re-enqueues itself under that id;
    - when it differs from the id of the running task, the running task is dropped;
    - otherwise the edit is performed and the field is reset to ``None``.

    ``error.RetryAfter`` / ``error.TimedOut`` re-enqueue the edit under a new task id, delayed by the
    exception's ``retry_after`` attribute (3 when the attribute is absent). ``error.BadRequest`` is never
    propagated from the edit call: "message is not modified" is ignored silently, anything else is logged.

    :param message_pk: primary key of the ``TelegramMessage`` to edit
    """
    message = TelegramMessage.objects.get(pk=message_pk)
    telegram_client = TelegramClient()

    # if edit_task_id was not set at the time task was invoked, assign it and rerun the task
    if message.edit_task_id is None:
        task_id = celery_uuid()
        message.edit_task_id = task_id
        message.save(update_fields=["edit_task_id"])

        edit_message.apply_async((message_pk,), task_id=task_id)
        return

    if message.edit_task_id != edit_message.request.id:
        logger.debug("Dropping the task since another task was scheduled already.")
        return

    try:
        telegram_client.edit_message(message=message)
    except error.BadRequest as e:
        if not TelegramClient.error_message_is(e, [TelegramClient.BadRequestMessage.MESSAGE_IS_NOT_MODIFIED]):
            # Editing stays best-effort (the error is deliberately not re-raised, so no retry is triggered),
            # but an unexpected BadRequest must at least be visible instead of vanishing silently.
            logger.warning(f"Could not edit Telegram message {message_pk} due to '{e.message}'. Skipping the edit")
    except (error.RetryAfter, error.TimedOut) as e:
        # error.TimedOut is not expected to carry retry_after, hence the default
        countdown = getattr(e, "retry_after", 3)

        task_id = celery_uuid()
        message.edit_task_id = task_id
        message.save(update_fields=["edit_task_id"])

        edit_message.apply_async((message_pk,), countdown=countdown, task_id=task_id)
        return

    # the edit is done (or skipped): release the slot so that the next edit request can claim it
    message.edit_task_id = None
    message.save(update_fields=["edit_task_id"])
@shared_dedicated_queue_retry_task(bind=True, autoretry_for=(Exception,), retry_backoff=True, max_retries=None)
def send_link_to_channel_message_or_fallback_to_full_alert_group(
    self, alert_group_pk, notification_policy_pk, user_connector_pk
):
    """
    Notify a user in Telegram by linking to the alert group message posted in the channel.

    While the task has been retried at most 10 times it asks the user connector to send a link to the
    channel message; once that budget is exceeded it sends the full alert group to the user instead.
    A missing ``TelegramToUserConnector`` or ``UserNotificationPolicy`` is logged as a warning and ends
    the task without raising.

    :param alert_group_pk: primary key of the ``AlertGroup`` to notify about
    :param notification_policy_pk: primary key of the ``UserNotificationPolicy`` that triggered the notification
    :param user_connector_pk: primary key of the ``TelegramToUserConnector`` of the user to notify
    """
    # imported locally rather than at module level (NOTE(review): presumably to avoid a circular import)
    from apps.telegram.models import TelegramToUserConnector

    try:
        user_connector = TelegramToUserConnector.objects.get(pk=user_connector_pk)
        # NOTE(review): AlertGroup.DoesNotExist is not handled below; with autoretry_for=(Exception,) and
        # max_retries=None a deleted alert group looks like it would be retried indefinitely - confirm.
        alert_group = AlertGroup.objects.get(pk=alert_group_pk)
        notification_policy = UserNotificationPolicy.objects.get(pk=notification_policy_pk)

        # probably telegram message just didn't appear in Telegram channel yet
        if self.request.retries <= 10:
            user_connector.send_link_to_channel_message(
                alert_group=alert_group, notification_policy=notification_policy
            )
        else:
            # seems like the message won't appear in Telegram channel, so send the full alert group to user
            user_connector.send_full_alert_group(alert_group=alert_group, notification_policy=notification_policy)
    except TelegramToUserConnector.DoesNotExist:
        # Handle cases when user deleted the bot while escalation is active
        logger.warning(
            f"TelegramToUserConnector {user_connector_pk} not found. "
            "Most probably it was deleted while escalation was in progress. "
            f"alert_group {alert_group_pk}"
        )
    except UserNotificationPolicy.DoesNotExist:
        logger.warning(
            f"UserNotificationPolicy {notification_policy_pk} does not exist for alert group {alert_group_pk}"
        )
@shared_dedicated_queue_retry_task(
    bind=True, autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
@handle_missing_token
@ignore_reply_to_message_deleted
@ignore_bot_deleted
def send_log_and_actions_message(self, channel_chat_id, group_chat_id, channel_message_id, reply_to_message_id):
    """
    Post the log message and the actions message of an alert group into the Telegram discussion group.

    The channel message is looked up by ``channel_chat_id`` / ``channel_message_id``. If it has no
    ``discussion_group_message_id`` yet, ``reply_to_message_id`` is stored there. Each of the two message
    types is sent only when the alert group has no Telegram message of that type yet.

    :param channel_chat_id: chat id of the channel the alert group message was posted to
    :param group_chat_id: chat id the log and actions messages are sent to
    :param channel_message_id: message id of the alert group message inside the channel
    :param reply_to_message_id: id of the message the new messages reply to
    """

    def retry_delay(exc):
        # error.TimedOut is not expected to carry retry_after, hence the default of 3
        return getattr(exc, "retry_after", 3)

    with OkToRetry(task=self, exc=TelegramMessage.DoesNotExist, num_retries=5):
        try:
            channel_message = TelegramMessage.objects.get(chat_id=channel_chat_id, message_id=channel_message_id)
        except TelegramMessage.DoesNotExist:
            # the channel message may simply not be stored yet: re-raise for the first few attempts
            if self.request.retries <= 5:
                raise
            logger.warning(
                f"Could not send log and actions message, telegram message does not exist "
                f" chat_id={channel_chat_id} message_id={channel_message_id}"
            )
            return

        if channel_message.discussion_group_message_id is None:
            channel_message.discussion_group_message_id = reply_to_message_id
            channel_message.save(update_fields=["discussion_group_message_id"])

        alert_group = channel_message.alert_group

        # Decide what is still missing before anything is sent (log message first, then actions message).
        missing_message_types = [
            message_type
            for message_type in (TelegramMessage.LOG_MESSAGE, TelegramMessage.ACTIONS_MESSAGE)
            if not alert_group.telegram_messages.filter(message_type=message_type).exists()
        ]

        telegram_client = TelegramClient()
        with OkToRetry(task=self, exc=(error.RetryAfter, error.TimedOut), compute_countdown=retry_delay):
            try:
                for message_type in missing_message_types:
                    telegram_client.send_message(
                        chat_id=group_chat_id,
                        message_type=message_type,
                        alert_group=alert_group,
                        reply_to_message_id=reply_to_message_id,
                    )
            except error.BadRequest as e:
                gone_for_good = TelegramClient.error_message_is(
                    e,
                    [
                        TelegramClient.BadRequestMessage.CHAT_NOT_FOUND,
                        TelegramClient.BadRequestMessage.MESSAGE_TO_BE_REPLIED_NOT_FOUND,
                    ],
                )
                if not gone_for_good:
                    raise

                # the chat or the message to reply to is missing: give up quietly instead of re-raising
                logger.warning(
                    f"Could not send log and actions messages to Telegram group with id {group_chat_id} "
                    f"due to '{e.message}'. alert_group {alert_group.pk}"
                )
                return
@shared_dedicated_queue_retry_task(
    bind=True, autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
@handle_missing_token
@ignore_bot_deleted
@ignore_reply_to_message_deleted
def on_create_alert_telegram_representative_async(self, alert_pk):
    """
    Reflect a newly created alert in Telegram.

    Runs as a separate task so that Telegram downtime or formatting issues do not delay SMS and other
    destinations.

    If the alert group has no alert group / personal / formatting error message yet and a channel is
    configured for it, the alert group message is sent to that channel. After that an ``edit_message``
    task is scheduled for every alert group or personal message of the alert group.

    :param alert_pk: primary key of the ``Alert`` that was created
    """
    try:
        alert = Alert.objects.get(pk=alert_pk)
    except Alert.DoesNotExist:
        # re-raise while fewer than 10 retries have happened, then give up
        if on_create_alert_telegram_representative_async.request.retries < 10:
            raise
        logger.error(f"Alert {alert_pk} was not found. Probably it was deleted. Stop retrying")
        return

    alert_group = alert.group

    # Lazy queryset: it is evaluated by .exists() below and again, filtered, when iterated at the end.
    existing_messages = alert_group.telegram_messages.filter(
        message_type__in=[
            TelegramMessage.ALERT_GROUP_MESSAGE,
            TelegramMessage.PERSONAL_MESSAGE,
            TelegramMessage.FORMATTING_ERROR,
        ]
    )

    # TODO: discuss moving this logic into .send_alert_group_message
    telegram_channel = TelegramToOrganizationConnector.get_channel_for_alert_group(alert_group)

    needs_channel_message = telegram_channel is not None and not existing_messages.exists()
    if needs_channel_message:
        with OkToRetry(
            task=self,
            exc=(error.RetryAfter, error.TimedOut),
            compute_countdown=lambda e: getattr(e, "retry_after", 3),
        ):
            telegram_channel.send_alert_group_message(alert_group)

    editable_types = (
        TelegramMessage.ALERT_GROUP_MESSAGE,
        TelegramMessage.PERSONAL_MESSAGE,
    )
    for telegram_message in existing_messages.filter(message_type__in=editable_types):
        edit_message.delay(message_pk=telegram_message.pk)
@shared_dedicated_queue_retry_task(
    autoretry_for=(Exception,),
    retry_backoff=True,
    dont_autoretry_for=(ObjectDoesNotExist,),
    max_retries=1 if settings.DEBUG else None,
)
def on_alert_group_action_triggered_async(log_record_id):
    """
    Run the Telegram handler for an alert group action described by a log record.

    The log record is loaded and wrapped in ``AlertGroupTelegramRepresentative``; when the representative
    reports it is applicable, its handler is fetched and called.

    :param log_record_id: primary key of the ``AlertGroupLogRecord`` describing the action
    :raises AlertGroupLogRecord.DoesNotExist: when the log record cannot be found; it is logged first
    """
    from apps.alerts.models import AlertGroupLogRecord

    from .alert_group_representative import AlertGroupTelegramRepresentative

    logger.info(f"AlertGroupTelegramRepresentative ACTION SIGNAL, log record {log_record_id}")

    # temporary solution to handle cases when alert group and related log records were deleted
    try:
        log_record = AlertGroupLogRecord.objects.get(pk=log_record_id)
    except AlertGroupLogRecord.DoesNotExist:
        logger.warning(
            f"AlertGroupTelegramRepresentative: log record {log_record_id} never created or has been deleted"
        )
        # the task lists ObjectDoesNotExist in dont_autoretry_for, so this is not expected to be retried
        raise

    representative = AlertGroupTelegramRepresentative(log_record)
    if not representative.is_applicable():
        return

    run_handler = representative.get_handler()
    run_handler()