oncall-engine/engine/apps/alerts/tasks/acknowledge_reminder.py
Yulya Artyukhina 8420cfd822
Fix acknowledge reminder task (#5179)
# What this PR does
- Adds a 10-minute lock for the acknowledge reminder task to prevent duplicate
tasks, which cause multiple reminder messages to be posted and flood
Slack threads.
- Adds a new signal for the acknowledge reminder task instead of using
`alert_group_action_triggered_signal`, since it is used only to post the
reminder message in the Slack thread and does not need to be processed by
other representatives.

## Which issue(s) this PR closes

Related to https://github.com/grafana/oncall-private/issues/2953

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
    show up in the autogenerated release notes.
2024-10-16 12:13:28 +00:00

166 lines
7.8 KiB
Python

from datetime import timedelta
from functools import partial
from django.conf import settings
from django.core.cache import cache
from django.db import transaction
from django.utils import timezone
from apps.alerts.signals import post_ack_reminder_message_signal
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
from .send_alert_group_signal import send_alert_group_signal
from .task_logger import task_logger
# Retry limit passed as `max_retries` to the task decorators in this module:
# a single retry when settings.DEBUG is truthy, otherwise 10.
MAX_RETRIES = 1 if settings.DEBUG else 10
def is_allowed_to_send_acknowledge_reminder(alert_group_id, process_id):
    """
    Try to take the per-alert-group acknowledge reminder lock in the cache.

    The lock key is derived from ``alert_group_id`` and stores ``process_id``
    for 10 minutes. Returns the result of ``cache.add``: truthy when this call
    created the key, False when the key was already present.
    """
    ten_minutes = 10 * 60  # lock lifetime, in seconds
    cache_key = f"acknowledge-reminder-lock-{alert_group_id}"
    # cache.add stores the value only when the key is absent and reports whether it did
    return cache.add(cache_key, process_id, ten_minutes)
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str) -> None:
    """
    Record an acknowledge reminder for an alert group and schedule the follow-up task.

    The alert group is loaded with ``select_for_update`` inside a transaction.
    The task returns early, doing nothing further, when:

    - the alert group does not exist;
    - ``unacknowledge_process_id`` differs from
      ``alert_group.last_unique_unacknowledge_process_id``;
    - the cache lock for this alert group is already taken
      (see ``is_allowed_to_send_acknowledge_reminder``);
    - the alert group is not a root alert group in ``AlertGroup.ACKNOWLEDGED``
      status with ``acknowledged_by == AlertGroup.USER``, or the organization's
      reminder timeout is falsy, or the organization has ``deleted_at`` set.

    Otherwise it resets ``acknowledged_by_confirmed`` to ``None``, schedules either
    ``unacknowledge_timeout_task`` (when the organization's unacknowledge timeout
    is truthy) or the next ``acknowledge_reminder_task``, creates a
    ``TYPE_ACK_REMINDER_TRIGGERED`` log record and registers
    ``send_post_ack_reminder_message_signal`` to be queued on commit.

    Args:
        alert_group_pk: primary key of the ``AlertGroup`` to process.
        unacknowledge_process_id: id compared against the one stored on the alert group.
    """
    # Models are imported inside the task body rather than at module level
    from apps.alerts.models import AlertGroup, AlertGroupLogRecord
    from apps.user_management.models import Organization

    with transaction.atomic():
        try:
            alert_group = AlertGroup.objects.select_for_update().get(pk=alert_group_pk)  # Lock alert_group
        except AlertGroup.DoesNotExist:
            task_logger.warning(f"AlertGroup {alert_group_pk} does not exist")
            return

        # Skip when the id no longer matches the one stored on the alert group
        # (presumably a newer acknowledgement superseded this task - confirm)
        if unacknowledge_process_id != alert_group.last_unique_unacknowledge_process_id:
            return

        # Don't proceed if acknowledge reminder for this alert group has already been sent recently
        # NOTE(review): the lock is taken before the state checks below and is not released if a
        # later step raises, so an autoretry within the lock period would exit here - confirm intended
        if not is_allowed_to_send_acknowledge_reminder(alert_group.id, unacknowledge_process_id):
            task_logger.info(f"Acknowledge reminder for alert_group {alert_group_pk} has already been sent recently.")
            return

        organization = alert_group.channel.organization

        # Get timeout values
        # (a falsy value is treated below as "option disabled")
        acknowledge_reminder_timeout = Organization.ACKNOWLEDGE_REMIND_DELAY[organization.acknowledge_remind_timeout]
        unacknowledge_timeout = Organization.UNACKNOWLEDGE_TIMEOUT_DELAY[organization.unacknowledge_timeout]

        # Don't proceed if the alert group is not in a state for acknowledgement reminder
        acknowledge_reminder_required = (
            alert_group.is_root_alert_group
            and alert_group.status == AlertGroup.ACKNOWLEDGED
            and alert_group.acknowledged_by == AlertGroup.USER
            and acknowledge_reminder_timeout
        )
        is_organization_deleted = organization.deleted_at is not None
        # Shared suffix for the log messages below
        log_info = (
            f"acknowledge_reminder_timeout option: {acknowledge_reminder_timeout},"
            f"organization ppk: {organization.public_primary_key},"
            f"organization is deleted: {is_organization_deleted}"
        )
        if not acknowledge_reminder_required or is_organization_deleted:
            task_logger.info(f"alert group {alert_group_pk} is not in a state for acknowledgement reminder. {log_info}")
            return

        task_logger.info(f"alert group {alert_group_pk} is in a state for acknowledgement reminder. {log_info}")

        # unacknowledge_timeout_task uses acknowledged_by_confirmed to check if acknowledgement reminder has been confirmed
        # by the user. Setting to None here to indicate that the user has not confirmed the acknowledgement reminder
        if alert_group.acknowledged_by_confirmed is not None:
            alert_group.acknowledged_by_confirmed = None
            alert_group.save(update_fields=["acknowledged_by_confirmed"])

        if unacknowledge_timeout:  # "unack in N minutes if no response" is enabled
            unacknowledge_timeout_task.apply_async(
                (alert_group.pk, unacknowledge_process_id), countdown=unacknowledge_timeout
            )
        else:
            # Stop the reminder chain for alert groups older than the configured expiry;
            # note that this path returns before a log record is created
            if alert_group.started_at < timezone.now() - timedelta(days=settings.ACKNOWLEDGE_REMINDER_TASK_EXPIRY_DAYS):
                task_logger.info(
                    f"alert group {alert_group_pk} not renewing acknowledgement reminder, started_at is too old. {log_info}"
                )
                return
            # Schedule the next reminder after the reminder timeout
            acknowledge_reminder_task.apply_async(
                (alert_group.pk, unacknowledge_process_id), countdown=acknowledge_reminder_timeout
            )

        # NOTE(review): this atomic block is nested in the outer transaction, so the on_commit
        # callback fires only once the outer block commits - confirm the nesting is intended
        with transaction.atomic():
            log_record = alert_group.log_records.create(
                type=AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED, author=alert_group.acknowledged_by_user
            )
            task_logger.info(f"created log record {log_record.pk}, sending signal...")
            # Queue the signal task only after the log record is committed
            transaction.on_commit(partial(send_post_ack_reminder_message_signal.delay, log_record.pk))
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
def unacknowledge_timeout_task(alert_group_pk: int, unacknowledge_process_id: str) -> None:
    """
    Unacknowledge an alert group whose acknowledge reminder was not confirmed.

    The alert group is loaded with ``select_for_update`` inside a transaction.
    The task returns early when:

    - the alert group does not exist;
    - ``unacknowledge_process_id`` differs from
      ``alert_group.last_unique_unacknowledge_process_id``;
    - the alert group is not a root alert group in ``AlertGroup.ACKNOWLEDGED``
      status with ``acknowledged_by == AlertGroup.USER``, or either organization
      timeout is falsy, or the organization has ``deleted_at`` set.

    When ``acknowledged_by_confirmed`` is truthy, ``acknowledge_reminder_task`` is
    rescheduled and nothing else happens. Otherwise a ``TYPE_AUTO_UN_ACK`` log
    record is created, ``send_alert_group_signal`` is registered to be queued on
    commit, and ``alert_group.unacknowledge()`` followed by
    ``alert_group.start_escalation_if_needed()`` is called.

    Args:
        alert_group_pk: primary key of the ``AlertGroup`` to process.
        unacknowledge_process_id: id compared against the one stored on the alert group.
    """
    # Models are imported inside the task body rather than at module level
    from apps.alerts.models import AlertGroup, AlertGroupLogRecord
    from apps.user_management.models import Organization

    with transaction.atomic():
        try:
            alert_group = AlertGroup.objects.select_for_update().get(pk=alert_group_pk)  # Lock alert_group
        except AlertGroup.DoesNotExist:
            task_logger.warning(f"AlertGroup {alert_group_pk} does not exist")
            return

        # Skip when the id no longer matches the one stored on the alert group
        # (presumably a newer acknowledgement superseded this task - confirm)
        if unacknowledge_process_id != alert_group.last_unique_unacknowledge_process_id:
            return

        organization = alert_group.channel.organization

        # Get timeout values
        # (a falsy value is treated below as "option disabled")
        acknowledge_reminder_timeout = Organization.ACKNOWLEDGE_REMIND_DELAY[organization.acknowledge_remind_timeout]
        unacknowledge_timeout = Organization.UNACKNOWLEDGE_TIMEOUT_DELAY[organization.unacknowledge_timeout]

        # Don't proceed if the alert group is not in a state for auto-unacknowledge
        unacknowledge_required = (
            alert_group.is_root_alert_group
            and alert_group.status == AlertGroup.ACKNOWLEDGED
            and alert_group.acknowledged_by == AlertGroup.USER
            and acknowledge_reminder_timeout
            and unacknowledge_timeout
        )
        is_organization_deleted = organization.deleted_at is not None
        # Shared suffix for the log messages below
        log_info = (
            f"acknowledge_reminder_timeout option: {acknowledge_reminder_timeout},"
            f"unacknowledge_timeout option: {unacknowledge_timeout},"
            f"organization ppk: {organization.public_primary_key},"
            f"organization is deleted: {is_organization_deleted}"
        )
        if not unacknowledge_required or is_organization_deleted:
            task_logger.info(f"alert group {alert_group_pk} is not in a state for unacknowledge by timeout. {log_info}")
            return

        if alert_group.acknowledged_by_confirmed:  # acknowledgement reminder was confirmed by the user
            # acknowledge_reminder_task schedules this task `unacknowledge_timeout` after the reminder,
            # so the next reminder is scheduled after the difference between the two timeouts
            acknowledge_reminder_task.apply_async(
                (alert_group_pk, unacknowledge_process_id), countdown=acknowledge_reminder_timeout - unacknowledge_timeout
            )
            task_logger.info(
                f"Acknowledgement reminder was confirmed by user. Rescheduling acknowledge_reminder_task..."
                f"alert group: {alert_group_pk}, {log_info}"
            )
            return

        task_logger.info(f"alert group {alert_group_pk} is in a state for unacknowledge by timeout. {log_info}")
        # If acknowledgement reminder wasn't confirmed by the user, unacknowledge the alert group and start escalation again
        log_record = alert_group.log_records.create(
            type=AlertGroupLogRecord.TYPE_AUTO_UN_ACK, author=alert_group.acknowledged_by_user
        )
        # Queue the signal task only after the surrounding transaction commits
        transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
        alert_group.unacknowledge()
        alert_group.start_escalation_if_needed()
@shared_dedicated_queue_retry_task(
    autoretry_for=(Exception,),
    retry_backoff=True,
    max_retries=MAX_RETRIES,
)
def send_post_ack_reminder_message_signal(log_record_id):
    """
    Emit ``post_ack_reminder_message_signal`` for the given log record id so that
    the acknowledge reminder message is posted to the Slack thread.
    The signal is connected to AlertGroupSlackRepresentative.
    """
    task_logger.info(f"sending signal for posting ack reminder message, log record {log_record_id}")
    # This task itself is the signal sender; receivers get the log record id as `log_record`
    signal_kwargs = {
        "sender": send_post_ack_reminder_message_signal,
        "log_record": log_record_id,
    }
    post_ack_reminder_message_signal.send(**signal_kwargs)