# What this PR does - Adds a 10-minute lock for the acknowledge reminder task to prevent duplicate tasks, which cause multiple reminder messages to be posted and flood Slack threads. - Adds a new signal for the acknowledge reminder task instead of using `alert_group_action_triggered_signal`, since it is used only to post the reminder message in the Slack thread and does not need to be processed by other representatives ## Which issue(s) this PR closes Related to https://github.com/grafana/oncall-private/issues/2953 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes.
166 lines
7.8 KiB
Python
166 lines
7.8 KiB
Python
from datetime import timedelta
|
|
from functools import partial
|
|
|
|
from django.conf import settings
|
|
from django.core.cache import cache
|
|
from django.db import transaction
|
|
from django.utils import timezone
|
|
|
|
from apps.alerts.signals import post_ack_reminder_message_signal
|
|
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
|
|
|
|
from .send_alert_group_signal import send_alert_group_signal
|
|
from .task_logger import task_logger
|
|
|
|
# Value passed as `max_retries` to the task decorators in this module:
# 1 when settings.DEBUG is truthy, 10 otherwise.
MAX_RETRIES = 1 if settings.DEBUG else 10
|
|
|
|
|
|
def is_allowed_to_send_acknowledge_reminder(alert_group_id, process_id):
    """
    Try to take the acknowledge reminder lock for an alert group.

    Stores ``process_id`` in the cache under a key derived from
    ``alert_group_id`` with a 10 minute timeout, and returns the result of
    ``cache.add``: it is False when the key already exists, i.e. when the
    lock was taken less than 10 minutes ago.
    """
    ten_minutes = 10 * 60
    return cache.add(f"acknowledge-reminder-lock-{alert_group_id}", process_id, ten_minutes)
|
|
|
|
|
|
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str) -> None:
    """
    Trigger an acknowledgement reminder for an alert group and schedule the follow-up task.

    The task returns early, doing nothing, when:
    - the alert group does not exist;
    - ``unacknowledge_process_id`` differs from ``alert_group.last_unique_unacknowledge_process_id``;
    - the acknowledge reminder lock for this alert group is already held
      (see ``is_allowed_to_send_acknowledge_reminder``);
    - the alert group is not a root alert group acknowledged by a user, the organization's
      acknowledge reminder timeout is falsy, or the organization is deleted.

    Otherwise it resets ``acknowledged_by_confirmed`` to None, schedules either
    ``unacknowledge_timeout_task`` (when the organization's unacknowledge timeout is set) or
    itself again, creates a ``TYPE_ACK_REMINDER_TRIGGERED`` log record and, on commit, queues
    ``send_post_ack_reminder_message_signal`` with that log record's pk.

    :param alert_group_pk: primary key of the AlertGroup to process
    :param unacknowledge_process_id: process id this task was scheduled with; compared against
        ``alert_group.last_unique_unacknowledge_process_id``
    """
    # Models are imported inside the function body rather than at module level.
    from apps.alerts.models import AlertGroup, AlertGroupLogRecord
    from apps.user_management.models import Organization

    # NOTE(review): the extent of this atomic block was reconstructed from a copy of the file
    # that lost its indentation - confirm where the block ends against the original.
    with transaction.atomic():
        try:
            alert_group = AlertGroup.objects.select_for_update().get(pk=alert_group_pk)  # Lock alert_group
        except AlertGroup.DoesNotExist:
            task_logger.warning(f"AlertGroup {alert_group_pk} does not exist")
            return

        # Skip when the process id stored on the alert group is no longer the one this task was scheduled with
        if unacknowledge_process_id != alert_group.last_unique_unacknowledge_process_id:
            return

        # Don't proceed if acknowledge reminder for this alert group has already been sent recently
        # NOTE(review): the lock is taken before the state checks below and is never released in this
        # module, so an early return or a retried failure after this point still holds it until it expires.
        if not is_allowed_to_send_acknowledge_reminder(alert_group.id, unacknowledge_process_id):
            task_logger.info(f"Acknowledge reminder for alert_group {alert_group_pk} has already been sent recently.")
            return

        organization = alert_group.channel.organization

        # Get timeout values
        acknowledge_reminder_timeout = Organization.ACKNOWLEDGE_REMIND_DELAY[organization.acknowledge_remind_timeout]
        unacknowledge_timeout = Organization.UNACKNOWLEDGE_TIMEOUT_DELAY[organization.unacknowledge_timeout]

        # Don't proceed if the alert group is not in a state for acknowledgement reminder
        acknowledge_reminder_required = (
            alert_group.is_root_alert_group
            and alert_group.status == AlertGroup.ACKNOWLEDGED
            and alert_group.acknowledged_by == AlertGroup.USER
            and acknowledge_reminder_timeout
        )
        is_organization_deleted = organization.deleted_at is not None
        # Shared suffix for the log messages below
        log_info = (
            f"acknowledge_reminder_timeout option: {acknowledge_reminder_timeout},"
            f"organization ppk: {organization.public_primary_key},"
            f"organization is deleted: {is_organization_deleted}"
        )
        if not acknowledge_reminder_required or is_organization_deleted:
            task_logger.info(f"alert group {alert_group_pk} is not in a state for acknowledgement reminder. {log_info}")
            return

        task_logger.info(f"alert group {alert_group_pk} is in a state for acknowledgement reminder. {log_info}")

        # unacknowledge_timeout_task uses acknowledged_by_confirmed to check if acknowledgement reminder has been confirmed
        # by the user. Setting to None here to indicate that the user has not confirmed the acknowledgement reminder
        if alert_group.acknowledged_by_confirmed is not None:
            alert_group.acknowledged_by_confirmed = None
            alert_group.save(update_fields=["acknowledged_by_confirmed"])

    # NOTE(review): scheduling is placed after the atomic block above - confirm against the original indentation.
    if unacknowledge_timeout:  # "unack in N minutes if no response" is enabled
        unacknowledge_timeout_task.apply_async(
            (alert_group.pk, unacknowledge_process_id), countdown=unacknowledge_timeout
        )
    else:
        # No auto-unacknowledge configured: this task reschedules itself, unless the alert group
        # started more than settings.ACKNOWLEDGE_REMINDER_TASK_EXPIRY_DAYS days ago.
        if alert_group.started_at < timezone.now() - timedelta(days=settings.ACKNOWLEDGE_REMINDER_TASK_EXPIRY_DAYS):
            task_logger.info(
                f"alert group {alert_group_pk} not renewing acknowledgement reminder, started_at is too old. {log_info}"
            )
            return
        acknowledge_reminder_task.apply_async(
            (alert_group.pk, unacknowledge_process_id), countdown=acknowledge_reminder_timeout
        )

    with transaction.atomic():
        log_record = alert_group.log_records.create(
            type=AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED, author=alert_group.acknowledged_by_user
        )
        task_logger.info(f"created log record {log_record.pk}, sending signal...")
        # Queue the signal task only once the transaction has committed
        transaction.on_commit(partial(send_post_ack_reminder_message_signal.delay, log_record.pk))
|
|
|
|
|
|
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
def unacknowledge_timeout_task(alert_group_pk: int, unacknowledge_process_id: str) -> None:
    """
    Unacknowledge an alert group whose acknowledgement reminder was not confirmed by the user.

    The task returns early, doing nothing, when:
    - the alert group does not exist;
    - ``unacknowledge_process_id`` differs from ``alert_group.last_unique_unacknowledge_process_id``;
    - the alert group is not a root alert group acknowledged by a user, either of the
      organization's acknowledge reminder / unacknowledge timeouts is falsy, or the
      organization is deleted.

    If ``alert_group.acknowledged_by_confirmed`` is truthy, ``acknowledge_reminder_task`` is
    rescheduled and nothing else happens. Otherwise a ``TYPE_AUTO_UN_ACK`` log record is created,
    ``send_alert_group_signal`` is queued on commit with that log record's pk, and the alert group
    is unacknowledged and its escalation is started again if needed.

    :param alert_group_pk: primary key of the AlertGroup to process
    :param unacknowledge_process_id: process id this task was scheduled with; compared against
        ``alert_group.last_unique_unacknowledge_process_id``
    """
    # Models are imported inside the function body rather than at module level.
    from apps.alerts.models import AlertGroup, AlertGroupLogRecord
    from apps.user_management.models import Organization

    # NOTE(review): the extent of this atomic block was reconstructed from a copy of the file
    # that lost its indentation - confirm that the whole body belongs inside it.
    with transaction.atomic():
        try:
            alert_group = AlertGroup.objects.select_for_update().get(pk=alert_group_pk)  # Lock alert_group
        except AlertGroup.DoesNotExist:
            task_logger.warning(f"AlertGroup {alert_group_pk} does not exist")
            return

        # Skip when the process id stored on the alert group is no longer the one this task was scheduled with
        if unacknowledge_process_id != alert_group.last_unique_unacknowledge_process_id:
            return

        organization = alert_group.channel.organization

        # Get timeout values
        acknowledge_reminder_timeout = Organization.ACKNOWLEDGE_REMIND_DELAY[organization.acknowledge_remind_timeout]
        unacknowledge_timeout = Organization.UNACKNOWLEDGE_TIMEOUT_DELAY[organization.unacknowledge_timeout]

        # Don't proceed if the alert group is not in a state for auto-unacknowledge
        unacknowledge_required = (
            alert_group.is_root_alert_group
            and alert_group.status == AlertGroup.ACKNOWLEDGED
            and alert_group.acknowledged_by == AlertGroup.USER
            and acknowledge_reminder_timeout
            and unacknowledge_timeout
        )
        is_organization_deleted = organization.deleted_at is not None
        # Shared suffix for the log messages below
        log_info = (
            f"acknowledge_reminder_timeout option: {acknowledge_reminder_timeout},"
            f"unacknowledge_timeout option: {unacknowledge_timeout},"
            f"organization ppk: {organization.public_primary_key},"
            f"organization is deleted: {is_organization_deleted}"
        )
        if not unacknowledge_required or is_organization_deleted:
            task_logger.info(f"alert group {alert_group_pk} is not in a state for unacknowledge by timeout. {log_info}")
            return

        if alert_group.acknowledged_by_confirmed:  # acknowledgement reminder was confirmed by the user
            # NOTE(review): the countdown is negative whenever unacknowledge_timeout exceeds
            # acknowledge_reminder_timeout - confirm the configured options rule that out.
            acknowledge_reminder_task.apply_async(
                (alert_group_pk, unacknowledge_process_id), countdown=acknowledge_reminder_timeout - unacknowledge_timeout
            )
            task_logger.info(
                f"Acknowledgement reminder was confirmed by user. Rescheduling acknowledge_reminder_task..."
                f"alert group: {alert_group_pk}, {log_info}"
            )
            return

        task_logger.info(f"alert group {alert_group_pk} is in a state for unacknowledge by timeout. {log_info}")
        # If acknowledgement reminder wasn't confirmed by the user, unacknowledge the alert group and start escalation again
        log_record = alert_group.log_records.create(
            type=AlertGroupLogRecord.TYPE_AUTO_UN_ACK, author=alert_group.acknowledged_by_user
        )
        # Queue the signal task only once the transaction has committed
        transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk))
        alert_group.unacknowledge()
        alert_group.start_escalation_if_needed()
|
|
|
|
|
|
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
def send_post_ack_reminder_message_signal(log_record_id):
    """
    Emit ``post_ack_reminder_message_signal`` for the given log record id.

    The signal asks for the acknowledge reminder message to be posted to the Slack thread;
    it is connected to AlertGroupSlackRepresentative. This task function itself is passed
    as the signal sender.
    """
    log_message = f"sending signal for posting ack reminder message, log record {log_record_id}"
    task_logger.info(log_message)
    post_ack_reminder_message_signal.send(
        sender=send_post_ack_reminder_message_signal,
        log_record=log_record_id,
    )
|