Refactor maintenance (#1340)

# What this PR does
This PR simplifies the maintenance mode code:
1. Perform the distribution/escalation maintenance checks in the send_signal... tasks (`send_alert_create_signal`, `send_update_log_report_signal`; see the sketch after this list).
2. Use the usual alert distribution flow for the maintenance incident.
3. Decouple maintenance mode from Slack (everything except the **notify_about_maintenance_action** methods; I don't want to make this PR too big).
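For reference, the heart of change 1 is that the maintenance gate now sits in front of the signal dispatch instead of inside each ChatOps representative. A minimal sketch of the pattern, condensed from the `send_alert_create_signal` change in this diff (the Celery task decorator, retry settings, and logging are omitted):

```python
from django.apps import apps

from apps.alerts.signals import alert_create_signal


def send_alert_create_signal(alert_id):
    # Models are resolved lazily, as the task code does, to avoid circular imports.
    Alert = apps.get_model("alerts", "Alert")
    AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel")

    alert = Alert.objects.get(pk=alert_id)
    # Maintenance can be set on the integration or on the whole organization;
    # either one suppresses the signal, and with it every ChatOps representative.
    is_on_maintenance_mode = (
        alert.group.channel.maintenance_mode == AlertReceiveChannel.MAINTENANCE
        or alert.group.channel.organization.maintenance_mode == AlertReceiveChannel.MAINTENANCE
    )
    if not is_on_maintenance_mode:
        alert_create_signal.send(sender=send_alert_create_signal, alert=alert_id)
```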

As a bonus, maintenance mode now mutes alert group delivery in all ChatOps integrations, not only in Slack. (Before, incidents that happened during maintenance were still posted to Telegram and MS Teams.) The sketch below shows why a single check covers every integration.
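This works because every ChatOps representative subscribes to the same Django signal, so one guard in front of `send()` silences all of them at once. A hypothetical sketch of that fan-out (the receiver names are illustrative, not the actual OnCall handlers):

```python
from django.dispatch import Signal, receiver

# Fired by send_alert_create_signal only when the integration is not on maintenance.
alert_create_signal = Signal()


@receiver(alert_create_signal)
def on_create_alert_slack(sender, alert, **kwargs):
    ...  # render and post the alert group message to Slack


@receiver(alert_create_signal)
def on_create_alert_telegram(sender, alert, **kwargs):
    ...  # post the alert group to Telegram


# During maintenance the signal is never sent, so none of the receivers fire;
# there is no per-backend maintenance check left to forget.
```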

## Checklist

- [ ] Tests updated
- [ ] Documentation added
- [ ] `CHANGELOG.md` updated
Commit 26a2bd9c91 (parent d99f6920c5), authored by Innokentii Konstantinov on 2023-02-23 08:13:03 +01:00, committed by GitHub.
15 changed files with 160 additions and 154 deletions

View file

@@ -248,6 +248,11 @@ class EscalationSnapshotMixin:
         """
         AlertGroup = apps.get_model("alerts", "AlertGroup")
+        is_on_maintenace_or_debug_mode = (
+            self.channel.maintenance_mode is not None or self.channel.organization.maintenance_mode is not None
+        )
+        if is_on_maintenace_or_debug_mode:
+            return
         if self.pause_escalation:
             return

View file

@@ -169,7 +169,7 @@ class AlertGroupSlackRenderer(AlertGroupBaseRenderer):
     def _get_buttons_blocks(self):
         AlertGroup = apps.get_model("alerts", "AlertGroup")
         buttons = []
-        if self.alert_group.maintenance_uuid is None:
+        if not self.alert_group.is_maintenance_incident:
             if not self.alert_group.resolved:
                 if not self.alert_group.acknowledged:
                     buttons.append(

View file

@@ -127,29 +127,31 @@ class Alert(models.Model):
         alert.save()

-        maintenance_uuid = None
-        if alert_receive_channel.organization.maintenance_mode == AlertReceiveChannel.MAINTENANCE:
-            maintenance_uuid = alert_receive_channel.organization.maintenance_uuid
-        elif alert_receive_channel.maintenance_mode == AlertReceiveChannel.MAINTENANCE:
-            maintenance_uuid = alert_receive_channel.maintenance_uuid
+        if group_created:
+            # all code below related to maintenance mode
+            maintenance_uuid = None
+            if alert_receive_channel.organization.maintenance_mode == AlertReceiveChannel.MAINTENANCE:
+                maintenance_uuid = alert_receive_channel.organization.maintenance_uuid
+            elif alert_receive_channel.maintenance_mode == AlertReceiveChannel.MAINTENANCE:
+                maintenance_uuid = alert_receive_channel.maintenance_uuid

-        if maintenance_uuid is not None:
-            try:
-                maintenance_incident = AlertGroup.all_objects.get(maintenance_uuid=maintenance_uuid)
-                group.root_alert_group = maintenance_incident
-                group.save(update_fields=["root_alert_group"])
-                log_record_for_root_incident = maintenance_incident.log_records.create(
-                    type=AlertGroupLogRecord.TYPE_ATTACHED, dependent_alert_group=group, reason="Attach dropdown"
-                )
-                logger.debug(
-                    f"call send_alert_group_signal for alert_group {maintenance_incident.pk} (maintenance), "
-                    f"log record {log_record_for_root_incident.pk} with type "
-                    f"'{log_record_for_root_incident.get_type_display()}'"
-                )
-                send_alert_group_signal.apply_async((log_record_for_root_incident.pk,))
-            except AlertGroup.DoesNotExist:
-                pass
+            if maintenance_uuid is not None:
+                try:
+                    maintenance_incident = AlertGroup.all_objects.get(maintenance_uuid=maintenance_uuid)
+                    group.root_alert_group = maintenance_incident
+                    group.save(update_fields=["root_alert_group"])
+                    log_record_for_root_incident = maintenance_incident.log_records.create(
+                        type=AlertGroupLogRecord.TYPE_ATTACHED, dependent_alert_group=group, reason="Attach dropdown"
+                    )
+                    logger.debug(
+                        f"call send_alert_group_signal for alert_group {maintenance_incident.pk} (maintenance), "
+                        f"log record {log_record_for_root_incident.pk} with type "
+                        f"'{log_record_for_root_incident.get_type_display()}'"
+                    )
+                    send_alert_group_signal.apply_async((log_record_for_root_incident.pk,))
+                except AlertGroup.DoesNotExist:
+                    pass

         return alert
@@ -264,7 +266,7 @@ def listen_for_alert_model_save(sender, instance, created, *args, **kwargs):
     """
     Here we invoke AlertShootingStep by model saving action.
     """
-    if created and instance.group.maintenance_uuid is None:
+    if created:
         # RFCT - why additinal save ?
         instance.save()

View file

@@ -557,10 +557,9 @@ class AlertGroupLogRecord(models.Model):
 @receiver(post_save, sender=AlertGroupLogRecord)
 def listen_for_alertgrouplogrecord(sender, instance, created, *args, **kwargs):
     if instance.type != AlertGroupLogRecord.TYPE_DELETED:
-        if not instance.alert_group.is_maintenance_incident:
-            alert_group_pk = instance.alert_group.pk
-            logger.debug(
-                f"send_update_log_report_signal for alert_group {alert_group_pk}, "
-                f"alert group event: {instance.get_type_display()}"
-            )
-            send_update_log_report_signal.apply_async(kwargs={"alert_group_pk": alert_group_pk}, countdown=8)
+        alert_group_pk = instance.alert_group.pk
+        logger.debug(
+            f"send_update_log_report_signal for alert_group {alert_group_pk}, "
+            f"alert group event: {instance.get_type_display()}"
+        )
+        send_update_log_report_signal.apply_async(kwargs={"alert_group_pk": alert_group_pk}, countdown=8)

View file

@@ -510,6 +510,8 @@ class AlertReceiveChannel(IntegrationOptionsMixin, MaintainableObject):
         disable_maintenance(alert_receive_channel_id=self.pk, force=True, user_id=user.pk)

     def notify_about_maintenance_action(self, text, send_to_general_log_channel=True):
+        # TODO: this method should be refactored.
+        # It's binded to slack and sending maintenance notification only there.
         channel_ids = list(
             self.channel_filters.filter(slack_channel_id__isnull=False, notify_in_slack=False).values_list(
                 "slack_channel_id", flat=True
View file

@@ -6,7 +6,6 @@ from django.apps import apps
 from django.db import models, transaction
 from django.utils import timezone

-from apps.slack.scenarios.scenario_step import ScenarioStep
 from common.exceptions import MaintenanceCouldNotBeStartedError
 from common.insight_log import MaintenanceEvent, write_maintenance_insight_log
@@ -67,17 +66,6 @@ class MaintainableObject(models.Model):
     def notify_about_maintenance_action(self, text, send_to_general_log_channel=True):
         raise NotImplementedError

-    def send_maintenance_incident(self, organization, group, alert):
-        slack_team_identity = organization.slack_team_identity
-        if slack_team_identity is not None:
-            channel_id = organization.general_log_channel_id
-            attachments = group.render_slack_attachments()
-            blocks = group.render_slack_blocks()
-            AlertShootingStep = ScenarioStep.get_step("distribute_alerts", "AlertShootingStep")
-            AlertShootingStep(slack_team_identity, organization).publish_slack_messages(
-                slack_team_identity, group, alert, attachments, channel_id, blocks
-            )
-
     def start_maintenance(self, mode, maintenance_duration, user):
         AlertGroup = apps.get_model("alerts", "AlertGroup")
         AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel")
@@ -142,6 +130,7 @@ class MaintainableObject(models.Model):
             f" During this time all alerts from integration will be collected here without escalations"
         )
         alert = Alert(
+            is_the_first_alert_in_group=True,
             is_resolve_signal=False,
             title=title,
             message=message,
@@ -154,7 +143,6 @@ class MaintainableObject(models.Model):
         alert.save()
         write_maintenance_insight_log(self, user, MaintenanceEvent.STARTED)
         if mode == AlertReceiveChannel.MAINTENANCE:
-            self.send_maintenance_incident(organization, group, alert)
             self.notify_about_maintenance_action(
                 f"Maintenance of {verbal}. Initiated by {user_verbal} for {duration_verbal}.",
                 send_to_general_log_channel=False,

View file

@@ -19,14 +19,12 @@ def distribute_alert(alert_id):
     AlertGroup = apps.get_model("alerts", "AlertGroup")
     alert = Alert.objects.get(pk=alert_id)
     task_logger.debug(f"Start distribute_alert for alert {alert_id} from alert_group {alert.group_id}")
     send_alert_create_signal.apply_async((alert_id,))
-
-    alert_group = AlertGroup.all_objects.filter(pk=alert.group_id).get()
-
     # If it's the first alert, let's launch the escalation!
     if alert.is_the_first_alert_in_group:
+        alert_group = AlertGroup.all_objects.filter(pk=alert.group_id).get()
         alert_group.start_escalation_if_needed(countdown=TASK_DELAY_SECONDS)

     updated_rows = Alert.objects.filter(pk=alert_id, delivered=False).update(delivered=True)
@@ -39,12 +37,21 @@ def distribute_alert(alert_id):
 @shared_dedicated_queue_retry_task(
-    autoretry_for=(Exception,), retry_backoff=True, max_retries=0 if settings.DEBUG else None
+    autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
 )
 def send_alert_create_signal(alert_id):
+    Alert = apps.get_model("alerts", "Alert")
+    AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel")
     task_logger.debug(f"Started send_alert_create_signal task for alert {alert_id}")
-    alert_create_signal.send(
-        sender=send_alert_create_signal,
-        alert=alert_id,
-    )
+    alert = Alert.objects.get(pk=alert_id)
+    is_on_maintenace_mode = (
+        alert.group.channel.maintenance_mode == AlertReceiveChannel.MAINTENANCE
+        or alert.group.channel.organization.maintenance_mode == AlertReceiveChannel.MAINTENANCE
+    )
+    if not is_on_maintenace_mode:
+        alert_create_signal.send(
+            sender=send_alert_create_signal,
+            alert=alert_id,
+        )
     task_logger.debug(f"Finished send_alert_create_signal task for alert {alert_id} ")

View file

@@ -28,14 +28,6 @@ def escalate_alert_group(alert_group_pk):
     except IndexError:
         return f"Alert group with pk {alert_group_pk} doesn't exist"

-    if (
-        alert_group.channel.maintenance_mode is not None
-        or alert_group.channel.organization.maintenance_mode is not None
-    ):
-        task_logger.info(f"alert_group {alert_group.pk} organization or alert_receive_channel on maintenance.")
-        alert_group.stop_escalation()
-        return
-
     if not compare_escalations(escalate_alert_group.request.id, alert_group.active_escalation_id):
         return "Active escalation ID mismatch. Duplication or non-active escalation triggered. Active: {}".format(
             alert_group.active_escalation_id

View file

@@ -4,21 +4,34 @@ from django.conf import settings
 from apps.alerts.signals import alert_group_update_log_report_signal
 from common.custom_celery_tasks import shared_dedicated_queue_retry_task

+from .task_logger import task_logger
+

 @shared_dedicated_queue_retry_task(
     autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
 )
 def send_update_log_report_signal(log_record_pk=None, alert_group_pk=None):
     AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
+    AlertGroup = apps.get_model("alerts", "AlertGroup")
+    AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel")

     if log_record_pk and not alert_group_pk:  # legacy
         log_record = AlertGroupLogRecord.objects.get(pk=log_record_pk)
         if log_record.type == AlertGroupLogRecord.TYPE_DELETED:
             return
         alert_group_pk = log_record.alert_group.pk

     if alert_group_pk is not None:
-        alert_group_update_log_report_signal.send(
-            sender=send_update_log_report_signal,
-            alert_group=alert_group_pk,
-        )
+        alert_group = AlertGroup.all_objects.get(id=alert_group_pk)
+        if alert_group.is_maintenance_incident:
+            task_logger.debug(
+                f'send_update_log_report_signal: alert_group={alert_group_pk} msg="skip alert_group_update_log_report_signal, alert group is maintenance incident"'
+            )
+            return
+        is_on_maintenace_mode = (
+            alert_group.channel.maintenance_mode == AlertReceiveChannel.MAINTENANCE
+            or alert_group.channel.organization.maintenance_mode == AlertReceiveChannel.MAINTENANCE
+        )
+        if is_on_maintenace_mode:
+            task_logger.debug(
+                f'send_update_log_report_signal: alert_group={alert_group_pk} msg="skip alert_group_update_log_report_signal due to maintenace"'
+            )
+            return
+        alert_group_update_log_report_signal.send(
+            sender=send_update_log_report_signal,
+            alert_group=alert_group_pk,
+        )

View file

@@ -61,8 +61,8 @@ def make_resolved_ack_new_silenced_alert_groups(make_alert_group, make_alert_rec
 @pytest.fixture()
-def mock_alert_shooting_step_publish_slack_messages(monkeypatch):
-    def mock_publish_slack_messages(*args, **kwargs):
+def mock_alert_shooting_step_post_alert_group_to_slack(monkeypatch):
+    def mock_post_alert_group_to_slack(*args, **kwargs):
         return None

-    monkeypatch.setattr(AlertShootingStep, "publish_slack_messages", mock_publish_slack_messages)
+    monkeypatch.setattr(AlertShootingStep, "_post_alert_group_to_slack", mock_post_alert_group_to_slack)

View file

@@ -126,7 +126,7 @@ def test_integration_filter_by_maintenance(
     alert_receive_channel_internal_api_setup,
     make_user_auth_headers,
     mock_start_disable_maintenance_task,
-    mock_alert_shooting_step_publish_slack_messages,
+    mock_alert_shooting_step_post_alert_group_to_slack,
 ):
     user, token, alert_receive_channel = alert_receive_channel_internal_api_setup
     client = APIClient()
@@ -149,7 +149,7 @@ def test_integration_filter_by_debug(
     alert_receive_channel_internal_api_setup,
     make_user_auth_headers,
     mock_start_disable_maintenance_task,
-    mock_alert_shooting_step_publish_slack_messages,
+    mock_alert_shooting_step_post_alert_group_to_slack,
 ):
     user, token, alert_receive_channel = alert_receive_channel_internal_api_setup
     client = APIClient()

View file

@@ -21,7 +21,6 @@ def on_create_alert_slack_representative_async(alert_pk):
     It's asynced in order to prevent Slack downtime causing issues with SMS and other destinations.
     """
     Alert = apps.get_model("alerts", "Alert")
-    AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel")
     alert = (
         Alert.objects.filter(pk=alert_pk)
@@ -35,17 +34,6 @@ def on_create_alert_slack_representative_async(alert_pk):
     )
     logger.debug(f"Start on_create_alert_slack_representative for alert {alert_pk} from alert_group {alert.group_id}")

-    # don't need to publish in slack maintenance alert
-    # it was published earlier
-    if alert.group.maintenance_uuid is not None:
-        return
-
-    # don't need to publish alerts in slack while integration on maintenance
-    if (
-        alert.group.channel.maintenance_mode == AlertReceiveChannel.MAINTENANCE
-        or alert.group.channel.organization.maintenance_mode == AlertReceiveChannel.MAINTENANCE is not None
-    ):
-        return
-
     organization = alert.group.channel.organization
     if organization.slack_team_identity:
         logger.debug(

View file

@@ -42,7 +42,72 @@ logger.setLevel(logging.DEBUG)
 class AlertShootingStep(scenario_step.ScenarioStep):
-    def publish_slack_messages(self, slack_team_identity, alert_group, alert, attachments, channel_id, blocks):
+    def process_signal(self, alert):
+        # do not try to post alert group message to slack if its channel is rate limited
+        if alert.group.channel.is_rate_limited_in_slack:
+            logger.info("Skip posting or updating alert_group in Slack due to rate limit")
+            AlertGroup.all_objects.filter(
+                pk=alert.group.pk,
+                slack_message_sent=False,
+            ).update(slack_message_sent=True, reason_to_skip_escalation=AlertGroup.RATE_LIMITED)
+            return
+
+        num_updated_rows = AlertGroup.all_objects.filter(pk=alert.group.pk, slack_message_sent=False).update(
+            slack_message_sent=True
+        )
+
+        if num_updated_rows == 1:
+            try:
+                channel_id = alert.group.channel_filter.slack_channel_id_or_general_log_id
+                self._send_first_alert(alert, channel_id)
+            except SlackAPIException as e:
+                AlertGroup.all_objects.filter(pk=alert.group.pk).update(slack_message_sent=False)
+                raise e
+
+            is_on_debug_mode = (
+                alert.group.channel.maintenance_mode == AlertReceiveChannel.DEBUG_MAINTENANCE
+                or alert.group.channel.organization.maintenance_mode == AlertReceiveChannel.DEBUG_MAINTENANCE
+            )
+            if is_on_debug_mode:
+                self._send_debug_mode_notice(alert.group, channel_id)
+
+            if alert.group.is_maintenance_incident:
+                # not sending log report message for maintenance incident
+                pass
+            else:
+                # check if alert group was posted to slack before posting message to thread
+                if not alert.group.skip_escalation_in_slack:
+                    self._send_log_report_message(alert.group, channel_id)
+                    self._send_message_to_thread_if_bot_not_in_channel(alert.group, channel_id)
+        else:
+            # check if alert group was posted to slack before updating its message
+            if not alert.group.skip_escalation_in_slack:
+                update_task_id = update_incident_slack_message.apply_async(
+                    (self.slack_team_identity.pk, alert.group.pk),
+                    countdown=10,
+                )
+                cache.set(
+                    get_cache_key_update_incident_slack_message(alert.group.pk),
+                    update_task_id,
+                    timeout=CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME,
+                )
+            else:
+                logger.info("Skip updating alert_group in Slack due to rate limit")
+
+    def _send_first_alert(self, alert, channel_id):
+        attachments = alert.group.render_slack_attachments()
+        blocks = alert.group.render_slack_blocks()
+        self._post_alert_group_to_slack(
+            slack_team_identity=self.slack_team_identity,
+            alert_group=alert.group,
+            alert=alert,
+            attachments=attachments,
+            channel_id=channel_id,
+            blocks=blocks,
+        )
+
+    def _post_alert_group_to_slack(self, slack_team_identity, alert_group, alert, attachments, channel_id, blocks):
         # channel_id can be None if general log channel for slack_team_identity is not set
         if channel_id is None:
             logger.info(f"Failed to post message to Slack for alert_group {alert_group.pk} because channel_id is None")
@@ -120,65 +185,6 @@ class AlertShootingStep(scenario_step.ScenarioStep):
         finally:
             alert.save()

-    def process_signal(self, alert):
-        # do not try to post alert group message to slack if its channel is rate limited
-        if alert.group.channel.is_rate_limited_in_slack:
-            logger.info("Skip posting or updating alert_group in Slack due to rate limit")
-            AlertGroup.all_objects.filter(
-                pk=alert.group.pk,
-                slack_message_sent=False,
-            ).update(slack_message_sent=True, reason_to_skip_escalation=AlertGroup.RATE_LIMITED)
-            return
-
-        num_updated_rows = AlertGroup.all_objects.filter(pk=alert.group.pk, slack_message_sent=False).update(
-            slack_message_sent=True
-        )
-
-        if num_updated_rows == 1:
-            try:
-                channel_id = alert.group.channel_filter.slack_channel_id_or_general_log_id
-                self._send_first_alert(alert, channel_id)
-            except SlackAPIException as e:
-                AlertGroup.all_objects.filter(pk=alert.group.pk).update(slack_message_sent=False)
-                raise e
-
-            is_debug_mode = (
-                alert.group.channel.maintenance_mode is not None
-                or alert.group.channel.organization.maintenance_mode is not None
-            )
-            if is_debug_mode:
-                self._send_debug_mode_notice(alert.group, channel_id)
-            else:
-                # check if alert group was posted to slack before posting message to thread
-                if not alert.group.skip_escalation_in_slack:
-                    self._send_thread_messages(alert.group, channel_id)
-        else:
-            # check if alert group was posted to slack before updating its message
-            if not alert.group.skip_escalation_in_slack:
-                update_task_id = update_incident_slack_message.apply_async(
-                    (self.slack_team_identity.pk, alert.group.pk),
-                    countdown=10,
-                )
-                cache.set(
-                    get_cache_key_update_incident_slack_message(alert.group.pk),
-                    update_task_id,
-                    timeout=CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME,
-                )
-            else:
-                logger.info("Skip updating alert_group in Slack due to rate limit")
-
-    def _send_first_alert(self, alert, channel_id):
-        attachments = alert.group.render_slack_attachments()
-        blocks = alert.group.render_slack_blocks()
-        self.publish_slack_messages(
-            slack_team_identity=self.slack_team_identity,
-            alert_group=alert.group,
-            alert=alert,
-            attachments=attachments,
-            channel_id=channel_id,
-            blocks=blocks,
-        )
-
     def _send_debug_mode_notice(self, alert_group, channel_id):
         blocks = []
         text = "Escalations are silenced due to Debug mode"
@@ -193,11 +199,12 @@ class AlertShootingStep(scenario_step.ScenarioStep):
             blocks=blocks,
         )

-    def _send_thread_messages(self, alert_group, channel_id):
+    def _send_log_report_message(self, alert_group, channel_id):
         post_or_update_log_report_message_task.apply_async(
             (alert_group.pk, self.slack_team_identity.pk),
         )

+    def _send_message_to_thread_if_bot_not_in_channel(self, alert_group, channel_id):
         send_message_to_thread_if_bot_not_in_channel.apply_async(
             (alert_group.pk, self.slack_team_identity.pk, channel_id),
             countdown=1,  # delay for message so that the log report is published first
@@ -608,8 +615,10 @@ class ResolveGroupStep(
         if not self.check_alert_is_unarchived(slack_team_identity, payload, alert_group):
             return

-        if alert_group.maintenance_uuid is None:
+        if alert_group.is_maintenance_incident:
+            alert_group.stop_maintenance(self.user)
+        else:
             # TODO: refactor that check, it should be in alert core, not in slack.
             if self.organization.is_resolution_note_required and not alert_group.has_resolution_notes:
                 resolution_note_data = {
@@ -623,12 +632,11 @@
                 return
             alert_group.resolve_by_user(self.user, action_source=ActionSource.SLACK)
-        else:
-            alert_group.stop_maintenance(self.user)

     def process_signal(self, log_record):
         alert_group = log_record.alert_group
         # Do not rerender alert_groups which happened while maintenance.
         # They have no slack messages, since they just attached to the maintenance incident.
         if not alert_group.happened_while_maintenance:
             self.alert_group_slack_service.update_alert_group_slack_message(alert_group)

View file

@@ -25,7 +25,7 @@ def test_restricted_action_error(
     with patch.object(step._slack_client, "api_call") as mock_slack_api_call:
         mock_slack_api_call.side_effect = SlackAPIException(response={"error": "restricted_action"})

-        step.publish_slack_messages(slack_team_identity, alert_group, alert, None, "channel-id", [])
+        step._post_alert_group_to_slack(slack_team_identity, alert_group, alert, None, "channel-id", [])

     alert_group.refresh_from_db()
     alert.refresh_from_db()

View file

@@ -263,6 +263,8 @@ class Organization(MaintainableObject):
         return self.org_title

     def notify_about_maintenance_action(self, text, send_to_general_log_channel=True):
+        # TODO: this method should be refactored.
+        # It's binded to slack and sending maintenance notification only there.
         if send_to_general_log_channel:
             post_message_to_channel(self, self.general_log_channel_id, text)