def _escalation_step_declare_incident(self, alert_group: "AlertGroup", _reason: str) -> None:
    """Queue the declare-incident task, or log a failure when the step is disabled.

    When ``is_declare_incident_step_enabled`` returns a truthy value for the
    alert group's organization, a single immutable ``declare_incident``
    signature is handed to ``_execute_tasks``. Otherwise an
    ``ESCALATION_FAILED`` log record is saved and nothing is queued.
    """
    if is_declare_incident_step_enabled(organization=alert_group.channel.organization):
        task_signature = declare_incident.signature(
            args=(alert_group.pk,),
            kwargs={
                "escalation_policy_pk": self.id,
                "severity": self.severity,
            },
            immutable=True,
        )
        self._execute_tasks([task_signature])
        return

    # step is switched off for this organization: leave a trace in the alert group timeline
    failure_record = AlertGroupLogRecord(
        type=AlertGroupLogRecord.TYPE_ESCALATION_FAILED,
        alert_group=alert_group,
        reason="Declare Incident step is not enabled",
        escalation_policy=self.escalation_policy,
        escalation_error_code=AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED,
        escalation_policy_step=self.step,
    )
    failure_record.save()
a/engine/apps/alerts/models/__init__.py b/engine/apps/alerts/models/__init__.py index 51b44158..24ea0905 100644 --- a/engine/apps/alerts/models/__init__.py +++ b/engine/apps/alerts/models/__init__.py @@ -13,6 +13,7 @@ from .escalation_policy import EscalationPolicy # noqa: F401 from .grafana_alerting_contact_point import GrafanaAlertingContactPoint # noqa: F401 from .invitation import Invitation # noqa: F401 from .maintainable_object import MaintainableObject # noqa: F401 +from .related_incident import RelatedIncident # noqa: F401 from .resolution_note import ResolutionNote, ResolutionNoteSlackMessage # noqa: F401 from .user_has_notification import UserHasNotification # noqa: F401 from .user_notification_bundle import BundledNotification, UserNotificationBundle # noqa: F401 diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index 81ac41b9..6a9062bd 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -44,6 +44,7 @@ if typing.TYPE_CHECKING: AlertGroupLogRecord, AlertReceiveChannel, BundledNotification, + RelatedIncident, ResolutionNote, ResolutionNoteSlackMessage, ) @@ -193,6 +194,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
class RelatedIncidentData(typing.TypedDict):
    """Incident link and title rendered from a log record's step-specific info."""

    # URL of the incident; None when the stored incident id is empty
    incident_link: typing.Optional[str]
    # stored incident title, or the default backup title when the stored one is empty
    incident_title: str
models.IntegerField(null=True, default=None) step_specific_info = JSONField(null=True, default=None) - STEP_SPECIFIC_INFO_KEYS = ["schedule_name", "custom_button_name", "usergroup_handle", "source_integration_name"] + STEP_SPECIFIC_INFO_KEYS = [ + "schedule_name", + "custom_button_name", + "usergroup_handle", + "source_integration_name", + "incident_id", + "incident_title", + ] def _make_log_line_link(self, url, title, html=False, for_slack=False, substitute_with_tag=False): if html and url: @@ -244,6 +259,7 @@ class AlertGroupLogRecord(models.Model): author = self.author.short(organization) if self.author is not None else None escalation_chain = self.alert_group.channel_filter.escalation_chain if self.alert_group.channel_filter else None step_info = self.get_step_specific_info() + related_incident = self.render_incident_data_from_step_info(organization, step_info) escalation_chain_data = ( { "pk": escalation_chain.public_primary_key, @@ -280,6 +296,7 @@ class AlertGroupLogRecord(models.Model): "type": self.type, "created_at": created_at, "author": author, + "incident": related_incident, "escalation_chain": escalation_chain_data, "schedule": schedule, "webhook": webhook, @@ -425,6 +442,14 @@ class AlertGroupLogRecord(models.Model): result += f'triggered step "Notify on-call from Schedule {schedule_text}{important_text}"' elif escalation_policy_step == EscalationPolicy.STEP_REPEAT_ESCALATION_N_TIMES: result += "escalation started from the beginning" + elif escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT: + organization = self.alert_group.channel.organization + incident_data = self.render_incident_data_from_step_info(organization, step_specific_info) + incident_link = incident_data["incident_link"] + incident_title = incident_data["incident_title"] + tag = "related_incident" if substitute_with_tag else False + incident_text = self._make_log_line_link(incident_link, incident_title, html, for_slack, tag) + result += self.reason + f": {incident_text}" 
def render_incident_data_from_step_info(
    self, organization: "Organization", step_specific_info: dict
) -> RelatedIncidentData | None:
    """Build the incident link/title pair stored in a log record's step info.

    Returns ``None`` when ``step_specific_info`` is empty or lacks either the
    ``incident_title`` or the ``incident_id`` key. The link is ``None`` when
    the stored incident id is falsy; the title falls back to
    ``DEFAULT_BACKUP_TITLE`` when the stored title is falsy.
    """
    from apps.alerts.models.related_incident import get_incident_url

    if not step_specific_info:
        return None
    if "incident_title" not in step_specific_info or "incident_id" not in step_specific_info:
        return None

    stored_id = step_specific_info["incident_id"]
    link = get_incident_url(organization, stored_id) if stored_id else None

    title = step_specific_info["incident_title"]
    if not title:
        title = DEFAULT_BACKUP_TITLE

    return {"incident_link": link, "incident_title": title}
def get_incident_url(organization, incident_id) -> str:
    """Return the URL of an incident page in the organization's Grafana instance.

    The incident path is resolved relative to ``organization.grafana_url``.

    ``urljoin`` replaces the last path segment of a base URL that does not end
    with a slash, so a Grafana URL with a sub-path such as
    ``https://host/grafana`` would lose ``/grafana``. The base URL is
    therefore normalized to end with ``/`` before joining. URLs without a
    path, or already ending with ``/``, produce the same result as before.
    """
    base_url = organization.grafana_url or ""
    if base_url and not base_url.endswith("/"):
        base_url += "/"
    return urljoin(base_url, f"a/{PluginID.INCIDENT}/incidents/{incident_id}")
import logging

from django.conf import settings

from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
from common.incident_api.client import (
    DEFAULT_INCIDENT_SEVERITY,
    DEFAULT_INCIDENT_STATUS,
    IncidentAPIClient,
    IncidentAPIException,
)

logger = logging.getLogger(__name__)

ATTACHMENT_CAPTION = "OnCall Alert Group"
ERROR_SEVERITY_NOT_FOUND = "Severity.FindOne: not found"
MAX_RETRIES = 1 if settings.DEBUG else 10
MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT = 5


def _attach_alert_group_to_incident(alert_group, incident_id, incident_title, escalation_policy, attached=False):
    """Link the alert group to the incident and record it in the alert group log.

    Gets or creates the ``RelatedIncident`` row for ``incident_id`` in the
    alert group's organization, adds the alert group to it, and writes an
    ``ESCALATION_TRIGGERED`` log record carrying the incident id and title.
    ``attached`` only selects the log reason text ("attached to existing
    incident" vs. "incident declared").
    """
    from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy, RelatedIncident

    declared_incident, _ = RelatedIncident.objects.get_or_create(
        incident_id=incident_id,
        organization=alert_group.channel.organization,
        defaults={
            "channel_filter": alert_group.channel_filter,
        },
    )
    declared_incident.attached_alert_groups.add(alert_group)
    reason = "attached to existing incident" if attached else "incident declared"
    AlertGroupLogRecord.objects.create(
        type=AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED,
        reason=reason,
        alert_group=alert_group,
        step_specific_info={"incident_id": incident_id, "incident_title": incident_title},
        escalation_policy=escalation_policy,
        escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
    )


def _create_error_log_record(alert_group, escalation_policy, reason=""):
    """Write an ``ESCALATION_FAILED`` log record for a failed declare-incident step."""
    from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy

    AlertGroupLogRecord.objects.create(
        type=AlertGroupLogRecord.TYPE_ESCALATION_FAILED,
        escalation_error_code=AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED,
        reason=reason,
        alert_group=alert_group,
        escalation_policy=escalation_policy,
        escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
    )


@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
def declare_incident(alert_group_pk, escalation_policy_pk, severity=None):
    """Declare an incident for the alert group, or attach it to an active one.

    Flow:
      1. Alert groups without a route, or on a default route, get an error log
         record and the task stops.
      2. When the retry budget is used up, an error log record is written and
         the task stops.
      3. If the route has an active ``RelatedIncident`` whose remote status is
         still ``DEFAULT_INCIDENT_STATUS``, the alert group is associated with
         it (and added as an incident activity while fewer than
         ``MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT`` alert groups are attached).
      4. Otherwise a new incident is created. If the incident API rejects the
         severity, the task is re-queued once with the default severity.

    Args:
        alert_group_pk: primary key of the alert group.
        escalation_policy_pk: primary key of the escalation policy (may be falsy).
        severity: incident severity; ``EscalationPolicy.SEVERITY_SET_FROM_LABEL``
            means "take it from the alert group's ``severity`` label".

    Raises:
        IncidentAPIException: re-raised for API errors that should trigger a retry.
    """
    from apps.alerts.models import AlertGroup, EscalationPolicy, RelatedIncident

    alert_group = AlertGroup.objects.get(pk=alert_group_pk)
    organization = alert_group.channel.organization
    escalation_policy = None
    if escalation_policy_pk:
        escalation_policy = EscalationPolicy.objects.filter(pk=escalation_policy_pk).first()

    channel_filter = alert_group.channel_filter
    if channel_filter is None:
        # the alert group has no route; dereferencing it would raise AttributeError on every
        # retry and end with a reason-less failure record
        _create_error_log_record(
            alert_group, escalation_policy, reason="Declare incident step requires an existing route"
        )
        return

    if channel_filter.is_default:
        _create_error_log_record(
            alert_group, escalation_policy, reason="Declare incident step is not enabled for default routes"
        )
        return

    if declare_incident.request.retries >= MAX_RETRIES:
        # last attempt: report the failure instead of trying (and raising) again
        _create_error_log_record(alert_group, escalation_policy)
        return

    incident_client = IncidentAPIClient(organization.grafana_url, organization.api_token)

    # check for currently active related incident in the same route (channel_filter)
    existing_incident = (
        RelatedIncident.objects.filter(organization=organization, channel_filter=channel_filter, is_active=True)
        .order_by("-created_at")
        .first()
    )

    if existing_incident:
        incident_id = existing_incident.incident_id
        try:
            # get existing incident details
            incident_data, _ = incident_client.get_incident(incident_id)
        except IncidentAPIException as e:
            logger.error("Error getting incident details: %s", e.msg)
            if e.status != 404:
                # raise (and retry)
                raise
            # incident not found, mark as not opened
            existing_incident.is_active = False
            existing_incident.save(update_fields=["is_active"])
        else:
            # incident exists, check if it is still active
            if incident_data["status"] == DEFAULT_INCIDENT_STATUS:
                # attach to incident context
                incident_title = incident_data["title"]
                num_attached = existing_incident.attached_alert_groups.count()
                if num_attached < MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT:
                    try:
                        incident_client.add_activity(incident_id, alert_group.web_link)
                    except IncidentAPIException as e:
                        # best effort: the association below is still recorded
                        logger.error("Error attaching to existing incident: %s", e.msg)
                # setup association between alert group and incident (even if not attached)
                _attach_alert_group_to_incident(
                    alert_group, incident_id, incident_title, escalation_policy, attached=True
                )
            else:
                existing_incident.is_active = False
                existing_incident.save(update_fields=["is_active"])

    if existing_incident is None or not existing_incident.is_active:
        # create new incident
        if severity == EscalationPolicy.SEVERITY_SET_FROM_LABEL:
            severity_label = alert_group.labels.filter(key_name="severity").first()
            severity = severity_label.value_name if severity_label else None
        severity = severity or DEFAULT_INCIDENT_SEVERITY
        try:
            incident_data, _ = incident_client.create_incident(
                alert_group.web_title_cache if alert_group.web_title_cache else DEFAULT_BACKUP_TITLE,
                severity=severity,
                attachCaption=ATTACHMENT_CAPTION,
                attachURL=alert_group.web_link,
            )
        except IncidentAPIException as e:
            logger.error("Error creating new incident: %s", e.msg)
            # NOTE(review): e.msg is assumed to be a string; guarded in case the client leaves it empty
            error_message = (e.msg or "").lower()
            if ERROR_SEVERITY_NOT_FOUND.lower() in error_message and severity != DEFAULT_INCIDENT_SEVERITY:
                # invalid severity, retry with default severity
                declare_incident.apply_async(
                    args=(alert_group_pk, escalation_policy_pk),
                    kwargs={"severity": DEFAULT_INCIDENT_SEVERITY},
                )
                return
            # else raise (and retry)
            raise
        else:
            _attach_alert_group_to_incident(
                alert_group, incident_data["incidentID"], incident_data["title"], escalation_policy
            )
@pytest.mark.django_db
def test_escalation_step_declare_incident(
    escalation_step_test_setup,
    make_escalation_policy,
):
    """The step queues a task when enabled and logs a failure when disabled."""
    organization, _, _, channel_filter, alert_group, reason = escalation_step_test_setup
    enabled_check_path = (
        "apps.alerts.escalation_snapshot.snapshot_classes.escalation_policy_snapshot.is_declare_incident_step_enabled"
    )

    step = make_escalation_policy(
        escalation_chain=channel_filter.escalation_chain,
        escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
    )
    snapshot = get_escalation_policy_snapshot_from_model(step)
    expected_eta = timezone.now() + timezone.timedelta(seconds=NEXT_ESCALATION_DELAY)

    # step enabled: the task is queued and no log record is written
    with patch.object(EscalationPolicySnapshot, "_execute_tasks") as mocked_execute_tasks, patch(
        enabled_check_path, return_value=True
    ):
        result = snapshot.execute(alert_group, reason)

    expected_result = EscalationPolicySnapshot.StepExecutionResultData(
        eta=result.eta,
        stop_escalation=False,
        pause_escalation=False,
        start_from_beginning=False,
    )
    tolerance = timezone.timedelta(seconds=15)
    assert expected_eta + tolerance > result.eta > expected_eta - tolerance
    assert result == expected_result
    assert not alert_group.log_records.exists()
    mocked_execute_tasks.assert_called_once()

    # step disabled: nothing is queued and a failure record is written
    with patch.object(EscalationPolicySnapshot, "_execute_tasks") as mocked_execute_tasks, patch(
        enabled_check_path, return_value=False
    ):
        snapshot.execute(alert_group, reason)

    mocked_execute_tasks.assert_not_called()
    assert alert_group.log_records.exists()
    failure_record = alert_group.log_records.get()
    assert failure_record.type == AlertGroupLogRecord.TYPE_ESCALATION_FAILED
    assert (
        failure_record.escalation_error_code
        == AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED
    )
@pytest.fixture
def setup_alert_group_and_escalation_step(
    make_organization,
    make_alert_receive_channel,
    make_alert_group,
    make_channel_filter,
    make_escalation_chain,
    make_escalation_policy,
):
    """Factory fixture: build an alert group routed through a declare-incident step.

    The returned callable yields ``(alert_group, declare_incident_step,
    declared_incident)``; ``declared_incident`` is ``None`` unless
    ``already_declared_incident`` is set.
    """

    def _build(is_default_route=False, already_declared_incident=False):
        org = make_organization(grafana_url="https://stack.grafana.net", api_token="token")
        channel = make_alert_receive_channel(organization=org)
        chain = make_escalation_chain(org)
        step = make_escalation_policy(
            escalation_chain=chain,
            escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
        )
        route = make_channel_filter(
            channel,
            escalation_chain=chain,
            is_default=is_default_route,
        )
        group = make_alert_group(
            alert_receive_channel=channel,
            channel_filter=route,
        )

        if not already_declared_incident:
            return group, step, None

        incident = RelatedIncident.objects.create(
            incident_id="123",
            organization=org,
            channel_filter=route,
        )
        return group, step, incident

    return _build
test_declare_incident_default_route(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(is_default_route=True) + + declare_incident(alert_group.pk, declare_incident_step.pk) + + alert_group.refresh_from_db() + # check triggered log + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_FAILED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info is None + assert log_record.reason == "Declare incident step is not enabled for default routes" + assert log_record.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_ok(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False) + + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_create_incident.return_value = {"incidentID": "123", "title": "Incident"}, None + declare_incident(alert_group.pk, declare_incident_step.pk) + + mock_create_incident.assert_called_with( + DEFAULT_BACKUP_TITLE, + severity=DEFAULT_INCIDENT_SEVERITY, + attachCaption=ATTACHMENT_CAPTION, + attachURL=alert_group.web_link, + ) + + alert_group.refresh_from_db() + + # check declared incident + new_incident = alert_group.related_incidents.get() + assert new_incident.incident_id == "123" + assert new_incident.organization == alert_group.channel.organization + assert new_incident.channel_filter == alert_group.channel_filter + # check triggered log + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == 
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_declare_incident_set_severity(setup_alert_group_and_escalation_step):
    """An explicit severity is forwarded unchanged to the incident API."""
    alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
    severity = "critical"
    api_response = ({"incidentID": "123", "title": "Incident"}, None)

    with patch(
        "common.incident_api.client.IncidentAPIClient.create_incident", return_value=api_response
    ) as mock_create_incident:
        declare_incident(alert_group.pk, declare_incident_step.pk, severity=severity)

    mock_create_incident.assert_called_with(
        DEFAULT_BACKUP_TITLE,
        severity=severity,
        attachCaption=ATTACHMENT_CAPTION,
        attachURL=alert_group.web_link,
    )
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_declare_incident_invalid_severity_fallback(setup_alert_group_and_escalation_step):
    """A rejected severity re-queues the task with the default severity."""
    alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
    severity = "INVALID"
    severity_error = IncidentAPIException(status=500, url="some-url", msg=ERROR_SEVERITY_NOT_FOUND)

    with patch(
        "common.incident_api.client.IncidentAPIClient.create_incident", side_effect=severity_error
    ) as mock_create_incident, patch.object(declare_incident, "apply_async") as mock_declare_incident_apply_async:
        declare_incident(alert_group.pk, declare_incident_step.pk, severity=severity)

    # create call failing with invalid severity
    mock_create_incident.assert_called_with(
        DEFAULT_BACKUP_TITLE,
        severity=severity,
        attachCaption=ATTACHMENT_CAPTION,
        attachURL=alert_group.web_link,
    )
    # new task is queued with default severity instead
    mock_declare_incident_apply_async.assert_called_with(
        args=(alert_group.pk, declare_incident_step.pk),
        kwargs={"severity": DEFAULT_INCIDENT_SEVERITY},
    )
existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists() + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"} + assert log_record.reason == "attached to existing incident" + assert log_record.escalation_error_code is None + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_resolved_update(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step( + already_declared_incident=True + ) + incident_id = existing_open_incident.incident_id + new_incident_id = "333" + assert new_incident_id != incident_id + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_get_incident.return_value = { + "incidentID": incident_id, + "title": "Incident1", + "status": "resolved", + }, None + mock_create_incident.return_value = {"incidentID": new_incident_id, "title": "Incident2"}, None + declare_incident(alert_group.pk, declare_incident_step.pk) + + existing_open_incident.refresh_from_db() + + assert existing_open_incident.is_active is False + # check declared incident + assert not existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists() + assert alert_group.related_incidents.get().incident_id == new_incident_id + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == 
EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": new_incident_id, "incident_title": "Incident2"} + assert log_record.reason == "incident declared" + assert log_record.escalation_error_code is None + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_attach_alert_group_skip_incident_update( + setup_alert_group_and_escalation_step, make_alert_group +): + alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step( + already_declared_incident=True + ) + alert_receive_channel = alert_group.channel + channel_filter = alert_group.channel_filter + incident_id = existing_open_incident.incident_id + + # attach max alert groups to incident + for _ in range(MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT): + ag = make_alert_group(alert_receive_channel=alert_receive_channel, channel_filter=channel_filter) + existing_open_incident.attached_alert_groups.add(ag) + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + with patch("common.incident_api.client.IncidentAPIClient.add_activity") as mock_add_activity: + mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None + declare_incident(alert_group.pk, declare_incident_step.pk) + + assert not mock_add_activity.called + + # check declared incident + assert existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists() + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"} + assert log_record.reason == "attached to existing incident" + assert log_record.escalation_error_code is None + + 
+@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_get_existing_incident_error(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step( + already_declared_incident=True + ) + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + mock_get_incident.side_effect = IncidentAPIException(status=500, url="some-url") + with pytest.raises(IncidentAPIException): + declare_incident(alert_group.pk, declare_incident_step.pk) + + # but if incident was not found, a new one should be created + incident_id = existing_open_incident.incident_id + new_incident_id = "333" + assert new_incident_id != incident_id + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_get_incident.side_effect = IncidentAPIException(status=404, url="some-url") + mock_create_incident.return_value = {"incidentID": new_incident_id, "title": "Incident"}, None + declare_incident(alert_group.pk, declare_incident_step.pk) + + alert_group.refresh_from_db() + + # check declared incident + assert not existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists() + new_incident = alert_group.related_incidents.get() + assert new_incident != existing_open_incident + assert new_incident.incident_id == new_incident_id + assert new_incident.organization == alert_group.channel.organization + assert new_incident.channel_filter == alert_group.channel_filter + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_attach_alert_group_error(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step( + already_declared_incident=True + ) + incident_id = 
existing_open_incident.incident_id + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + with patch("common.incident_api.client.IncidentAPIClient.add_activity") as mock_add_activity: + mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None + mock_add_activity.side_effect = IncidentAPIException(status=500, url="some-url") + declare_incident(alert_group.pk, declare_incident_step.pk) + + alert_group.refresh_from_db() + + # incident attachment failed, but DB is still updated + assert existing_open_incident.attached_alert_groups.filter(id=alert_group.id).exists() + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"} + assert log_record.reason == "attached to existing incident" + assert log_record.escalation_error_code is None + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_create_incident_error(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False) + + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_create_incident.side_effect = IncidentAPIException(status=500, url="some-url") + with pytest.raises(IncidentAPIException): + declare_incident(alert_group.pk, declare_incident_step.pk) diff --git a/engine/apps/alerts/utils.py b/engine/apps/alerts/utils.py index abf6b24c..5317c22b 100644 --- a/engine/apps/alerts/utils.py +++ b/engine/apps/alerts/utils.py @@ -1,3 +1,11 @@ +import typing + +from django.conf import settings + +if typing.TYPE_CHECKING: + from 
apps.user_management.models import Organization + + def render_relative_timeline(log_created_at, alert_group_started_at): time_delta = log_created_at - alert_group_started_at seconds = int(time_delta.total_seconds()) @@ -12,3 +20,7 @@ def render_relative_timeline(log_created_at, alert_group_started_at): return "%dm%ds" % (minutes, seconds) else: return "%ds" % (seconds,) + + +def is_declare_incident_step_enabled(organization: "Organization") -> bool: + return organization.is_grafana_incident_enabled and settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED diff --git a/engine/apps/api/serializers/escalation_policy.py b/engine/apps/api/serializers/escalation_policy.py index 75f36284..f8b0270d 100644 --- a/engine/apps/api/serializers/escalation_policy.py +++ b/engine/apps/api/serializers/escalation_policy.py @@ -3,6 +3,7 @@ from datetime import timedelta from rest_framework import serializers from apps.alerts.models import EscalationChain, EscalationPolicy +from apps.alerts.utils import is_declare_incident_step_enabled from apps.schedules.models import OnCallSchedule from apps.slack.models import SlackUserGroup from apps.user_management.models import Team, User @@ -24,6 +25,7 @@ TO_TIME = "to_time" NUM_ALERTS_IN_WINDOW = "num_alerts_in_window" NUM_MINUTES_IN_WINDOW = "num_minutes_in_window" CUSTOM_WEBHOOK_TRIGGER = "custom_webhook" +SEVERITY = "severity" STEP_TYPE_TO_RELATED_FIELD_MAP = { EscalationPolicy.STEP_WAIT: [WAIT_DELAY], @@ -35,6 +37,7 @@ STEP_TYPE_TO_RELATED_FIELD_MAP = { EscalationPolicy.STEP_NOTIFY_IF_TIME: [FROM_TIME, TO_TIME], EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: [NUM_ALERTS_IN_WINDOW, NUM_MINUTES_IN_WINDOW], EscalationPolicy.STEP_TRIGGER_CUSTOM_WEBHOOK: [CUSTOM_WEBHOOK_TRIGGER], + EscalationPolicy.STEP_DECLARE_INCIDENT: [SEVERITY], } @@ -81,6 +84,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer) allow_null=True, filter_field="organization", ) + severity = serializers.CharField(required=False, 
allow_null=True) class Meta: model = EscalationPolicy @@ -99,6 +103,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer) "notify_schedule", "notify_to_group", "notify_to_team_members", + "severity", "important", ] @@ -123,6 +128,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer) NUM_ALERTS_IN_WINDOW, NUM_MINUTES_IN_WINDOW, CUSTOM_WEBHOOK_TRIGGER, + SEVERITY, ] step = data.get("step") @@ -151,6 +157,8 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer) raise serializers.ValidationError("Invalid step value") if step_type in EscalationPolicy.SLACK_INTEGRATION_REQUIRED_STEPS and organization.slack_team_identity is None: raise serializers.ValidationError("Invalid escalation step type: step is Slack-specific") + if step_type == EscalationPolicy.STEP_DECLARE_INCIDENT and not is_declare_incident_step_enabled(organization): + raise serializers.ValidationError("Invalid escalation step type: step is not enabled") return step_type def to_representation(self, instance): @@ -214,6 +222,7 @@ class EscalationPolicyUpdateSerializer(EscalationPolicySerializer): NUM_ALERTS_IN_WINDOW, NUM_MINUTES_IN_WINDOW, CUSTOM_WEBHOOK_TRIGGER, + SEVERITY, ] for f in STEP_TYPE_TO_RELATED_FIELD_MAP.get(step, []): diff --git a/engine/apps/api/tests/test_alert_group.py b/engine/apps/api/tests/test_alert_group.py index a015fccc..8ee438b6 100644 --- a/engine/apps/api/tests/test_alert_group.py +++ b/engine/apps/api/tests/test_alert_group.py @@ -975,6 +975,37 @@ def test_get_filter_labels( assert response.json()["results"][0]["pk"] == alert_groups[0].public_primary_key +@pytest.mark.django_db +def test_get_filter_by_related_incident( + alert_group_internal_api_setup, make_related_incident, make_alert_group, make_user_auth_headers +): + user, token, alert_groups = alert_group_internal_api_setup + + alert_group = alert_groups[0] + related_incident = make_related_incident("1", alert_group.channel.organization, 
alert_group.channel_filter) + related_incident.attached_alert_groups.add(alert_group) + + client = APIClient() + url = reverse("api-internal:alertgroup-list") + response = client.get( + url + "?has_related_incident=true", + format="json", + **make_user_auth_headers(user, token), + ) + + assert response.status_code == status.HTTP_200_OK + assert len(response.data["results"]) == 1 + + response = client.get( + url + "?has_related_incident=false", + format="json", + **make_user_auth_headers(user, token), + ) + + assert response.status_code == status.HTTP_200_OK + assert len(response.data["results"]) == 3 + + @pytest.mark.django_db def test_get_title_search( settings, diff --git a/engine/apps/api/tests/test_escalation_policy.py b/engine/apps/api/tests/test_escalation_policy.py index 0a5f719e..0c1b3299 100644 --- a/engine/apps/api/tests/test_escalation_policy.py +++ b/engine/apps/api/tests/test_escalation_policy.py @@ -10,6 +10,7 @@ from rest_framework.test import APIClient from apps.alerts.models import EscalationPolicy from apps.api.permissions import LegacyAccessControlRole +from common.incident_api.client import DEFAULT_INCIDENT_SEVERITY, IncidentAPIException @pytest.fixture() @@ -651,8 +652,13 @@ def test_create_escalation_policy_with_no_important_version( make_escalation_chain, step, make_user_auth_headers, + settings, ): organization, user, _, _ = make_organization_and_user_with_slack_identities() + # make sure declare incident step is enabled + settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True + organization.is_grafana_incident_enabled = True + organization.save() _, token = make_token_for_organization(organization) escalation_chain = make_escalation_chain(organization) @@ -832,6 +838,7 @@ def test_escalation_policy_switch_importance( "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "important": True, "wait_delay": None, } @@ -889,6 +896,7 @@ def test_escalation_policy_filter_by_user( "notify_schedule": 
None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "important": False, }, { @@ -906,6 +914,7 @@ def test_escalation_policy_filter_by_user( "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "important": False, }, ] @@ -971,6 +980,7 @@ def test_escalation_policy_filter_by_slack_channel( "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "important": False, }, ] @@ -1001,3 +1011,88 @@ def test_escalation_policy_escalation_options_webhooks( returned_options = [option["value"] for option in response.json()] assert EscalationPolicy.STEP_TRIGGER_CUSTOM_WEBHOOK in returned_options + + +@pytest.mark.django_db +def test_escalation_policy_severity_options( + make_organization_and_user_with_plugin_token, + make_user_auth_headers, +): + organization, user, token = make_organization_and_user_with_plugin_token() + organization.is_grafana_labels_enabled = False + organization.save() + + client = APIClient() + url = reverse("api-internal:escalation_policy-severity-options") + + # without labels enabled + available_severities = [ + {"severityID": "abc", "orgID": "1", "displayLabel": "Pending", "level": -1}, + {"severityID": "def", "orgID": "1", "displayLabel": "Critical", "level": 1}, + ] + with patch("common.incident_api.client.IncidentAPIClient.get_severities") as mock_get_severities: + mock_get_severities.return_value = available_severities, None + response = client.get(url, format="json", **make_user_auth_headers(user, token)) + + expected_options = [{"value": s["displayLabel"], "display_name": s["displayLabel"]} for s in available_severities] + assert response.json() == expected_options + + # failing request does not break; fallback to default option only + with patch("common.incident_api.client.IncidentAPIClient.get_severities") as mock_get_severities: + mock_get_severities.side_effect = IncidentAPIException(status=404, url="some-url") + 
response = client.get(url, format="json", **make_user_auth_headers(user, token)) + + fallback_options = [{"value": DEFAULT_INCIDENT_SEVERITY, "display_name": DEFAULT_INCIDENT_SEVERITY}] + assert response.json() == fallback_options + + # labels enabled + organization.is_grafana_labels_enabled = True + organization.save() + + with patch("common.incident_api.client.IncidentAPIClient.get_severities") as mock_get_severities: + mock_get_severities.return_value = available_severities, None + response = client.get(url, format="json", **make_user_auth_headers(user, token)) + # include set from label option + expected_options = [ + { + "value": EscalationPolicy.SEVERITY_SET_FROM_LABEL, + "display_name": EscalationPolicy.SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE, + } + ] + expected_options + assert response.json() == expected_options + + +@pytest.mark.django_db +def test_create_escalation_policy_declare_incident( + escalation_policy_internal_api_setup, make_user_auth_headers, settings +): + token, escalation_chain, _, user, _ = escalation_policy_internal_api_setup + organization = escalation_chain.organization + client = APIClient() + url = reverse("api-internal:escalation_policy-list") + + data = { + "step": EscalationPolicy.STEP_DECLARE_INCIDENT, + "severity": "critical", + "escalation_chain": escalation_chain.public_primary_key, + } + + response = client.post(url, data, format="json", **make_user_auth_headers(user, token)) + assert response.status_code == status.HTTP_400_BAD_REQUEST + + # make sure declare incident step is enabled + settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True + organization.is_grafana_incident_enabled = True + organization.save() + + response = client.post(url, data, format="json", **make_user_auth_headers(user, token)) + assert response.status_code == status.HTTP_201_CREATED + escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"]) + assert escalation_policy.step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert 
escalation_policy.severity == "critical" + + url = reverse("api-internal:escalation_policy-detail", kwargs={"pk": escalation_policy.public_primary_key}) + response = client.get(url, format="json", **make_user_auth_headers(user, token)) + response_data = response.json() + assert response_data["step"] == EscalationPolicy.STEP_DECLARE_INCIDENT + assert response_data["severity"] == "critical" diff --git a/engine/apps/api/views/alert_group.py b/engine/apps/api/views/alert_group.py index 22d0be4a..c937a784 100644 --- a/engine/apps/api/views/alert_group.py +++ b/engine/apps/api/views/alert_group.py @@ -17,6 +17,7 @@ from apps.alerts.constants import ActionSource from apps.alerts.models import AlertGroup, AlertReceiveChannel, EscalationChain, ResolutionNote from apps.alerts.paging import unpage_user from apps.alerts.tasks import delete_alert_group, send_update_resolution_note_signal +from apps.alerts.utils import is_declare_incident_step_enabled from apps.api.errors import AlertGroupAPIError from apps.api.label_filtering import parse_label_query from apps.api.permissions import RBACPermission @@ -120,6 +121,7 @@ class AlertGroupFilter(DateRangeFilterMixin, ModelFieldFilterMixin, filters.Filt ) with_resolution_note = filters.BooleanFilter(method="filter_with_resolution_note") mine = filters.BooleanFilter(method="filter_mine") + has_related_incident = filters.BooleanFilter(field_name="related_incidents", lookup_expr="isnull", exclude=True) def filter_status(self, queryset, name, value): if not value: @@ -719,6 +721,7 @@ class AlertGroupView( """ Retrieve a list of valid filter options that can be used to filter alert groups """ + organization = self.request.auth.organization api_root = "/api/internal/v1/" default_day_range = 30 @@ -804,7 +807,7 @@ class AlertGroupView( filter_options = [{"name": "search", "type": "search", "description": description}] + filter_options - if is_labels_feature_enabled(self.request.auth.organization): + if 
is_labels_feature_enabled(organization): filter_options.append( { "name": "label", @@ -813,6 +816,15 @@ class AlertGroupView( } ) + if is_declare_incident_step_enabled(organization): + filter_options.append( + { + "name": "has_related_incident", + "type": "boolean", + "default": "true", + } + ) + return Response(filter_options) @extend_schema( diff --git a/engine/apps/api/views/escalation_policy.py b/engine/apps/api/views/escalation_policy.py index 2cb288be..eb502b5c 100644 --- a/engine/apps/api/views/escalation_policy.py +++ b/engine/apps/api/views/escalation_policy.py @@ -1,3 +1,5 @@ +import logging + from django.conf import settings from django.db.models import Q from rest_framework.decorators import action @@ -5,6 +7,7 @@ from rest_framework.permissions import IsAuthenticated from rest_framework.response import Response from apps.alerts.models import EscalationPolicy +from apps.alerts.utils import is_declare_incident_step_enabled from apps.api.permissions import RBACPermission from apps.api.serializers.escalation_policy import ( EscalationPolicyCreateSerializer, @@ -19,9 +22,12 @@ from common.api_helpers.mixins import ( TeamFilteringMixin, UpdateSerializerMixin, ) +from common.incident_api.client import DEFAULT_INCIDENT_SEVERITY, IncidentAPIClient, IncidentAPIException from common.insight_log import EntityEvent, write_resource_insight_log from common.ordered_model.viewset import OrderedModelViewSet +logger = logging.getLogger(__name__) + class EscalationPolicyView( TeamFilteringMixin, @@ -42,6 +48,7 @@ class EscalationPolicyView( "escalation_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ], "delay_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ], "num_minutes_in_window_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ], + "severity_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ], "create": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE], "update": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE], 
"partial_update": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE], @@ -116,6 +123,7 @@ class EscalationPolicyView( @action(detail=False, methods=["get"]) def escalation_options(self, request): + grafana_declare_incident_enabled = is_declare_incident_step_enabled(organization=self.request.auth.organization) choices = [] for step in EscalationPolicy.INTERNAL_API_STEPS: verbal = EscalationPolicy.INTERNAL_API_STEPS_TO_VERBAL_MAP[step] @@ -126,7 +134,7 @@ class EscalationPolicyView( if slack_integration_required and not settings.FEATURE_SLACK_INTEGRATION_ENABLED: continue - if step == EscalationPolicy.STEP_DECLARE_INCIDENT: + if step == EscalationPolicy.STEP_DECLARE_INCIDENT and not grafana_declare_incident_enabled: continue choices.append( @@ -155,3 +163,25 @@ class EscalationPolicyView( {"value": choice[0], "display_name": choice[1]} for choice in EscalationPolicy.WEB_DURATION_CHOICES_MINUTES ] return Response(choices) + + @action(detail=False, methods=["get"]) + def severity_options(self, request): + organization = self.request.auth.organization + choices = [] + if organization.is_grafana_labels_enabled: + choices = [ + { + "value": EscalationPolicy.SEVERITY_SET_FROM_LABEL, + "display_name": EscalationPolicy.SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE, + } + ] + incident_client = IncidentAPIClient(organization.grafana_url, organization.api_token) + try: + severities, _ = incident_client.get_severities() + choices += [ + {"value": severity["displayLabel"], "display_name": severity["displayLabel"]} for severity in severities + ] + except IncidentAPIException as e: + logger.error(f"Error getting severities: {e.msg}") + choices += [{"value": DEFAULT_INCIDENT_SEVERITY, "display_name": DEFAULT_INCIDENT_SEVERITY}] + return Response(choices) diff --git a/engine/apps/public_api/serializers/escalation_policies.py b/engine/apps/public_api/serializers/escalation_policies.py index ba40ff30..54fb35ad 100644 --- a/engine/apps/public_api/serializers/escalation_policies.py +++ 
b/engine/apps/public_api/serializers/escalation_policies.py @@ -5,6 +5,7 @@ from django.utils.functional import cached_property from rest_framework import fields, serializers from apps.alerts.models import EscalationChain, EscalationPolicy +from apps.alerts.utils import is_declare_incident_step_enabled from apps.schedules.models import OnCallSchedule from apps.slack.models import SlackUserGroup from apps.user_management.models import Team, User @@ -72,6 +73,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer): required=False, source="custom_webhook", ) + severity = serializers.CharField(required=False) important = serializers.BooleanField(required=False) TIME_FORMAT = "%H:%M:%SZ" @@ -101,6 +103,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer): "notify_if_time_to", "num_alerts_in_window", "num_minutes_in_window", + "severity", ] PREFETCH_RELATED = ["notify_to_users_queue"] @@ -120,6 +123,9 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer): if step_type == EscalationPolicy.STEP_FINAL_NOTIFYALL and organization.slack_team_identity is None: raise BadRequest(detail="Invalid escalation step type: step is Slack-specific") + if step_type == EscalationPolicy.STEP_DECLARE_INCIDENT and not is_declare_incident_step_enabled(organization): + raise BadRequest("Invalid escalation step type: step is not enabled") + return step_type def create(self, validated_data): @@ -163,6 +169,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer): "notify_if_time_to", "num_alerts_in_window", "num_minutes_in_window", + "severity", ] if step == EscalationPolicy.STEP_WAIT: fields_to_remove.remove("duration") @@ -190,6 +197,8 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer): elif step == EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: fields_to_remove.remove("num_alerts_in_window") fields_to_remove.remove("num_minutes_in_window") + elif step == 
EscalationPolicy.STEP_DECLARE_INCIDENT: + fields_to_remove.remove("severity") if ( step in EscalationPolicy.DEFAULT_TO_IMPORTANT_STEP_MAPPING @@ -213,6 +222,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer): "to_time", "num_alerts_in_window", "num_minutes_in_window", + "severity", ] step = validated_data.get("step") important = validated_data.pop("important", None) @@ -243,6 +253,8 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer): elif step == EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: validated_data_fields_to_remove.remove("num_alerts_in_window") validated_data_fields_to_remove.remove("num_minutes_in_window") + elif step == EscalationPolicy.STEP_DECLARE_INCIDENT: + validated_data_fields_to_remove.remove("severity") for field in validated_data_fields_to_remove: validated_data.pop(field, None) @@ -299,5 +311,7 @@ class EscalationPolicyUpdateSerializer(EscalationPolicySerializer): if step != EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: instance.num_alerts_in_window = None instance.num_minutes_in_window = None + if step != EscalationPolicy.STEP_DECLARE_INCIDENT: + instance.severity = None return super().update(instance, validated_data) diff --git a/engine/apps/public_api/tests/test_escalation_policies.py b/engine/apps/public_api/tests/test_escalation_policies.py index 9cf961ac..e1d478da 100644 --- a/engine/apps/public_api/tests/test_escalation_policies.py +++ b/engine/apps/public_api/tests/test_escalation_policies.py @@ -463,3 +463,43 @@ def test_update_escalation_policy_using_notify_team_members( escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"]) serializer = EscalationPolicySerializer(escalation_policy) assert response.data == serializer.data + + +@pytest.mark.django_db +def test_create_escalation_policy_declare_incident( + make_organization_and_user_with_token, + escalation_policies_setup, + settings, +): + organization, user, token = 
make_organization_and_user_with_token() + escalation_chain, _, _ = escalation_policies_setup(organization, user) + + data_for_create = { + "escalation_chain_id": escalation_chain.public_primary_key, + "type": "declare_incident", + "position": 0, + "severity": "critical", + } + + client = APIClient() + url = reverse("api-public:escalation_policies-list") + response = client.post(url, data=data_for_create, format="json", HTTP_AUTHORIZATION=token) + assert response.status_code == status.HTTP_400_BAD_REQUEST + + # make sure declare incident step is enabled + settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True + organization.is_grafana_incident_enabled = True + organization.save() + + response = client.post(url, data=data_for_create, format="json", HTTP_AUTHORIZATION=token) + assert response.status_code == status.HTTP_201_CREATED + + escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"]) + assert escalation_policy.step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert escalation_policy.severity == "critical" + + url = reverse("api-public:escalation_policies-detail", kwargs={"pk": escalation_policy.public_primary_key}) + response = client.get(url, format="json", HTTP_AUTHORIZATION=token) + response_data = response.json() + assert response_data["type"] == EscalationPolicy.PUBLIC_STEP_CHOICES_MAP[EscalationPolicy.STEP_DECLARE_INCIDENT] + assert response_data["severity"] == "critical" diff --git a/engine/conftest.py b/engine/conftest.py index ec655a48..a95383dd 100644 --- a/engine/conftest.py +++ b/engine/conftest.py @@ -35,6 +35,7 @@ from apps.alerts.tests.factories import ( EscalationChainFactory, EscalationPolicyFactory, InvitationFactory, + RelatedIncidentFactory, ResolutionNoteFactory, ResolutionNoteSlackMessageFactory, UserNotificationBundleFactory, @@ -1112,3 +1113,11 @@ def make_user_notification_bundle(): ) return _make_user_notification_bundle + + +@pytest.fixture +def make_related_incident(): + def 
_make_related_incident(incident_id, organization, channel_filter): + return RelatedIncidentFactory(incident_id=incident_id, organization=organization, channel_filter=channel_filter) + + return _make_related_incident diff --git a/engine/settings/base.py b/engine/settings/base.py index e189a5b8..4f0859f0 100644 --- a/engine/settings/base.py +++ b/engine/settings/base.py @@ -75,6 +75,7 @@ FEATURE_LABELS_ENABLED_PER_ORG = getenv_list("FEATURE_LABELS_ENABLED_PER_ORG", d FEATURE_ALERT_GROUP_SEARCH_ENABLED = getenv_boolean("FEATURE_ALERT_GROUP_SEARCH_ENABLED", default=True) FEATURE_ALERT_GROUP_SEARCH_CUTOFF_DAYS = getenv_integer("FEATURE_ALERT_GROUP_SEARCH_CUTOFF_DAYS", default=None) FEATURE_NOTIFICATION_BUNDLE_ENABLED = getenv_boolean("FEATURE_NOTIFICATION_BUNDLE_ENABLED", default=True) +FEATURE_DECLARE_INCIDENT_STEP_ENABLED = getenv_boolean("FEATURE_DECLARE_INCIDENT_STEP_ENABLED", default=False) TWILIO_API_KEY_SID = os.environ.get("TWILIO_API_KEY_SID") TWILIO_API_KEY_SECRET = os.environ.get("TWILIO_API_KEY_SECRET") diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index ed58be1a..29309a71 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -94,6 +94,7 @@ CELERY_TASK_ROUTES = { # CRITICAL "apps.alerts.tasks.acknowledge_reminder.acknowledge_reminder_task": {"queue": "critical"}, "apps.alerts.tasks.acknowledge_reminder.unacknowledge_timeout_task": {"queue": "critical"}, + "apps.alerts.tasks.declare_incident.declare_incident": {"queue": "critical"}, "apps.alerts.tasks.distribute_alert.send_alert_create_signal": {"queue": "critical"}, "apps.alerts.tasks.escalate_alert_group.escalate_alert_group": {"queue": "critical"}, "apps.alerts.tasks.invite_user_to_join_incident.invite_user_to_join_incident": {"queue": "critical"},