Reworked declare incident escalation step (#5130)

Reworked https://github.com/grafana/oncall/pull/5047. Main update is the
switch from FK to a [M2M
relation](https://docs.google.com/document/d/1HeulqxoFShSHtInQrZNJLL5MDlHPNT50rVGaK3zZWvw/edit?disco=AAABVLjV4W8)
(which doesn't really change the original/intended behavior, besides not
needing to alter the alert group table, and it is a bit more flexible;
the extra table shouldn't introduce issues because this is used only for
tracking purposes and the information needed in the log record is
already there).

This avoids a DB migration that alters the alert group table; the generated SQL only creates and constrains new tables:

```
--
-- Create model RelatedIncident
--
CREATE TABLE `alerts_relatedincident` (`id` bigint AUTO_INCREMENT NOT NULL PRIMARY KEY, `incident_id` varchar(50) NOT NULL, `created_at` datetime(6) NOT NULL, `is_active` bool NOT NULL, `channel_filter_id` bigint NULL, `organization_id` bigint NOT NULL);
CREATE TABLE `alerts_relatedincident_attached_alert_groups` (`id` bigint AUTO_INCREMENT NOT NULL PRIMARY KEY, `relatedincident_id` bigint NOT NULL, `alertgroup_id` bigint NOT NULL);
ALTER TABLE `alerts_relatedincident` ADD CONSTRAINT `alerts_relatedincident_organization_id_incident_id_d7fc9a4f_uniq` UNIQUE (`organization_id`, `incident_id`);
ALTER TABLE `alerts_relatedincident` ADD CONSTRAINT `alerts_relatedincide_channel_filter_id_9556c836_fk_alerts_ch` FOREIGN KEY (`channel_filter_id`) REFERENCES `alerts_channelfilter` (`id`);
ALTER TABLE `alerts_relatedincident` ADD CONSTRAINT `alerts_relatedincide_organization_id_74ed6bed_fk_user_mana` FOREIGN KEY (`organization_id`) REFERENCES `user_management_organization` (`id`);
CREATE INDEX `alerts_relatedincident_incident_id_8356a799` ON `alerts_relatedincident` (`incident_id`);
ALTER TABLE `alerts_relatedincident_attached_alert_groups` ADD CONSTRAINT `alerts_relatedincident_a_relatedincident_id_alert_3d683baa_uniq` UNIQUE (`relatedincident_id`, `alertgroup_id`);
ALTER TABLE `alerts_relatedincident_attached_alert_groups` ADD CONSTRAINT `alerts_relatedincide_relatedincident_id_3e5e7a23_fk_alerts_re` FOREIGN KEY (`relatedincident_id`) REFERENCES `alerts_relatedincident` (`id`);
ALTER TABLE `alerts_relatedincident_attached_alert_groups` ADD CONSTRAINT `alerts_relatedincide_alertgroup_id_0125deca_fk_alerts_al` FOREIGN KEY (`alertgroup_id`) REFERENCES `alerts_alertgroup` (`id`);
```
This commit is contained in:
Matias Bordese 2024-10-07 16:26:10 -03:00 committed by GitHub
parent ac7dc97cc3
commit fa815b7ecd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 957 additions and 5 deletions

View file

@ -12,11 +12,13 @@ from apps.alerts.models.alert_group_log_record import AlertGroupLogRecord
from apps.alerts.models.escalation_policy import EscalationPolicy
from apps.alerts.tasks import (
custom_webhook_result,
declare_incident,
notify_all_task,
notify_group_task,
notify_user_task,
resolve_by_last_step_task,
)
from apps.alerts.utils import is_declare_incident_step_enabled
from apps.schedules.ical_utils import list_users_to_notify_from_ical
from apps.user_management.models import User
@ -136,6 +138,7 @@ class EscalationPolicySnapshot:
EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: self._escalation_step_notify_if_num_alerts_in_time_window,
EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS: self._escalation_step_notify_multiple_users,
EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS_IMPORTANT: self._escalation_step_notify_multiple_users,
EscalationPolicy.STEP_DECLARE_INCIDENT: self._escalation_step_declare_incident,
None: self._escalation_step_not_configured,
}
result = action_map[self.step](alert_group, reason)
@ -410,6 +413,32 @@ class EscalationPolicySnapshot:
self._execute_tasks(tasks)
def _escalation_step_declare_incident(self, alert_group: "AlertGroup", _reason: str) -> None:
    """Run the "Declare Incident" escalation step for ``alert_group``.

    When the step is enabled for the alert group's organization, a single
    immutable ``declare_incident`` task signature is built and handed to
    ``self._execute_tasks``. Otherwise an escalation-failed log record is
    saved and no task is scheduled.

    Args:
        alert_group: alert group being escalated.
        _reason: escalation reason; not used by this step.
    """
    step_enabled = is_declare_incident_step_enabled(organization=alert_group.channel.organization)

    if step_enabled:
        signature = declare_incident.signature(
            args=(alert_group.pk,),
            kwargs={
                "escalation_policy_pk": self.id,
                "severity": self.severity,
            },
            immutable=True,
        )
        self._execute_tasks([signature])
        return

    # Step is disabled: leave a trace in the alert group log instead of scheduling work.
    failure_record = AlertGroupLogRecord(
        type=AlertGroupLogRecord.TYPE_ESCALATION_FAILED,
        alert_group=alert_group,
        reason="Declare Incident step is not enabled",
        escalation_policy=self.escalation_policy,
        escalation_error_code=AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED,
        escalation_policy_step=self.step,
    )
    failure_record.save()
def _escalation_step_notify_if_time(self, alert_group: "AlertGroup", _reason: str) -> StepExecutionResultData:
eta = None

View file

@ -0,0 +1,30 @@
# Generated by Django 4.2.15 on 2024-10-04 16:38
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    # Creates the RelatedIncident model together with its many-to-many link to alert groups.

    # Migrations of the user_management and alerts apps this migration depends on.
    dependencies = [
        ('user_management', '0022_alter_team_unique_together'),
        ('alerts', '0059_escalationpolicy_severity_and_more'),
    ]

    operations = [
        migrations.CreateModel(
            name='RelatedIncident',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # Indexed incident identifier (string, up to 50 characters).
                ('incident_id', models.CharField(db_index=True, max_length=50)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('is_active', models.BooleanField(default=True)),
                # Alert groups reach their incidents through the `related_incidents` reverse accessor.
                ('attached_alert_groups', models.ManyToManyField(related_name='related_incidents', to='alerts.alertgroup')),
                # Deleting the channel filter keeps the incident row and nulls this reference.
                ('channel_filter', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='related_incidents', to='alerts.channelfilter')),
                # Deleting the organization deletes its related incidents.
                ('organization', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='related_incidents', to='user_management.organization')),
            ],
            options={
                # An incident id may appear only once per organization.
                'unique_together': {('organization', 'incident_id')},
            },
        ),
    ]

View file

@ -13,6 +13,7 @@ from .escalation_policy import EscalationPolicy # noqa: F401
from .grafana_alerting_contact_point import GrafanaAlertingContactPoint # noqa: F401
from .invitation import Invitation # noqa: F401
from .maintainable_object import MaintainableObject # noqa: F401
from .related_incident import RelatedIncident # noqa: F401
from .resolution_note import ResolutionNote, ResolutionNoteSlackMessage # noqa: F401
from .user_has_notification import UserHasNotification # noqa: F401
from .user_notification_bundle import BundledNotification, UserNotificationBundle # noqa: F401

View file

@ -44,6 +44,7 @@ if typing.TYPE_CHECKING:
AlertGroupLogRecord,
AlertReceiveChannel,
BundledNotification,
RelatedIncident,
ResolutionNote,
ResolutionNoteSlackMessage,
)
@ -193,6 +194,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
acknowledged_by_user: typing.Optional["User"]
alerts: "RelatedManager['Alert']"
bundled_notifications: "RelatedManager['BundledNotification']"
related_incidents: "RelatedManager['RelatedIncident']"
dependent_alert_groups: "RelatedManager['AlertGroup']"
channel: "AlertReceiveChannel"
log_records: "RelatedManager['AlertGroupLogRecord']"

View file

@ -11,18 +11,24 @@ from rest_framework.fields import DateTimeField
from apps.alerts import tasks
from apps.alerts.constants import ActionSource
from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
from apps.alerts.utils import render_relative_timeline
from apps.slack.slack_formatter import SlackFormatter
from common.utils import clean_markup
if typing.TYPE_CHECKING:
from apps.alerts.models import AlertGroup, CustomButton, EscalationPolicy, Invitation
from apps.user_management.models import User
from apps.user_management.models import Organization, User
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
class RelatedIncidentData(typing.TypedDict):
    """Incident link and title rendered from a log record's step-specific info."""

    # URL of the incident, or None when no link is available.
    incident_link: typing.Optional[str]
    # Title to display for the incident.
    incident_title: str
class AlertGroupLogRecord(models.Model):
alert_group: "AlertGroup"
author: typing.Optional["User"]
@ -161,7 +167,9 @@ class AlertGroupLogRecord(models.Model):
ERROR_ESCALATION_TRIGGER_CUSTOM_WEBHOOK_ERROR,
ERROR_ESCALATION_NOTIFY_TEAM_MEMBERS_STEP_IS_NOT_CONFIGURED,
ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED,
) = range(20)
ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED,
ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED,
) = range(22)
type = models.IntegerField(choices=TYPE_CHOICES)
@ -225,7 +233,14 @@ class AlertGroupLogRecord(models.Model):
escalation_policy_step = models.IntegerField(null=True, default=None)
step_specific_info = JSONField(null=True, default=None)
STEP_SPECIFIC_INFO_KEYS = ["schedule_name", "custom_button_name", "usergroup_handle", "source_integration_name"]
STEP_SPECIFIC_INFO_KEYS = [
"schedule_name",
"custom_button_name",
"usergroup_handle",
"source_integration_name",
"incident_id",
"incident_title",
]
def _make_log_line_link(self, url, title, html=False, for_slack=False, substitute_with_tag=False):
if html and url:
@ -244,6 +259,7 @@ class AlertGroupLogRecord(models.Model):
author = self.author.short(organization) if self.author is not None else None
escalation_chain = self.alert_group.channel_filter.escalation_chain if self.alert_group.channel_filter else None
step_info = self.get_step_specific_info()
related_incident = self.render_incident_data_from_step_info(organization, step_info)
escalation_chain_data = (
{
"pk": escalation_chain.public_primary_key,
@ -280,6 +296,7 @@ class AlertGroupLogRecord(models.Model):
"type": self.type,
"created_at": created_at,
"author": author,
"incident": related_incident,
"escalation_chain": escalation_chain_data,
"schedule": schedule,
"webhook": webhook,
@ -425,6 +442,14 @@ class AlertGroupLogRecord(models.Model):
result += f'triggered step "Notify on-call from Schedule {schedule_text}{important_text}"'
elif escalation_policy_step == EscalationPolicy.STEP_REPEAT_ESCALATION_N_TIMES:
result += "escalation started from the beginning"
elif escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT:
organization = self.alert_group.channel.organization
incident_data = self.render_incident_data_from_step_info(organization, step_specific_info)
incident_link = incident_data["incident_link"]
incident_title = incident_data["incident_title"]
tag = "related_incident" if substitute_with_tag else False
incident_text = self._make_log_line_link(incident_link, incident_title, html, for_slack, tag)
result += self.reason + f": {incident_text}"
else:
result += f'triggered step "{EscalationPolicy.get_step_display_name(escalation_policy_step)}"'
elif self.type == AlertGroupLogRecord.TYPE_SILENCE:
@ -640,8 +665,32 @@ class AlertGroupLogRecord(models.Model):
result += f"failed to notify User Group{usergroup_handle_text} in Slack"
elif self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED:
result += 'skipped escalation step "Trigger Outgoing Webhook" because it is disabled'
elif (
self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED
):
result += 'skipped escalation step "Declare Incident": step is not enabled'
elif self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED:
result += "failed to declare an Incident"
if self.reason:
result += f": {self.reason}"
return result
def render_incident_data_from_step_info(
    self, organization: "Organization", step_specific_info: dict
) -> RelatedIncidentData | None:
    """Build the incident link/title pair stored in ``step_specific_info``.

    Args:
        organization: organization passed to ``get_incident_url`` to build the link.
        step_specific_info: step info of a log record; may be empty or None.

    Returns:
        ``None`` when the info is empty or lacks either the ``incident_title``
        or the ``incident_id`` key. Otherwise a dict with ``incident_link``
        (``None`` when the stored id is falsy) and ``incident_title`` (the
        default backup title when the stored title is falsy).
    """
    from apps.alerts.models.related_incident import get_incident_url

    required_keys = ["incident_title", "incident_id"]
    if not step_specific_info or any(key not in step_specific_info for key in required_keys):
        return None

    # Only build a link when there is an actual incident id to point at.
    if step_specific_info["incident_id"]:
        link = get_incident_url(organization, step_specific_info["incident_id"])
    else:
        link = None

    title = step_specific_info["incident_title"] or DEFAULT_BACKUP_TITLE
    return {"incident_link": link, "incident_title": title}
def get_step_specific_info(self):
step_specific_info = None
# in some cases step_specific_info was saved with using json.dumps

View file

@ -92,6 +92,7 @@ class EscalationPolicy(OrderedModel):
STEP_NOTIFY_IF_TIME,
STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW,
STEP_REPEAT_ESCALATION_N_TIMES,
STEP_DECLARE_INCIDENT,
]
# Steps can be stored in db while interacting with internal api
# Includes important versions of default steps
@ -218,6 +219,7 @@ class EscalationPolicy(OrderedModel):
STEP_NOTIFY_IF_TIME,
STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW,
STEP_REPEAT_ESCALATION_N_TIMES,
STEP_DECLARE_INCIDENT,
]
PUBLIC_STEP_CHOICES_MAP = {
@ -239,6 +241,7 @@ class EscalationPolicy(OrderedModel):
STEP_NOTIFY_IF_TIME: "notify_if_time_from_to",
STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: "notify_if_num_alerts_in_window",
STEP_REPEAT_ESCALATION_N_TIMES: "repeat_escalation",
STEP_DECLARE_INCIDENT: "declare_incident",
}
public_primary_key = models.CharField(

View file

@ -0,0 +1,48 @@
import typing
from urllib.parse import urljoin
from django.db import models
from common.constants.plugin_ids import PluginID
if typing.TYPE_CHECKING:
from django.db.models.manager import RelatedManager
from apps.alerts.models import AlertGroup, ChannelFilter
from apps.user_management.models import Organization
def get_incident_url(organization, incident_id) -> str:
    """Return the incident page URL, resolved against ``organization.grafana_url``.

    The path has the form ``a/<incident plugin id>/incidents/<incident_id>`` and
    is combined with the base URL using ``urllib.parse.urljoin``.
    """
    incident_path = f"a/{PluginID.INCIDENT}/incidents/{incident_id}"
    return urljoin(organization.grafana_url, incident_path)
class RelatedIncident(models.Model):
    """Incident tracked for an organization, with the alert groups attached to it."""

    # Annotations for type checkers; the fields themselves are declared below.
    attached_alert_groups: "RelatedManager['AlertGroup']"
    channel_filter: typing.Optional["ChannelFilter"]
    organization: "Organization"

    # Incident identifier; unique per organization (see Meta).
    incident_id = models.CharField(db_index=True, max_length=50)
    # Owning organization; related incidents are deleted together with it.
    organization = models.ForeignKey(
        "user_management.Organization",
        on_delete=models.CASCADE,
        related_name="related_incidents",
    )
    # Channel filter the incident is linked to; set to NULL if that filter is deleted.
    channel_filter = models.ForeignKey(
        "alerts.ChannelFilter",
        on_delete=models.SET_NULL,
        null=True,
        related_name="related_incidents",
    )
    created_at = models.DateTimeField(auto_now_add=True)
    # Defaults to True when the row is created.
    is_active = models.BooleanField(default=True)

    # Alert groups associated with this incident (reverse accessor: `related_incidents`).
    attached_alert_groups = models.ManyToManyField(
        "alerts.AlertGroup",
        related_name="related_incidents",
    )

    class Meta:
        unique_together = ("organization", "incident_id")

    def get_incident_link(self) -> str:
        """Return the URL of this incident as built by ``get_incident_url``."""
        return get_incident_url(self.organization, self.incident_id)

View file

@ -5,6 +5,7 @@ from .alert_group_web_title_cache import ( # noqa:F401
)
from .check_escalation_finished import check_escalation_finished_task # noqa: F401
from .custom_webhook_result import custom_webhook_result # noqa: F401
from .declare_incident import declare_incident # noqa: F401
from .delete_alert_group import delete_alert_group # noqa: F401
from .delete_alert_group import finish_delete_alert_group # noqa: F401
from .delete_alert_group import send_alert_group_signal_for_delete # noqa: F401

View file

@ -0,0 +1,148 @@
import logging
from django.conf import settings
from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
from common.incident_api.client import (
DEFAULT_INCIDENT_SEVERITY,
DEFAULT_INCIDENT_STATUS,
IncidentAPIClient,
IncidentAPIException,
)
logger = logging.getLogger(__name__)
# Caption passed as `attachCaption` when a new incident is created.
ATTACHMENT_CAPTION = "OnCall Alert Group"
# Error text searched for (case-insensitively) in incident API errors to detect an unknown severity.
ERROR_SEVERITY_NOT_FOUND = "Severity.FindOne: not found"
# Retry budget for the declare_incident task: 1 when settings.DEBUG is on, 10 otherwise.
MAX_RETRIES = 1 if settings.DEBUG else 10
# Below this number of attached alert groups, a new alert group is also added as an incident activity.
MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT = 5
def _attach_alert_group_to_incident(alert_group, incident_id, incident_title, escalation_policy, attached=False):
    """Associate ``alert_group`` with an incident and log the escalation step.

    The ``RelatedIncident`` row for (``incident_id``, the alert group's
    organization) is fetched or created; a newly created row gets the alert
    group's channel filter. The alert group is then added to the incident and
    an escalation-triggered log record carrying the incident id and title is
    created.

    Args:
        alert_group: alert group to associate.
        incident_id: identifier of the incident.
        incident_title: incident title stored in the log record.
        escalation_policy: escalation policy stored in the log record (may be None).
        attached: True when joining an existing incident, False when the
            incident was just declared; only affects the logged reason.
    """
    from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy, RelatedIncident

    related_incident, _created = RelatedIncident.objects.get_or_create(
        incident_id=incident_id,
        organization=alert_group.channel.organization,
        defaults={"channel_filter": alert_group.channel_filter},
    )
    related_incident.attached_alert_groups.add(alert_group)

    if attached:
        log_reason = "attached to existing incident"
    else:
        log_reason = "incident declared"

    AlertGroupLogRecord.objects.create(
        type=AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED,
        reason=log_reason,
        alert_group=alert_group,
        step_specific_info={"incident_id": incident_id, "incident_title": incident_title},
        escalation_policy=escalation_policy,
        escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
    )
def _create_error_log_record(alert_group, escalation_policy, reason=""):
    """Create an escalation-failed log record for the Declare Incident step.

    Args:
        alert_group: alert group the failure belongs to.
        escalation_policy: escalation policy stored in the record (may be None).
        reason: optional explanation stored in the record.
    """
    from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy

    record_fields = {
        "type": AlertGroupLogRecord.TYPE_ESCALATION_FAILED,
        "escalation_error_code": AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED,
        "reason": reason,
        "alert_group": alert_group,
        "escalation_policy": escalation_policy,
        "escalation_policy_step": EscalationPolicy.STEP_DECLARE_INCIDENT,
    }
    AlertGroupLogRecord.objects.create(**record_fields)
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
def declare_incident(alert_group_pk, escalation_policy_pk, severity=None):
    """Declare an incident for an alert group, or attach it to an active one.

    Flow:
      1. Alert groups without a route, or on a default route, are rejected
         with an escalation-failed log record.
      2. On the last allowed attempt a failure log record is written and the
         task gives up.
      3. If the route already has an active related incident that the
         incident API still reports with the default (active) status, the
         alert group is associated with it; an activity is added to the
         incident only while fewer than
         MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT alert groups are attached.
      4. Otherwise a new incident is created and associated.

    Args:
        alert_group_pk: primary key of the alert group.
        escalation_policy_pk: primary key of the escalation policy (may be falsy).
        severity: severity name, the "set from label" marker, or None to use
            the default severity.

    Raises:
        IncidentAPIException: re-raised for API errors that are not handled
            here, so the task is retried (NOTE(review): retry behavior comes
            from the task decorator's ``autoretry_for`` - confirm).
    """
    from apps.alerts.models import AlertGroup, EscalationPolicy, RelatedIncident

    alert_group = AlertGroup.objects.get(pk=alert_group_pk)
    organization = alert_group.channel.organization
    escalation_policy = None
    if escalation_policy_pk:
        escalation_policy = EscalationPolicy.objects.filter(pk=escalation_policy_pk).first()

    channel_filter = alert_group.channel_filter
    if channel_filter is None:
        # The route can be missing (the FK is nullable); without this guard the
        # attribute access below would raise on every retry and never log a failure.
        _create_error_log_record(
            alert_group, escalation_policy, reason="Alert group is not associated with any route"
        )
        return

    if channel_filter.is_default:
        _create_error_log_record(
            alert_group, escalation_policy, reason="Declare incident step is not enabled for default routes"
        )
        return

    if declare_incident.request.retries == MAX_RETRIES:
        # last allowed attempt: record the failure instead of trying again
        _create_error_log_record(alert_group, escalation_policy)
        return

    incident_client = IncidentAPIClient(organization.grafana_url, organization.api_token)

    # check for currently active related incident in the same route (channel_filter)
    existing_incident = (
        RelatedIncident.objects.filter(organization=organization, channel_filter=channel_filter, is_active=True)
        .order_by("-created_at")
        .first()
    )

    if existing_incident:
        incident_id = existing_incident.incident_id
        try:
            # get existing incident details
            incident_data, _ = incident_client.get_incident(incident_id)
        except IncidentAPIException as e:
            logger.error(f"Error getting incident details: {e.msg}")
            if e.status == 404:
                # incident not found, mark as not opened
                existing_incident.is_active = False
                existing_incident.save(update_fields=["is_active"])
            else:
                # raise (and retry)
                raise
        else:
            # incident exists, check if it is still active
            if incident_data["status"] == DEFAULT_INCIDENT_STATUS:
                # attach to incident context
                incident_title = incident_data["title"]
                num_attached = existing_incident.attached_alert_groups.count()
                if num_attached < MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT:
                    try:
                        incident_data, _ = incident_client.add_activity(incident_id, alert_group.web_link)
                    except IncidentAPIException as e:
                        # best effort: a failed activity update does not block the association
                        logger.error(f"Error attaching to existing incident: {e.msg}")
                # setup association between alert group and incident (even if not attached)
                _attach_alert_group_to_incident(
                    alert_group, incident_id, incident_title, escalation_policy, attached=True
                )
            else:
                existing_incident.is_active = False
                existing_incident.save(update_fields=["is_active"])

    if existing_incident is None or not existing_incident.is_active:
        # create new incident
        if severity == EscalationPolicy.SEVERITY_SET_FROM_LABEL:
            severity_label = alert_group.labels.filter(key_name="severity").first()
            severity = severity_label.value_name if severity_label else None
        severity = severity or DEFAULT_INCIDENT_SEVERITY
        try:
            incident_data, _ = incident_client.create_incident(
                alert_group.web_title_cache if alert_group.web_title_cache else DEFAULT_BACKUP_TITLE,
                severity=severity,
                attachCaption=ATTACHMENT_CAPTION,
                attachURL=alert_group.web_link,
            )
        except IncidentAPIException as e:
            logger.error(f"Error creating new incident: {e.msg}")
            if ERROR_SEVERITY_NOT_FOUND.lower() in e.msg.lower() and severity != DEFAULT_INCIDENT_SEVERITY:
                # invalid severity, retry with default severity
                declare_incident.apply_async(
                    args=(alert_group_pk, escalation_policy_pk),
                    kwargs={"severity": DEFAULT_INCIDENT_SEVERITY},
                )
                return
            # else raise (and retry)
            raise
        else:
            _attach_alert_group_to_incident(
                alert_group, incident_data["incidentID"], incident_data["title"], escalation_policy
            )

View file

@ -11,6 +11,7 @@ from apps.alerts.models import (
EscalationChain,
EscalationPolicy,
Invitation,
RelatedIncident,
ResolutionNote,
ResolutionNoteSlackMessage,
UserNotificationBundle,
@ -91,3 +92,8 @@ class InvitationFactory(factory.DjangoModelFactory):
class UserNotificationBundleFactory(factory.DjangoModelFactory):
    """Test factory producing UserNotificationBundle instances."""

    class Meta:
        model = UserNotificationBundle
class RelatedIncidentFactory(factory.DjangoModelFactory):
    """Test factory producing RelatedIncident instances."""

    class Meta:
        model = RelatedIncident

View file

@ -690,3 +690,52 @@ def test_notify_team_members(
(user_2.pk, alert_group.pk), expected_kwargs, immutable=True
)
assert mock_execute.signature.call_count == 2
@pytest.mark.django_db
def test_escalation_step_declare_incident(
    escalation_step_test_setup,
    make_escalation_policy,
):
    """The step schedules tasks when enabled and logs a failure when disabled."""
    _, _, _, channel_filter, alert_group, reason = escalation_step_test_setup
    flag_target = (
        "apps.alerts.escalation_snapshot.snapshot_classes.escalation_policy_snapshot.is_declare_incident_step_enabled"
    )

    step = make_escalation_policy(
        escalation_chain=channel_filter.escalation_chain,
        escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
    )
    snapshot = get_escalation_policy_snapshot_from_model(step)
    expected_eta = timezone.now() + timezone.timedelta(seconds=NEXT_ESCALATION_DELAY)
    tolerance = timezone.timedelta(seconds=15)

    # step enabled: tasks are executed and nothing is logged
    with patch.object(EscalationPolicySnapshot, "_execute_tasks") as mocked_execute_tasks, patch(
        flag_target, return_value=True
    ):
        result = snapshot.execute(alert_group, reason)

    expected_result = EscalationPolicySnapshot.StepExecutionResultData(
        eta=result.eta,
        stop_escalation=False,
        pause_escalation=False,
        start_from_beginning=False,
    )
    assert expected_eta + tolerance > result.eta > expected_eta - tolerance
    assert result == expected_result
    assert not alert_group.log_records.exists()
    mocked_execute_tasks.assert_called_once()

    # step disabled: no task is executed and a failure record is written
    with patch.object(EscalationPolicySnapshot, "_execute_tasks") as mocked_execute_tasks, patch(
        flag_target, return_value=False
    ):
        snapshot.execute(alert_group, reason)

    mocked_execute_tasks.assert_not_called()
    assert alert_group.log_records.exists()
    failure = alert_group.log_records.get()
    assert failure.type == AlertGroupLogRecord.TYPE_ESCALATION_FAILED
    assert (
        failure.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED
    )

View file

@ -0,0 +1,332 @@
from unittest.mock import patch
import httpretty
import pytest
from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy, RelatedIncident
from apps.alerts.tasks.declare_incident import (
ATTACHMENT_CAPTION,
DEFAULT_BACKUP_TITLE,
DEFAULT_INCIDENT_SEVERITY,
ERROR_SEVERITY_NOT_FOUND,
MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT,
declare_incident,
)
from common.incident_api.client import IncidentAPIException
@pytest.fixture
def setup_alert_group_and_escalation_step(
    make_organization,
    make_alert_receive_channel,
    make_alert_group,
    make_channel_filter,
    make_escalation_chain,
    make_escalation_policy,
):
    """Return a builder for an alert group routed through a Declare Incident step.

    The builder returns ``(alert_group, declare_incident_step, declared_incident)``;
    ``declared_incident`` is None unless ``already_declared_incident`` is set.
    """

    def _build(is_default_route=False, already_declared_incident=False):
        organization = make_organization(grafana_url="https://stack.grafana.net", api_token="token")
        alert_receive_channel = make_alert_receive_channel(organization=organization)
        escalation_chain = make_escalation_chain(organization)
        declare_incident_step = make_escalation_policy(
            escalation_chain=escalation_chain,
            escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT,
        )
        channel_filter = make_channel_filter(
            alert_receive_channel,
            escalation_chain=escalation_chain,
            is_default=is_default_route,
        )
        alert_group = make_alert_group(
            alert_receive_channel=alert_receive_channel,
            channel_filter=channel_filter,
        )

        if not already_declared_incident:
            return alert_group, declare_incident_step, None

        # pre-existing incident on the same route
        declared_incident = RelatedIncident.objects.create(
            incident_id="123",
            organization=organization,
            channel_filter=channel_filter,
        )
        return alert_group, declare_incident_step, declared_incident

    return _build
@pytest.mark.django_db
def test_declare_incident_default_route(setup_alert_group_and_escalation_step):
    """Default routes are rejected with an escalation-failed log record."""
    alert_group, step, _ = setup_alert_group_and_escalation_step(is_default_route=True)

    declare_incident(alert_group.pk, step.pk)

    alert_group.refresh_from_db()
    # check triggered log
    last_log = alert_group.log_records.last()
    assert last_log.type == last_log.TYPE_ESCALATION_FAILED
    assert last_log.escalation_policy == step
    assert last_log.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
    assert last_log.step_specific_info is None
    assert last_log.reason == "Declare incident step is not enabled for default routes"
    assert last_log.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_declare_incident_ok(setup_alert_group_and_escalation_step):
    """Without a previous incident, a new one is created, linked and logged."""
    alert_group, step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
    create_target = "common.incident_api.client.IncidentAPIClient.create_incident"
    api_response = ({"incidentID": "123", "title": "Incident"}, None)

    with patch(create_target, return_value=api_response) as mock_create_incident:
        declare_incident(alert_group.pk, step.pk)

    mock_create_incident.assert_called_with(
        DEFAULT_BACKUP_TITLE,
        severity=DEFAULT_INCIDENT_SEVERITY,
        attachCaption=ATTACHMENT_CAPTION,
        attachURL=alert_group.web_link,
    )

    alert_group.refresh_from_db()

    # check declared incident
    incident = alert_group.related_incidents.get()
    assert incident.incident_id == "123"
    assert incident.organization == alert_group.channel.organization
    assert incident.channel_filter == alert_group.channel_filter

    # check triggered log
    last_log = alert_group.log_records.last()
    assert last_log.type == last_log.TYPE_ESCALATION_TRIGGERED
    assert last_log.escalation_policy == step
    assert last_log.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
    assert last_log.step_specific_info == {"incident_id": "123", "incident_title": "Incident"}
    assert last_log.reason == "incident declared"
    assert last_log.escalation_error_code is None
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_declare_incident_set_severity(setup_alert_group_and_escalation_step):
    """An explicit severity is forwarded unchanged to the incident creation call."""
    alert_group, step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
    create_target = "common.incident_api.client.IncidentAPIClient.create_incident"
    api_response = ({"incidentID": "123", "title": "Incident"}, None)
    requested_severity = "critical"

    with patch(create_target, return_value=api_response) as mock_create_incident:
        declare_incident(alert_group.pk, step.pk, severity=requested_severity)

    mock_create_incident.assert_called_with(
        DEFAULT_BACKUP_TITLE,
        severity=requested_severity,
        attachCaption=ATTACHMENT_CAPTION,
        attachURL=alert_group.web_link,
    )
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_declare_incident_set_severity_from_label(setup_alert_group_and_escalation_step):
    """With the "set from label" marker, the severity label value is used."""
    alert_group, step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
    create_target = "common.incident_api.client.IncidentAPIClient.create_incident"
    api_response = ({"incidentID": "123", "title": "Incident"}, None)
    label_severity = "minor"

    # set alert group label
    alert_group.labels.create(
        organization=alert_group.channel.organization, key_name="severity", value_name=label_severity
    )

    with patch(create_target, return_value=api_response) as mock_create_incident:
        declare_incident(alert_group.pk, step.pk, severity=EscalationPolicy.SEVERITY_SET_FROM_LABEL)

    mock_create_incident.assert_called_with(
        DEFAULT_BACKUP_TITLE,
        severity=label_severity,
        attachCaption=ATTACHMENT_CAPTION,
        attachURL=alert_group.web_link,
    )
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_declare_incident_invalid_severity_fallback(setup_alert_group_and_escalation_step):
    """An unknown severity re-queues the task with the default severity."""
    alert_group, step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)
    create_target = "common.incident_api.client.IncidentAPIClient.create_incident"
    bad_severity = "INVALID"
    severity_error = IncidentAPIException(status=500, url="some-url", msg=ERROR_SEVERITY_NOT_FOUND)

    with patch(create_target, side_effect=severity_error) as mock_create_incident, patch.object(
        declare_incident, "apply_async"
    ) as mock_declare_incident_apply_async:
        declare_incident(alert_group.pk, step.pk, severity=bad_severity)

    # create call failing with invalid severity
    mock_create_incident.assert_called_with(
        DEFAULT_BACKUP_TITLE,
        severity=bad_severity,
        attachCaption=ATTACHMENT_CAPTION,
        attachURL=alert_group.web_link,
    )
    # new task is queued with default severity instead
    mock_declare_incident_apply_async.assert_called_with(
        args=(alert_group.pk, step.pk), kwargs={"severity": DEFAULT_INCIDENT_SEVERITY}
    )
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_declare_incident_attach_alert_group(setup_alert_group_and_escalation_step):
    """An alert group is attached to the route's still-active incident."""
    alert_group, step, open_incident = setup_alert_group_and_escalation_step(already_declared_incident=True)
    incident_id = open_incident.incident_id
    get_target = "common.incident_api.client.IncidentAPIClient.get_incident"
    activity_target = "common.incident_api.client.IncidentAPIClient.add_activity"

    with patch(get_target) as mock_get_incident, patch(activity_target) as mock_add_activity:
        mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None
        mock_add_activity.return_value = {"activityItemID": "111"}, None
        declare_incident(alert_group.pk, step.pk)

    # check declared incident
    assert open_incident.attached_alert_groups.filter(id=alert_group.id).exists()
    last_log = alert_group.log_records.last()
    assert last_log.type == last_log.TYPE_ESCALATION_TRIGGERED
    assert last_log.escalation_policy == step
    assert last_log.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
    assert last_log.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"}
    assert last_log.reason == "attached to existing incident"
    assert last_log.escalation_error_code is None
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_declare_incident_resolved_update(setup_alert_group_and_escalation_step):
    """A resolved incident is deactivated and a new incident is declared instead."""
    alert_group, step, open_incident = setup_alert_group_and_escalation_step(already_declared_incident=True)
    incident_id = open_incident.incident_id
    new_incident_id = "333"
    assert new_incident_id != incident_id
    get_target = "common.incident_api.client.IncidentAPIClient.get_incident"
    create_target = "common.incident_api.client.IncidentAPIClient.create_incident"
    resolved_response = (
        {
            "incidentID": incident_id,
            "title": "Incident1",
            "status": "resolved",
        },
        None,
    )

    with patch(get_target) as mock_get_incident, patch(create_target) as mock_create_incident:
        mock_get_incident.return_value = resolved_response
        mock_create_incident.return_value = {"incidentID": new_incident_id, "title": "Incident2"}, None
        declare_incident(alert_group.pk, step.pk)

    open_incident.refresh_from_db()
    assert open_incident.is_active is False

    # check declared incident
    assert not open_incident.attached_alert_groups.filter(id=alert_group.id).exists()
    assert alert_group.related_incidents.get().incident_id == new_incident_id
    last_log = alert_group.log_records.last()
    assert last_log.type == last_log.TYPE_ESCALATION_TRIGGERED
    assert last_log.escalation_policy == step
    assert last_log.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
    assert last_log.step_specific_info == {"incident_id": new_incident_id, "incident_title": "Incident2"}
    assert last_log.reason == "incident declared"
    assert last_log.escalation_error_code is None
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_declare_incident_attach_alert_group_skip_incident_update(
    setup_alert_group_and_escalation_step, make_alert_group
):
    """At the attachment limit the alert group is linked without adding an activity."""
    alert_group, step, open_incident = setup_alert_group_and_escalation_step(already_declared_incident=True)
    incident_id = open_incident.incident_id
    get_target = "common.incident_api.client.IncidentAPIClient.get_incident"
    activity_target = "common.incident_api.client.IncidentAPIClient.add_activity"

    # attach max alert groups to incident
    for _ in range(MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT):
        extra_group = make_alert_group(
            alert_receive_channel=alert_group.channel, channel_filter=alert_group.channel_filter
        )
        open_incident.attached_alert_groups.add(extra_group)

    with patch(get_target) as mock_get_incident, patch(activity_target) as mock_add_activity:
        mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None
        declare_incident(alert_group.pk, step.pk)

    assert not mock_add_activity.called

    # check declared incident
    assert open_incident.attached_alert_groups.filter(id=alert_group.id).exists()
    last_log = alert_group.log_records.last()
    assert last_log.type == last_log.TYPE_ESCALATION_TRIGGERED
    assert last_log.escalation_policy == step
    assert last_log.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
    assert last_log.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"}
    assert last_log.reason == "attached to existing incident"
    assert last_log.escalation_error_code is None
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_get_existing_incident_error(setup_alert_group_and_escalation_step):
    """Errors while fetching the existing incident.

    A 500 from the incident API is raised to the caller; a 404 leads to a new
    incident being created and related to the alert group.
    """
    alert_group, step, open_incident = setup_alert_group_and_escalation_step(already_declared_incident=True)
    get_target = "common.incident_api.client.IncidentAPIClient.get_incident"
    create_target = "common.incident_api.client.IncidentAPIClient.create_incident"

    server_error = IncidentAPIException(status=500, url="some-url")
    with patch(get_target, side_effect=server_error):
        with pytest.raises(IncidentAPIException):
            declare_incident(alert_group.pk, step.pk)

    # but if incident was not found, a new one should be created
    previous_id = open_incident.incident_id
    created_id = "333"
    assert created_id != previous_id

    not_found = IncidentAPIException(status=404, url="some-url")
    created_payload = {"incidentID": created_id, "title": "Incident"}
    with patch(get_target, side_effect=not_found), patch(create_target, return_value=(created_payload, None)):
        declare_incident(alert_group.pk, step.pk)

    alert_group.refresh_from_db()

    # check declared incident
    assert not open_incident.attached_alert_groups.filter(id=alert_group.id).exists()
    declared = alert_group.related_incidents.get()
    assert declared != open_incident
    assert declared.incident_id == created_id
    assert declared.organization == alert_group.channel.organization
    assert declared.channel_filter == alert_group.channel_filter
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_attach_alert_group_error(setup_alert_group_and_escalation_step):
    """A failing add_activity call does not prevent the DB attachment or the log record."""
    alert_group, step, open_incident = setup_alert_group_and_escalation_step(already_declared_incident=True)
    incident_id = open_incident.incident_id

    active_payload = {"incidentID": incident_id, "title": "Incident", "status": "active"}
    activity_error = IncidentAPIException(status=500, url="some-url")
    get_target = "common.incident_api.client.IncidentAPIClient.get_incident"
    activity_target = "common.incident_api.client.IncidentAPIClient.add_activity"
    with patch(get_target, return_value=(active_payload, None)), patch(activity_target, side_effect=activity_error):
        declare_incident(alert_group.pk, step.pk)

    alert_group.refresh_from_db()

    # incident attachment failed, but DB is still updated
    assert open_incident.attached_alert_groups.filter(id=alert_group.id).exists()

    record = alert_group.log_records.last()
    assert record.type == record.TYPE_ESCALATION_TRIGGERED
    assert record.escalation_policy == step
    assert record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT
    assert record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"}
    assert record.reason == "attached to existing incident"
    assert record.escalation_error_code is None
@pytest.mark.django_db
@httpretty.activate(verbose=True, allow_net_connect=False)
def test_create_incident_error(setup_alert_group_and_escalation_step):
    """An incident API error raised while creating the incident reaches the caller."""
    alert_group, step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False)

    create_target = "common.incident_api.client.IncidentAPIClient.create_incident"
    create_error = IncidentAPIException(status=500, url="some-url")
    with patch(create_target, side_effect=create_error):
        with pytest.raises(IncidentAPIException):
            declare_incident(alert_group.pk, step.pk)

View file

@ -1,3 +1,11 @@
import typing
from django.conf import settings
if typing.TYPE_CHECKING:
from apps.user_management.models import Organization
def render_relative_timeline(log_created_at, alert_group_started_at):
time_delta = log_created_at - alert_group_started_at
seconds = int(time_delta.total_seconds())
@ -12,3 +20,7 @@ def render_relative_timeline(log_created_at, alert_group_started_at):
return "%dm%ds" % (minutes, seconds)
else:
return "%ds" % (seconds,)
def is_declare_incident_step_enabled(organization: "Organization") -> bool:
    """Tell whether the declare incident escalation step can be used by ``organization``.

    Both the organization's ``is_grafana_incident_enabled`` flag and the
    ``FEATURE_DECLARE_INCIDENT_STEP_ENABLED`` setting must be truthy.
    """
    incident_enabled = organization.is_grafana_incident_enabled
    # the setting is only consulted when the organization flag is truthy
    return incident_enabled and settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED

View file

@ -3,6 +3,7 @@ from datetime import timedelta
from rest_framework import serializers
from apps.alerts.models import EscalationChain, EscalationPolicy
from apps.alerts.utils import is_declare_incident_step_enabled
from apps.schedules.models import OnCallSchedule
from apps.slack.models import SlackUserGroup
from apps.user_management.models import Team, User
@ -24,6 +25,7 @@ TO_TIME = "to_time"
NUM_ALERTS_IN_WINDOW = "num_alerts_in_window"
NUM_MINUTES_IN_WINDOW = "num_minutes_in_window"
CUSTOM_WEBHOOK_TRIGGER = "custom_webhook"
SEVERITY = "severity"
STEP_TYPE_TO_RELATED_FIELD_MAP = {
EscalationPolicy.STEP_WAIT: [WAIT_DELAY],
@ -35,6 +37,7 @@ STEP_TYPE_TO_RELATED_FIELD_MAP = {
EscalationPolicy.STEP_NOTIFY_IF_TIME: [FROM_TIME, TO_TIME],
EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: [NUM_ALERTS_IN_WINDOW, NUM_MINUTES_IN_WINDOW],
EscalationPolicy.STEP_TRIGGER_CUSTOM_WEBHOOK: [CUSTOM_WEBHOOK_TRIGGER],
EscalationPolicy.STEP_DECLARE_INCIDENT: [SEVERITY],
}
@ -81,6 +84,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer)
allow_null=True,
filter_field="organization",
)
severity = serializers.CharField(required=False, allow_null=True)
class Meta:
model = EscalationPolicy
@ -99,6 +103,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer)
"notify_schedule",
"notify_to_group",
"notify_to_team_members",
"severity",
"important",
]
@ -123,6 +128,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer)
NUM_ALERTS_IN_WINDOW,
NUM_MINUTES_IN_WINDOW,
CUSTOM_WEBHOOK_TRIGGER,
SEVERITY,
]
step = data.get("step")
@ -151,6 +157,8 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer)
raise serializers.ValidationError("Invalid step value")
if step_type in EscalationPolicy.SLACK_INTEGRATION_REQUIRED_STEPS and organization.slack_team_identity is None:
raise serializers.ValidationError("Invalid escalation step type: step is Slack-specific")
if step_type == EscalationPolicy.STEP_DECLARE_INCIDENT and not is_declare_incident_step_enabled(organization):
raise serializers.ValidationError("Invalid escalation step type: step is not enabled")
return step_type
def to_representation(self, instance):
@ -214,6 +222,7 @@ class EscalationPolicyUpdateSerializer(EscalationPolicySerializer):
NUM_ALERTS_IN_WINDOW,
NUM_MINUTES_IN_WINDOW,
CUSTOM_WEBHOOK_TRIGGER,
SEVERITY,
]
for f in STEP_TYPE_TO_RELATED_FIELD_MAP.get(step, []):

View file

@ -975,6 +975,37 @@ def test_get_filter_labels(
assert response.json()["results"][0]["pk"] == alert_groups[0].public_primary_key
@pytest.mark.django_db
def test_get_filter_by_related_incident(
    alert_group_internal_api_setup, make_related_incident, make_alert_group, make_user_auth_headers
):
    """The has_related_incident filter splits alert groups by incident attachment.

    Only the first alert group gets a related incident, so filtering by "true"
    returns one result and filtering by "false" returns three.
    """
    user, token, alert_groups = alert_group_internal_api_setup

    linked_group = alert_groups[0]
    incident = make_related_incident("1", linked_group.channel.organization, linked_group.channel_filter)
    incident.attached_alert_groups.add(linked_group)

    client = APIClient()
    url = reverse("api-internal:alertgroup-list")

    # (query string, expected number of alert groups)
    cases = (
        ("?has_related_incident=true", 1),
        ("?has_related_incident=false", 3),
    )
    for query, expected_count in cases:
        response = client.get(
            url + query,
            format="json",
            **make_user_auth_headers(user, token),
        )
        assert response.status_code == status.HTTP_200_OK
        assert len(response.data["results"]) == expected_count
@pytest.mark.django_db
def test_get_title_search(
settings,

View file

@ -10,6 +10,7 @@ from rest_framework.test import APIClient
from apps.alerts.models import EscalationPolicy
from apps.api.permissions import LegacyAccessControlRole
from common.incident_api.client import DEFAULT_INCIDENT_SEVERITY, IncidentAPIException
@pytest.fixture()
@ -651,8 +652,13 @@ def test_create_escalation_policy_with_no_important_version(
make_escalation_chain,
step,
make_user_auth_headers,
settings,
):
organization, user, _, _ = make_organization_and_user_with_slack_identities()
# make sure declare incident step is enabled
settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True
organization.is_grafana_incident_enabled = True
organization.save()
_, token = make_token_for_organization(organization)
escalation_chain = make_escalation_chain(organization)
@ -832,6 +838,7 @@ def test_escalation_policy_switch_importance(
"notify_schedule": None,
"notify_to_group": None,
"notify_to_team_members": None,
"severity": None,
"important": True,
"wait_delay": None,
}
@ -889,6 +896,7 @@ def test_escalation_policy_filter_by_user(
"notify_schedule": None,
"notify_to_group": None,
"notify_to_team_members": None,
"severity": None,
"important": False,
},
{
@ -906,6 +914,7 @@ def test_escalation_policy_filter_by_user(
"notify_schedule": None,
"notify_to_group": None,
"notify_to_team_members": None,
"severity": None,
"important": False,
},
]
@ -971,6 +980,7 @@ def test_escalation_policy_filter_by_slack_channel(
"notify_schedule": None,
"notify_to_group": None,
"notify_to_team_members": None,
"severity": None,
"important": False,
},
]
@ -1001,3 +1011,88 @@ def test_escalation_policy_escalation_options_webhooks(
returned_options = [option["value"] for option in response.json()]
assert EscalationPolicy.STEP_TRIGGER_CUSTOM_WEBHOOK in returned_options
@pytest.mark.django_db
def test_escalation_policy_severity_options(
    make_organization_and_user_with_plugin_token,
    make_user_auth_headers,
):
    """Severity options endpoint: API-provided list, fallback on error, label option.

    Covers three situations: labels disabled (only the severities returned by
    the incident API), a failing incident API request (default severity only),
    and labels enabled (the "set from label" option comes first).
    """
    organization, user, token = make_organization_and_user_with_plugin_token()
    organization.is_grafana_labels_enabled = False
    organization.save()

    client = APIClient()
    url = reverse("api-internal:escalation_policy-severity-options")
    severities_target = "common.incident_api.client.IncidentAPIClient.get_severities"

    available_severities = [
        {"severityID": "abc", "orgID": "1", "displayLabel": "Pending", "level": -1},
        {"severityID": "def", "orgID": "1", "displayLabel": "Critical", "level": 1},
    ]
    severity_options = [
        {"value": severity["displayLabel"], "display_name": severity["displayLabel"]}
        for severity in available_severities
    ]

    # without labels enabled
    with patch(severities_target, return_value=(available_severities, None)):
        response = client.get(url, format="json", **make_user_auth_headers(user, token))
    assert response.json() == severity_options

    # failing request does not break; fallback to default option only
    with patch(severities_target, side_effect=IncidentAPIException(status=404, url="some-url")):
        response = client.get(url, format="json", **make_user_auth_headers(user, token))
    assert response.json() == [{"value": DEFAULT_INCIDENT_SEVERITY, "display_name": DEFAULT_INCIDENT_SEVERITY}]

    # labels enabled
    organization.is_grafana_labels_enabled = True
    organization.save()
    with patch(severities_target, return_value=(available_severities, None)):
        response = client.get(url, format="json", **make_user_auth_headers(user, token))

    # include set from label option
    set_from_label_option = {
        "value": EscalationPolicy.SEVERITY_SET_FROM_LABEL,
        "display_name": EscalationPolicy.SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE,
    }
    assert response.json() == [set_from_label_option, *severity_options]
@pytest.mark.django_db
def test_create_escalation_policy_declare_incident(
    escalation_policy_internal_api_setup, make_user_auth_headers, settings
):
    """Internal API: a declare incident step can only be created once it is enabled.

    The first request is rejected with a 400; after turning on both the feature
    setting and the organization flag the step is created with its severity,
    and the detail endpoint returns the same step and severity.
    """
    token, escalation_chain, _, user, _ = escalation_policy_internal_api_setup
    organization = escalation_chain.organization

    client = APIClient()
    list_url = reverse("api-internal:escalation_policy-list")
    payload = {
        "step": EscalationPolicy.STEP_DECLARE_INCIDENT,
        "severity": "critical",
        "escalation_chain": escalation_chain.public_primary_key,
    }

    # rejected before the step is enabled
    rejected = client.post(list_url, payload, format="json", **make_user_auth_headers(user, token))
    assert rejected.status_code == status.HTTP_400_BAD_REQUEST

    # make sure declare incident step is enabled
    settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True
    organization.is_grafana_incident_enabled = True
    organization.save()

    created = client.post(list_url, payload, format="json", **make_user_auth_headers(user, token))
    assert created.status_code == status.HTTP_201_CREATED

    policy = EscalationPolicy.objects.get(public_primary_key=created.data["id"])
    assert policy.step == EscalationPolicy.STEP_DECLARE_INCIDENT
    assert policy.severity == "critical"

    detail_url = reverse("api-internal:escalation_policy-detail", kwargs={"pk": policy.public_primary_key})
    detail = client.get(detail_url, format="json", **make_user_auth_headers(user, token)).json()
    assert detail["step"] == EscalationPolicy.STEP_DECLARE_INCIDENT
    assert detail["severity"] == "critical"

View file

@ -17,6 +17,7 @@ from apps.alerts.constants import ActionSource
from apps.alerts.models import AlertGroup, AlertReceiveChannel, EscalationChain, ResolutionNote
from apps.alerts.paging import unpage_user
from apps.alerts.tasks import delete_alert_group, send_update_resolution_note_signal
from apps.alerts.utils import is_declare_incident_step_enabled
from apps.api.errors import AlertGroupAPIError
from apps.api.label_filtering import parse_label_query
from apps.api.permissions import RBACPermission
@ -120,6 +121,7 @@ class AlertGroupFilter(DateRangeFilterMixin, ModelFieldFilterMixin, filters.Filt
)
with_resolution_note = filters.BooleanFilter(method="filter_with_resolution_note")
mine = filters.BooleanFilter(method="filter_mine")
has_related_incident = filters.BooleanFilter(field_name="related_incidents", lookup_expr="isnull", exclude=True)
def filter_status(self, queryset, name, value):
if not value:
@ -719,6 +721,7 @@ class AlertGroupView(
"""
Retrieve a list of valid filter options that can be used to filter alert groups
"""
organization = self.request.auth.organization
api_root = "/api/internal/v1/"
default_day_range = 30
@ -804,7 +807,7 @@ class AlertGroupView(
filter_options = [{"name": "search", "type": "search", "description": description}] + filter_options
if is_labels_feature_enabled(self.request.auth.organization):
if is_labels_feature_enabled(organization):
filter_options.append(
{
"name": "label",
@ -813,6 +816,15 @@ class AlertGroupView(
}
)
if is_declare_incident_step_enabled(organization):
filter_options.append(
{
"name": "has_related_incident",
"type": "boolean",
"default": "true",
}
)
return Response(filter_options)
@extend_schema(

View file

@ -1,3 +1,5 @@
import logging
from django.conf import settings
from django.db.models import Q
from rest_framework.decorators import action
@ -5,6 +7,7 @@ from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from apps.alerts.models import EscalationPolicy
from apps.alerts.utils import is_declare_incident_step_enabled
from apps.api.permissions import RBACPermission
from apps.api.serializers.escalation_policy import (
EscalationPolicyCreateSerializer,
@ -19,9 +22,12 @@ from common.api_helpers.mixins import (
TeamFilteringMixin,
UpdateSerializerMixin,
)
from common.incident_api.client import DEFAULT_INCIDENT_SEVERITY, IncidentAPIClient, IncidentAPIException
from common.insight_log import EntityEvent, write_resource_insight_log
from common.ordered_model.viewset import OrderedModelViewSet
logger = logging.getLogger(__name__)
class EscalationPolicyView(
TeamFilteringMixin,
@ -42,6 +48,7 @@ class EscalationPolicyView(
"escalation_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ],
"delay_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ],
"num_minutes_in_window_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ],
"severity_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ],
"create": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE],
"update": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE],
"partial_update": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE],
@ -116,6 +123,7 @@ class EscalationPolicyView(
@action(detail=False, methods=["get"])
def escalation_options(self, request):
grafana_declare_incident_enabled = is_declare_incident_step_enabled(organization=self.request.auth.organization)
choices = []
for step in EscalationPolicy.INTERNAL_API_STEPS:
verbal = EscalationPolicy.INTERNAL_API_STEPS_TO_VERBAL_MAP[step]
@ -126,7 +134,7 @@ class EscalationPolicyView(
if slack_integration_required and not settings.FEATURE_SLACK_INTEGRATION_ENABLED:
continue
if step == EscalationPolicy.STEP_DECLARE_INCIDENT:
if step == EscalationPolicy.STEP_DECLARE_INCIDENT and not grafana_declare_incident_enabled:
continue
choices.append(
@ -155,3 +163,25 @@ class EscalationPolicyView(
{"value": choice[0], "display_name": choice[1]} for choice in EscalationPolicy.WEB_DURATION_CHOICES_MINUTES
]
return Response(choices)
@action(detail=False, methods=["get"])
def severity_options(self, request):
    """Return the severity choices for the declare incident step.

    When labels are enabled for the organization, the "set from label" option
    is listed first. The remaining options come from the incident API; if that
    request fails with IncidentAPIException, the error is logged and only the
    default severity is offered.
    """
    organization = self.request.auth.organization

    options = []
    if organization.is_grafana_labels_enabled:
        options.append(
            {
                "value": EscalationPolicy.SEVERITY_SET_FROM_LABEL,
                "display_name": EscalationPolicy.SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE,
            }
        )

    client = IncidentAPIClient(organization.grafana_url, organization.api_token)
    try:
        severities, _ = client.get_severities()
        options.extend(
            [{"value": severity["displayLabel"], "display_name": severity["displayLabel"]} for severity in severities]
        )
    except IncidentAPIException as e:
        logger.error(f"Error getting severities: {e.msg}")
        # fall back to the default severity so the endpoint still returns a usable option
        options.append({"value": DEFAULT_INCIDENT_SEVERITY, "display_name": DEFAULT_INCIDENT_SEVERITY})

    return Response(options)

View file

@ -5,6 +5,7 @@ from django.utils.functional import cached_property
from rest_framework import fields, serializers
from apps.alerts.models import EscalationChain, EscalationPolicy
from apps.alerts.utils import is_declare_incident_step_enabled
from apps.schedules.models import OnCallSchedule
from apps.slack.models import SlackUserGroup
from apps.user_management.models import Team, User
@ -72,6 +73,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer):
required=False,
source="custom_webhook",
)
severity = serializers.CharField(required=False)
important = serializers.BooleanField(required=False)
TIME_FORMAT = "%H:%M:%SZ"
@ -101,6 +103,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer):
"notify_if_time_to",
"num_alerts_in_window",
"num_minutes_in_window",
"severity",
]
PREFETCH_RELATED = ["notify_to_users_queue"]
@ -120,6 +123,9 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer):
if step_type == EscalationPolicy.STEP_FINAL_NOTIFYALL and organization.slack_team_identity is None:
raise BadRequest(detail="Invalid escalation step type: step is Slack-specific")
if step_type == EscalationPolicy.STEP_DECLARE_INCIDENT and not is_declare_incident_step_enabled(organization):
raise BadRequest("Invalid escalation step type: step is not enabled")
return step_type
def create(self, validated_data):
@ -163,6 +169,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer):
"notify_if_time_to",
"num_alerts_in_window",
"num_minutes_in_window",
"severity",
]
if step == EscalationPolicy.STEP_WAIT:
fields_to_remove.remove("duration")
@ -190,6 +197,8 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer):
elif step == EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW:
fields_to_remove.remove("num_alerts_in_window")
fields_to_remove.remove("num_minutes_in_window")
elif step == EscalationPolicy.STEP_DECLARE_INCIDENT:
fields_to_remove.remove("severity")
if (
step in EscalationPolicy.DEFAULT_TO_IMPORTANT_STEP_MAPPING
@ -213,6 +222,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer):
"to_time",
"num_alerts_in_window",
"num_minutes_in_window",
"severity",
]
step = validated_data.get("step")
important = validated_data.pop("important", None)
@ -243,6 +253,8 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer):
elif step == EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW:
validated_data_fields_to_remove.remove("num_alerts_in_window")
validated_data_fields_to_remove.remove("num_minutes_in_window")
elif step == EscalationPolicy.STEP_DECLARE_INCIDENT:
validated_data_fields_to_remove.remove("severity")
for field in validated_data_fields_to_remove:
validated_data.pop(field, None)
@ -299,5 +311,7 @@ class EscalationPolicyUpdateSerializer(EscalationPolicySerializer):
if step != EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW:
instance.num_alerts_in_window = None
instance.num_minutes_in_window = None
if step != EscalationPolicy.STEP_DECLARE_INCIDENT:
instance.severity = None
return super().update(instance, validated_data)

View file

@ -463,3 +463,43 @@ def test_update_escalation_policy_using_notify_team_members(
escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"])
serializer = EscalationPolicySerializer(escalation_policy)
assert response.data == serializer.data
@pytest.mark.django_db
def test_create_escalation_policy_declare_incident(
    make_organization_and_user_with_token,
    escalation_policies_setup,
    settings,
):
    """Public API: a declare incident step can only be created once it is enabled.

    The first request is rejected with a 400; after turning on both the feature
    setting and the organization flag the step is created with its severity,
    and the detail endpoint returns the public step type and the severity.
    """
    organization, user, token = make_organization_and_user_with_token()
    escalation_chain, _, _ = escalation_policies_setup(organization, user)

    payload = {
        "escalation_chain_id": escalation_chain.public_primary_key,
        "type": "declare_incident",
        "position": 0,
        "severity": "critical",
    }

    client = APIClient()
    list_url = reverse("api-public:escalation_policies-list")

    # rejected before the step is enabled
    rejected = client.post(list_url, data=payload, format="json", HTTP_AUTHORIZATION=token)
    assert rejected.status_code == status.HTTP_400_BAD_REQUEST

    # make sure declare incident step is enabled
    settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True
    organization.is_grafana_incident_enabled = True
    organization.save()

    created = client.post(list_url, data=payload, format="json", HTTP_AUTHORIZATION=token)
    assert created.status_code == status.HTTP_201_CREATED

    policy = EscalationPolicy.objects.get(public_primary_key=created.data["id"])
    assert policy.step == EscalationPolicy.STEP_DECLARE_INCIDENT
    assert policy.severity == "critical"

    detail_url = reverse("api-public:escalation_policies-detail", kwargs={"pk": policy.public_primary_key})
    detail = client.get(detail_url, format="json", HTTP_AUTHORIZATION=token).json()
    assert detail["type"] == EscalationPolicy.PUBLIC_STEP_CHOICES_MAP[EscalationPolicy.STEP_DECLARE_INCIDENT]
    assert detail["severity"] == "critical"

View file

@ -35,6 +35,7 @@ from apps.alerts.tests.factories import (
EscalationChainFactory,
EscalationPolicyFactory,
InvitationFactory,
RelatedIncidentFactory,
ResolutionNoteFactory,
ResolutionNoteSlackMessageFactory,
UserNotificationBundleFactory,
@ -1112,3 +1113,11 @@ def make_user_notification_bundle():
)
return _make_user_notification_bundle
@pytest.fixture
def make_related_incident():
    """Provide a factory that builds a related incident via RelatedIncidentFactory."""

    def _make_related_incident(incident_id, organization, channel_filter):
        factory_kwargs = {
            "incident_id": incident_id,
            "organization": organization,
            "channel_filter": channel_filter,
        }
        return RelatedIncidentFactory(**factory_kwargs)

    return _make_related_incident

View file

@ -75,6 +75,7 @@ FEATURE_LABELS_ENABLED_PER_ORG = getenv_list("FEATURE_LABELS_ENABLED_PER_ORG", d
FEATURE_ALERT_GROUP_SEARCH_ENABLED = getenv_boolean("FEATURE_ALERT_GROUP_SEARCH_ENABLED", default=True)
FEATURE_ALERT_GROUP_SEARCH_CUTOFF_DAYS = getenv_integer("FEATURE_ALERT_GROUP_SEARCH_CUTOFF_DAYS", default=None)
FEATURE_NOTIFICATION_BUNDLE_ENABLED = getenv_boolean("FEATURE_NOTIFICATION_BUNDLE_ENABLED", default=True)
FEATURE_DECLARE_INCIDENT_STEP_ENABLED = getenv_boolean("FEATURE_DECLARE_INCIDENT_STEP_ENABLED", default=False)
TWILIO_API_KEY_SID = os.environ.get("TWILIO_API_KEY_SID")
TWILIO_API_KEY_SECRET = os.environ.get("TWILIO_API_KEY_SECRET")

View file

@ -94,6 +94,7 @@ CELERY_TASK_ROUTES = {
# CRITICAL
"apps.alerts.tasks.acknowledge_reminder.acknowledge_reminder_task": {"queue": "critical"},
"apps.alerts.tasks.acknowledge_reminder.unacknowledge_timeout_task": {"queue": "critical"},
"apps.alerts.tasks.declare_incident.declare_incident": {"queue": "critical"},
"apps.alerts.tasks.distribute_alert.send_alert_create_signal": {"queue": "critical"},
"apps.alerts.tasks.escalate_alert_group.escalate_alert_group": {"queue": "critical"},
"apps.alerts.tasks.invite_user_to_join_incident.invite_user_to_join_incident": {"queue": "critical"},