diff --git a/CHANGELOG.md b/CHANGELOG.md index 110ee554..a3ef7363 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,27 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v1.1.41 (2023-03-21) + +### Added + +- Modified `check_escalation_finished_task` celery task to use read-only databases for its query, if one is defined + + make the validation logic stricter + ping a configurable heartbeat on successful completion of this task ([1266](https://github.com/grafana/oncall/pull/1266)) + +### Changed + +- Updated wording throughout plugin to use 'Alert Group' instead of 'Incident' ([1565](https://github.com/grafana/oncall/pull/1565), + [1576](https://github.com/grafana/oncall/pull/1576)) +- Check for enabled Telegram feature was added to ChatOps and to User pages ([319](https://github.com/grafana/oncall/issues/319)) +- Filtering for Editors/Admins was added to rotation form. 
It is not allowed to assign Viewer to rotation ([1124](https://github.com/grafana/oncall/issues/1124)) +- Modified search behaviour on the Escalation Chains page to allow for "partial searching" ([1578](https://github.com/grafana/oncall/pull/1578)) + +### Fixed + +- Fixed a few permission issues on the UI ([1448](https://github.com/grafana/oncall/pull/1448)) +- Fix resolution note rendering in Slack message threads where the Slack username was not + being properly rendered ([1561](https://github.com/grafana/oncall/pull/1561)) + ## v1.1.40 (2023-03-16) ### Fixed diff --git a/Makefile b/Makefile index 863a9566..55cc3611 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,10 @@ +help: + @sed \ + -e '/^[a-zA-Z0-9_\-]*:.*##/!d' \ + -e 's/:.*##\s*/:/' \ + -e 's/^\(.\+\):\(.*\)/$(shell tput setaf 6)\1$(shell tput sgr0):\2/' \ + $(MAKEFILE_LIST) | column -c2 -t -s : + DOCKER_COMPOSE_FILE = docker-compose-developer.yml DOCKER_COMPOSE_DEV_LABEL = com.grafana.oncall.env=dev @@ -62,7 +69,7 @@ define run_engine_docker_command endef # touch SQLITE_DB_FILE if it does not exist and DB is eqaul to SQLITE_PROFILE -start: +start: ## start all of the docker containers ifeq ($(DB),$(SQLITE_PROFILE)) @if [ ! 
-f $(SQLITE_DB_FILE) ]; then \ touch $(SQLITE_DB_FILE); \ @@ -71,7 +78,7 @@ endif $(call run_docker_compose_command,up --remove-orphans -d) -init: +init: ## build the frontend plugin code then run make start # if the oncall UI is to be run in docker we should do an initial build of the frontend code # this makes sure that it will be available when the grafana container starts up without the need to # restart the grafana container initially @@ -79,16 +86,17 @@ ifeq ($(findstring $(UI_PROFILE),$(COMPOSE_PROFILES)),$(UI_PROFILE)) cd grafana-plugin && yarn install && yarn build:dev endif -stop: +stop: ## stop all of the docker containers $(call run_docker_compose_command,down) -restart: +restart: ## restart all docker containers $(call run_docker_compose_command,restart) -build: +build: ## rebuild images (e.g. when changing requirements.txt) $(call run_docker_compose_command,build) -cleanup: stop +cleanup: stop ## this will remove all of the images, containers, volumes, and networks + ## associated with your local OnCall developer setup docker system prune --filter label="$(DOCKER_COMPOSE_DEV_LABEL)" --all --volumes install-pre-commit: @@ -99,38 +107,43 @@ install-pre-commit: echo "pre-commit already installed"; \ fi -lint: install-pre-commit +lint: install-pre-commit ## run both frontend and backend linters + ## may need to run `yarn install` from within `grafana-plugin` + ## to install several `pre-commit` dependencies + pre-commit run --all-files install-precommit-hook: install-pre-commit pre-commit install -test: +test: ## run backend tests $(call run_engine_docker_command,pytest) -start-celery-beat: +start-celery-beat: ## start celery beat $(call run_engine_docker_command,celery -A engine beat -l info) -purge-queues: +purge-queues: ## purge celery queues $(call run_engine_docker_command,celery -A engine purge -f) -shell: +shell: ## starts an OnCall engine Django shell $(call run_engine_docker_command,python manage.py shell) -dbshell: +dbshell: ## opens a DB shell
$(call run_engine_docker_command,python manage.py dbshell) -engine-manage: +engine-manage: ## run Django's `manage.py` script, inside of a docker container, passing `$CMD` as arguments. + ## e.g. `make engine-manage CMD="makemigrations"` + ## https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-makemigrations $(call run_engine_docker_command,python manage.py $(CMD)) -exec-engine: +exec-engine: ## exec into engine container's bash docker exec -it oncall_engine bash -_backend-debug-enable: +_backend-debug-enable: ## enable Django's debug mode and Silk profiling (this is disabled by default for performance reasons) $(shell ./dev/add_env_var.sh DEBUG True $(DEV_ENV_FILE)) $(shell ./dev/add_env_var.sh SILK_PROFILER_ENABLED True $(DEV_ENV_FILE)) -_backend-debug-disable: +_backend-debug-disable: ## disable Django's debug mode and Silk profiling $(shell ./dev/add_env_var.sh DEBUG False $(DEV_ENV_FILE)) $(shell ./dev/add_env_var.sh SILK_PROFILER_ENABLED False $(DEV_ENV_FILE)) @@ -164,5 +177,8 @@ run-backend-celery: backend-command: $(call backend_command,$(CMD)) -backend-manage-command: +backend-manage-command: ## run Django's `manage.py` script, passing `$CMD` as arguments. + ## e.g. 
`make backend-manage-command CMD="makemigrations"` + ## https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-makemigrations + ## alternatively you can open docker container with engine and run commands from there $(call backend_command,python manage.py $(CMD)) diff --git a/dev/.env.dev.example b/dev/.env.dev.example index 9350004d..07258b08 100644 --- a/dev/.env.dev.example +++ b/dev/.env.dev.example @@ -29,10 +29,11 @@ GRAFANA_INCIDENT_STATIC_API_KEY= GRAFANA_API_URL=http://localhost:3000 CELERY_WORKER_QUEUE="default,critical,long,slack,telegram,webhook,retry,celery" -CELERY_WORKER_CONCURRENCY=1 +CELERY_WORKER_CONCURRENCY=3 CELERY_WORKER_MAX_TASKS_PER_CHILD=100 CELERY_WORKER_SHUTDOWN_INTERVAL=65m CELERY_WORKER_BEAT_ENABLED=True +CELERY_WORKER_DEBUG_LOGS=False RABBITMQ_USERNAME=rabbitmq RABBITMQ_PASSWORD=rabbitmq diff --git a/dev/README.md b/dev/README.md index 3848dfbc..e2a31124 100644 --- a/dev/README.md +++ b/dev/README.md @@ -150,45 +150,14 @@ To run these tests locally simply do the following: npx playwright install # install playwright dependencies cp ./grafana-plugin/.env.example ./grafana-plugin/.env # you may need to tweak the values in ./grafana-plugin/.env according to your local setup +cd grafana-plugin yarn test:integration ``` ## Useful `make` commands See [`COMPOSE_PROFILES`](#compose_profiles) for more information on what this option is and how to configure it. - -```bash -make init # build the frontend plugin code then run make start -make start # start all of the docker containers -make stop # stop all of the docker containers -make restart # restart all docker containers -make build # rebuild images (e.g. when changing requirements.txt) -# run Django's `manage.py` script, inside of a docker container, passing `$CMD` as arguments. -# e.g. `make engine-manage CMD="makemigrations"` - https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-makemigrations -make engine-manage CMD="..." 
- -make backend-debug-enable # enable Django's debug mode and Silk profiling (this is disabled by default for performance reasons) -make backend-debug-disable # disable Django's debug mode and Silk profiling - -# this will remove all of the images, containers, volumes, and networks -# associated with your local OnCall developer setup -make cleanup - -make start-celery-beat # start celery beat -make purge-queues # purge celery queues -make shell # starts an OnCall engine Django shell -make dbshell # opens a DB shell -make exec-engine # exec into engine container's bash -make test # run backend tests - -# run Django's `manage.py` script, passing `$CMD` as arguments. -# e.g. `make backend-manage-command CMD="makemigrations"` - https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-makemigrations -make backend-manage-command CMD="..." - -# run both frontend and backend linters -# may need to run `yarn install` from within `grafana-plugin` to install several `pre-commit` dependencies -make lint -``` +> 🚶‍This part was moved to `make help` command. Run it to see all the available commands and their descriptions ## Setting environment variables diff --git a/docs/sources/open-source/_index.md b/docs/sources/open-source/_index.md index f13c3213..5d3f981d 100644 --- a/docs/sources/open-source/_index.md +++ b/docs/sources/open-source/_index.md @@ -243,7 +243,7 @@ The limit can be changed using env variables: ## Mobile application set up ->**Note**: This application is currently in beta +> **Note**: This application is currently in beta Grafana OnCall OSS users can use the mobile app to receive push notifications from OnCall. Grafana OnCall OSS relies on Grafana Cloud as on relay for push notifications. @@ -255,3 +255,29 @@ For Grafana OnCall OSS, the mobile app QR code includes an authentication token Your Grafana OnCall OSS instance should be reachable from the same network as your mobile device, preferably from the internet. 
For more information, see [Grafana OnCall mobile app]({{< relref "../mobile-app" >}}) + +## Alert Group Escalation Auditor + +Grafana OnCall has a periodic background task, which runs to check that all alert group escalations have finished +properly. This feature, if configured, can also ping an OnCall Webhook Integration's heartbeat URL, so that you can be +alerted, in the event that something goes wrong. + +Logs originating from the celery worker, for the `apps.alerts.tasks.check_escalation_finished.check_escalation_finished_task` +task, that reference an `AlertGroupEscalationPolicyExecutionAuditException` exception +indicate that the auditor periodic task is failing check(s) on one or more alert groups. Logs for this task which +mention `.. passed the audit checks` indicate that there were no issues with the escalation on the audited +alert groups. + +To configure this feature: + +1. Create a Webhook, or Formatted Webhook, Integration type. +1. Under the "Heartbeat" tab in the Integration modal, copy the unique heartbeat URL that is shown. +1. Set the heartbeat's expected time interval to 15 minutes (see note below regarding `ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_INTERVAL`) +1. Configure the integration's escalation chain as necessary +1. Populate the following env variables: + +- `ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL` - integration's unique heartbeat URL +- `ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_INTERVAL` - how often the auditor task should run. By default the + task runs every 13 minutes so we therefore recommend setting the heartbeat's expected time interval to 15 minutes. If you + would like to modify this, we recommend configuring this env variable to 1 or 2 minutes less than the value set for the + integration's heartbeat expected time interval.
diff --git a/engine/apps/alerts/escalation_snapshot/escalation_snapshot_mixin.py b/engine/apps/alerts/escalation_snapshot/escalation_snapshot_mixin.py index 1f9072ce..04c1cb8a 100644 --- a/engine/apps/alerts/escalation_snapshot/escalation_snapshot_mixin.py +++ b/engine/apps/alerts/escalation_snapshot/escalation_snapshot_mixin.py @@ -1,3 +1,4 @@ +import datetime import logging from typing import Optional @@ -5,19 +6,16 @@ import pytz from celery import uuid as celery_uuid from dateutil.parser import parse from django.apps import apps -from django.utils import timezone from django.utils.functional import cached_property from rest_framework.exceptions import ValidationError -from apps.alerts.constants import NEXT_ESCALATION_DELAY from apps.alerts.escalation_snapshot.snapshot_classes import ( ChannelFilterSnapshot, EscalationChainSnapshot, EscalationPolicySnapshot, EscalationSnapshot, ) -from apps.alerts.escalation_snapshot.utils import eta_for_escalation_step_notify_if_time -from apps.alerts.tasks import calculate_escalation_finish_time, escalate_alert_group +from apps.alerts.tasks import escalate_alert_group logger = logging.getLogger(__name__) @@ -90,8 +88,7 @@ class EscalationSnapshotMixin: 'next_step_eta': '2021-10-18T10:28:28.890369Z } """ - - escalation_snapshot = None + data = {} if self.escalation_chain_exists: channel_filter = self.channel_filter @@ -104,53 +101,7 @@ class EscalationSnapshotMixin: "escalation_policies_snapshots": escalation_policies, "slack_channel_id": self.slack_channel_id, } - escalation_snapshot = EscalationSnapshot.serializer(data).data - return escalation_snapshot - - def calculate_eta_for_finish_escalation(self, escalation_started=False, start_time=None): - if not self.escalation_snapshot: - return - EscalationPolicy = apps.get_model("alerts", "EscalationPolicy") - TOLERANCE_SECONDS = 1 - TOLERANCE_TIME = timezone.timedelta(seconds=NEXT_ESCALATION_DELAY + TOLERANCE_SECONDS) - start_time = start_time or timezone.now() # start time may 
be different for silenced incidents - wait_summ = timezone.timedelta() - # Get next_active_escalation_policy_order using flag `escalation_started` because this calculation can be - # started in parallel with escalation task where next_active_escalation_policy_order can be changed. - # That's why we are using `escalation_started` flag here, which means, that we want count eta from the first - # step. - next_escalation_policy_order = ( - self.escalation_snapshot.next_active_escalation_policy_order if escalation_started else 0 - ) - escalation_policies = self.escalation_snapshot.escalation_policies_snapshots[next_escalation_policy_order:] - for escalation_policy in escalation_policies: - if escalation_policy.step == EscalationPolicy.STEP_WAIT: - if escalation_policy.wait_delay is not None: - wait_summ += escalation_policy.wait_delay - else: - wait_summ += EscalationPolicy.DEFAULT_WAIT_DELAY # Default wait in case it's not selected yet - elif escalation_policy.step == EscalationPolicy.STEP_NOTIFY_IF_TIME: - if escalation_policy.from_time and escalation_policy.to_time: - estimate_start_time = start_time + wait_summ - STEP_TOLERANCE = timezone.timedelta(minutes=1) - next_step_estimate_start_time = eta_for_escalation_step_notify_if_time( - escalation_policy.from_time, - escalation_policy.to_time, - estimate_start_time + STEP_TOLERANCE, - ) - wait_summ += next_step_estimate_start_time - estimate_start_time - elif escalation_policy.step == EscalationPolicy.STEP_REPEAT_ESCALATION_N_TIMES: - # the part of escalation with repeat step will be passed six times: the first time plus five repeats - wait_summ *= EscalationPolicy.MAX_TIMES_REPEAT + 1 - elif escalation_policy.step == EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: - # In this case we cannot calculate finish time, so we return None - return - elif escalation_policy.step == EscalationPolicy.STEP_FINAL_RESOLVE: - break - wait_summ += TOLERANCE_TIME - - escalation_finish_time = start_time + wait_summ - return 
escalation_finish_time + return EscalationSnapshot.serializer(data).data @property def channel_filter_with_respect_to_escalation_snapshot(self): @@ -166,38 +117,52 @@ class EscalationSnapshotMixin: @cached_property def channel_filter_snapshot(self) -> Optional[ChannelFilterSnapshot]: - # in some cases we need only channel filter and don't want to serialize whole escalation - channel_filter_snapshot_object = None + """ + in some cases we need only channel filter and don't want to serialize whole escalation + """ escalation_snapshot = self.raw_escalation_snapshot - if escalation_snapshot is not None: - channel_filter_snapshot = ChannelFilterSnapshot.serializer().to_internal_value( - escalation_snapshot["channel_filter_snapshot"] - ) - channel_filter_snapshot_object = ChannelFilterSnapshot(**channel_filter_snapshot) - return channel_filter_snapshot_object + if not escalation_snapshot: + return None + + channel_filter_snapshot = escalation_snapshot["channel_filter_snapshot"] + if not channel_filter_snapshot: + return None + + channel_filter_snapshot = ChannelFilterSnapshot.serializer().to_internal_value(channel_filter_snapshot) + return ChannelFilterSnapshot(**channel_filter_snapshot) @cached_property def escalation_chain_snapshot(self) -> Optional[EscalationChainSnapshot]: - # in some cases we need only escalation chain and don't want to serialize whole escalation + """ + in some cases we need only escalation chain and don't want to serialize whole escalation escalation_chain_snapshot_object = None + """ escalation_snapshot = self.raw_escalation_snapshot - if escalation_snapshot is not None: - escalation_chain_snapshot = EscalationChainSnapshot.serializer().to_internal_value( - escalation_snapshot["escalation_chain_snapshot"] - ) - escalation_chain_snapshot_object = EscalationChainSnapshot(**escalation_chain_snapshot) - return escalation_chain_snapshot_object + if not escalation_snapshot: + return None + + escalation_chain_snapshot = 
escalation_snapshot["escalation_chain_snapshot"] + if not escalation_chain_snapshot: + return None + + escalation_chain_snapshot = EscalationChainSnapshot.serializer().to_internal_value(escalation_chain_snapshot) + return EscalationChainSnapshot(**escalation_chain_snapshot) @cached_property def escalation_snapshot(self) -> Optional[EscalationSnapshot]: - escalation_snapshot_object = None raw_escalation_snapshot = self.raw_escalation_snapshot - if raw_escalation_snapshot is not None: + if raw_escalation_snapshot: try: - escalation_snapshot_object = self._deserialize_escalation_snapshot(raw_escalation_snapshot) + return self._deserialize_escalation_snapshot(raw_escalation_snapshot) except ValidationError as e: logger.error(f"Error trying to deserialize raw escalation snapshot: {e}") - return escalation_snapshot_object + return None + + @cached_property + def has_escalation_policies_snapshots(self) -> bool: + if not self.raw_escalation_snapshot: + return False + return len(self.raw_escalation_snapshot["escalation_policies_snapshots"]) > 0 def _deserialize_escalation_snapshot(self, raw_escalation_snapshot) -> EscalationSnapshot: """ @@ -225,20 +190,34 @@ class EscalationSnapshotMixin: return escalation_snapshot_object @property - def escalation_chain_exists(self): - return not self.pause_escalation and self.channel_filter and self.channel_filter.escalation_chain + def escalation_chain_exists(self) -> bool: + if self.pause_escalation: + return False + elif not self.channel_filter: + return False + return self.channel_filter.escalation_chain is not None @property - def pause_escalation(self): - # get pause_escalation field directly to avoid serialization overhead - return self.raw_escalation_snapshot is not None and self.raw_escalation_snapshot.get("pause_escalation", False) + def pause_escalation(self) -> bool: + """ + get pause_escalation field directly to avoid serialization overhead + """ + if not self.raw_escalation_snapshot: + return False + return 
self.raw_escalation_snapshot.get("pause_escalation", False) @property - def next_step_eta(self): - # get next_step_eta field directly to avoid serialization overhead - raw_next_step_eta = ( - self.raw_escalation_snapshot.get("next_step_eta") if self.raw_escalation_snapshot is not None else None - ) + def next_step_eta(self) -> Optional[datetime.datetime]: + """ + get next_step_eta field directly to avoid serialization overhead + """ + if not self.raw_escalation_snapshot: + return None + + raw_next_step_eta = self.raw_escalation_snapshot.get("next_step_eta") + if not raw_next_step_eta: + return None + if raw_next_step_eta: return parse(raw_next_step_eta).replace(tzinfo=pytz.UTC) @@ -272,13 +251,10 @@ class EscalationSnapshotMixin: is_escalation_finished=False, raw_escalation_snapshot=raw_escalation_snapshot, ) - if not self.pause_escalation: - calculate_escalation_finish_time.apply_async((self.pk,), immutable=True) escalate_alert_group.apply_async((self.pk,), countdown=countdown, immutable=True, eta=eta, task_id=task_id) def stop_escalation(self): self.is_escalation_finished = True - self.estimate_escalation_finish_time = None # change active_escalation_id to prevent alert escalation self.active_escalation_id = "intentionally_stopped" - self.save(update_fields=["is_escalation_finished", "estimate_escalation_finish_time", "active_escalation_id"]) + self.save(update_fields=["is_escalation_finished", "active_escalation_id"]) diff --git a/engine/apps/alerts/escalation_snapshot/serializers/escalation_snapshot.py b/engine/apps/alerts/escalation_snapshot/serializers/escalation_snapshot.py index fbe15be0..f2b5fb4d 100644 --- a/engine/apps/alerts/escalation_snapshot/serializers/escalation_snapshot.py +++ b/engine/apps/alerts/escalation_snapshot/serializers/escalation_snapshot.py @@ -8,11 +8,11 @@ from apps.alerts.escalation_snapshot.serializers import ( class EscalationSnapshotSerializer(serializers.Serializer): - channel_filter_snapshot = ChannelFilterSnapshotSerializer() - 
escalation_chain_snapshot = EscalationChainSnapshotSerializer() + channel_filter_snapshot = ChannelFilterSnapshotSerializer(allow_null=True, default=None) + escalation_chain_snapshot = EscalationChainSnapshotSerializer(allow_null=True, default=None) last_active_escalation_policy_order = serializers.IntegerField(allow_null=True, default=None) - escalation_policies_snapshots = EscalationPolicySnapshotSerializer(many=True) - slack_channel_id = serializers.CharField(allow_null=True) + escalation_policies_snapshots = EscalationPolicySnapshotSerializer(many=True, default=list) + slack_channel_id = serializers.CharField(allow_null=True, default=None) pause_escalation = serializers.BooleanField(allow_null=True, default=False) next_step_eta = serializers.DateTimeField(allow_null=True, default=None) diff --git a/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_snapshot.py b/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_snapshot.py index d4845c57..1997408d 100644 --- a/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_snapshot.py +++ b/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_snapshot.py @@ -1,15 +1,23 @@ import logging -from typing import Optional +import typing from celery.utils.log import get_task_logger +from django.utils import timezone from apps.alerts.escalation_snapshot.serializers import EscalationSnapshotSerializer -from apps.alerts.escalation_snapshot.snapshot_classes.escalation_policy_snapshot import EscalationPolicySnapshot from apps.alerts.models.alert_group_log_record import AlertGroupLogRecord logger = get_task_logger(__name__) logger.setLevel(logging.DEBUG) +if typing.TYPE_CHECKING: + from apps.alerts.escalation_snapshot.snapshot_classes import ( + ChannelFilterSnapshot, + EscalationChainSnapshot, + EscalationPolicySnapshot, + ) + from apps.alerts.models import AlertGroup + class EscalationSnapshot: __slots__ = ( @@ -28,34 +36,34 @@ class EscalationSnapshot: def __init__( self, 
- alert_group, - channel_filter_snapshot, - escalation_chain_snapshot, - last_active_escalation_policy_order, - escalation_policies_snapshots, - slack_channel_id, - pause_escalation, - next_step_eta, + alert_group: "AlertGroup", + channel_filter_snapshot: "ChannelFilterSnapshot", + escalation_chain_snapshot: "EscalationChainSnapshot", + last_active_escalation_policy_order: int, + escalation_policies_snapshots: typing.List["EscalationPolicySnapshot"], + slack_channel_id: str, + pause_escalation: bool, + next_step_eta: typing.Optional[str], ): self.alert_group = alert_group - self.channel_filter_snapshot = channel_filter_snapshot # ChannelFilterSnapshot object - self.escalation_chain_snapshot = escalation_chain_snapshot # EscalationChainSnapshot object + self.channel_filter_snapshot = channel_filter_snapshot + self.escalation_chain_snapshot = escalation_chain_snapshot self.last_active_escalation_policy_order = last_active_escalation_policy_order - self.escalation_policies_snapshots = escalation_policies_snapshots # list of EscalationPolicySnapshot objects + self.escalation_policies_snapshots = escalation_policies_snapshots self.slack_channel_id = slack_channel_id self.pause_escalation = pause_escalation self.next_step_eta = next_step_eta self.stop_escalation = False @property - def last_active_escalation_policy_snapshot(self) -> Optional[EscalationPolicySnapshot]: + def last_active_escalation_policy_snapshot(self) -> typing.Optional["EscalationPolicySnapshot"]: order = self.last_active_escalation_policy_order if order is None: return None return self.escalation_policies_snapshots[order] @property - def next_active_escalation_policy_snapshot(self) -> Optional[EscalationPolicySnapshot]: + def next_active_escalation_policy_snapshot(self) -> typing.Optional["EscalationPolicySnapshot"]: order = self.next_active_escalation_policy_order if len(self.escalation_policies_snapshots) < order + 1: next_link = None @@ -71,6 +79,31 @@ class EscalationSnapshot: next_order = 
self.last_active_escalation_policy_order + 1 return next_order + @property + def executed_escalation_policy_snapshots(self) -> typing.List["EscalationPolicySnapshot"]: + """ + Returns a list of escalation policy snapshots that have already been executed, according + to the value of last_active_escalation_policy_order + """ + if self.last_active_escalation_policy_order is None: + return [] + elif self.last_active_escalation_policy_order == 0: + return [self.escalation_policies_snapshots[0]] + return self.escalation_policies_snapshots[: self.last_active_escalation_policy_order] + + def next_step_eta_is_valid(self) -> typing.Union[None, bool]: + """ + `next_step_eta` should never be less than the current time (with a 5 minute buffer provided) + as this field should be updated as the escalation policy is executed over time. If it is, this means that + an escalation policy step has been missed, or is substantially delayed + + if `next_step_eta` is `None` then `None` is returned, otherwise a boolean is returned + representing the result of the time comparision + """ + if self.next_step_eta is None: + return None + return self.next_step_eta > (timezone.now() - timezone.timedelta(minutes=5)) + def save_to_alert_group(self) -> None: self.alert_group.raw_escalation_snapshot = self.convert_to_dict() self.alert_group.save(update_fields=["raw_escalation_snapshot"]) @@ -83,7 +116,6 @@ class EscalationSnapshot: Executes actual escalation step and saves result of execution like stop_escalation param and eta, that will be used for start next escalate_alert_group task. Also updates self.last_active_escalation_policy_order if escalation step was executed. 
- :return: None """ escalation_policy_snapshot = self.next_active_escalation_policy_snapshot if escalation_policy_snapshot is None: diff --git a/engine/apps/alerts/incident_log_builder/incident_log_builder.py b/engine/apps/alerts/incident_log_builder/incident_log_builder.py index f3f55d58..1b204bae 100644 --- a/engine/apps/alerts/incident_log_builder/incident_log_builder.py +++ b/engine/apps/alerts/incident_log_builder/incident_log_builder.py @@ -134,7 +134,7 @@ class IncidentLogBuilder: # check if escalation snapshot wasn't saved and channel filter was deleted. # We cannot generate escalation plan in this case escalation_snapshot = self.alert_group.escalation_snapshot - if escalation_snapshot is None: + if not self.alert_group.has_escalation_policies_snapshots: return escalation_plan_dict if self.alert_group.silenced_until: diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index bf58d3d3..7d4c4c95 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -13,6 +13,7 @@ from django.db import IntegrityError, models, transaction from django.db.models import JSONField, Q, QuerySet from django.utils import timezone from django.utils.functional import cached_property +from django_deprecate_fields import deprecate_field from apps.alerts.escalation_snapshot import EscalationSnapshotMixin from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE @@ -336,7 +337,9 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
maintenance_uuid = models.CharField(max_length=100, unique=True, null=True, default=None) raw_escalation_snapshot = JSONField(null=True, default=None) - estimate_escalation_finish_time = models.DateTimeField(null=True, default=None) + + # THIS FIELD IS DEPRECATED AND SHOULD EVENTUALLY BE REMOVED + estimate_escalation_finish_time = deprecate_field(models.DateTimeField(null=True, default=None)) # This field is used for constraints so we can use get_or_create() in concurrent calls # https://docs.djangoproject.com/en/3.2/ref/models/querysets/#get-or-create @@ -1464,14 +1467,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. def start_unsilence_task(self, countdown): task_id = celery_uuid() self.unsilence_task_uuid = task_id - - # recalculate finish escalation time - escalation_start_time = timezone.now() + timezone.timedelta(seconds=countdown) - self.estimate_escalation_finish_time = self.calculate_eta_for_finish_escalation( - start_time=escalation_start_time - ) - - self.save(update_fields=["unsilence_task_uuid", "estimate_escalation_finish_time"]) + self.save(update_fields=["unsilence_task_uuid"]) unsilence_task.apply_async((self.pk,), task_id=task_id, countdown=countdown) @property diff --git a/engine/apps/alerts/tasks/__init__.py b/engine/apps/alerts/tasks/__init__.py index 0ccc5b70..70146f9c 100644 --- a/engine/apps/alerts/tasks/__init__.py +++ b/engine/apps/alerts/tasks/__init__.py @@ -3,7 +3,6 @@ from .alert_group_web_title_cache import ( # noqa:F401 update_web_title_cache, update_web_title_cache_for_alert_receive_channel, ) -from .calculcate_escalation_finish_time import calculate_escalation_finish_time # noqa from .call_ack_url import call_ack_url # noqa: F401 from .check_escalation_finished import check_escalation_finished_task # noqa: F401 from .create_contact_points_for_datasource import create_contact_points_for_datasource # noqa: F401 diff --git a/engine/apps/alerts/tasks/calculcate_escalation_finish_time.py 
b/engine/apps/alerts/tasks/calculcate_escalation_finish_time.py deleted file mode 100644 index ef1483c1..00000000 --- a/engine/apps/alerts/tasks/calculcate_escalation_finish_time.py +++ /dev/null @@ -1,15 +0,0 @@ -from django.apps import apps -from django.conf import settings - -from common.custom_celery_tasks import shared_dedicated_queue_retry_task - - -@shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None -) -def calculate_escalation_finish_time(alert_group_pk): - AlertGroup = apps.get_model("alerts", "AlertGroup") - alert_group = AlertGroup.all_objects.filter(pk=alert_group_pk)[0] - if alert_group.escalation_snapshot: - alert_group.estimate_escalation_finish_time = alert_group.calculate_eta_for_finish_escalation() - alert_group.save(update_fields=["estimate_escalation_finish_time"]) diff --git a/engine/apps/alerts/tasks/check_escalation_finished.py b/engine/apps/alerts/tasks/check_escalation_finished.py index 64f44d3d..c382aa5f 100644 --- a/engine/apps/alerts/tasks/check_escalation_finished.py +++ b/engine/apps/alerts/tasks/check_escalation_finished.py @@ -1,48 +1,148 @@ +import datetime +import typing + +import requests +from celery import shared_task from django.apps import apps from django.conf import settings from django.db.models import Q from django.utils import timezone from apps.alerts.tasks.task_logger import task_logger -from common.custom_celery_tasks import shared_dedicated_queue_retry_task +from common.database import get_random_readonly_database_key_if_present_otherwise_default + +if typing.TYPE_CHECKING: + from apps.alerts.models.alert_group import AlertGroup -@shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None, default_retry_delay=60 -) +class AlertGroupEscalationPolicyExecutionAuditException(BaseException): + """This exception is raised when an alert group's escalation policy did not execute 
properly for some reason""" + + +def send_alert_group_escalation_auditor_task_heartbeat() -> None: + heartbeat_url = settings.ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL + if heartbeat_url: + task_logger.info(f"Sending heartbeat to configured URL: {heartbeat_url}") + requests.get(heartbeat_url).raise_for_status() + task_logger.info(f"Heartbeat successfully sent to {heartbeat_url}") + else: + task_logger.info(f"Skipping sending heartbeat as no heartbeat URL is configured") + + +def audit_alert_group_escalation(alert_group: "AlertGroup") -> None: + escalation_snapshot = alert_group.escalation_snapshot + alert_group_id = alert_group.id + base_msg = f"Alert group {alert_group_id}" + + if not escalation_snapshot: + raise AlertGroupEscalationPolicyExecutionAuditException( + f"{base_msg} does not have an escalation snapshot associated with it, this should never occur" + ) + task_logger.info(f"{base_msg} has an escalation snapshot associated with it, auditing if it executed properly") + + escalation_policies_snapshots = escalation_snapshot.escalation_policies_snapshots + + if not escalation_policies_snapshots: + task_logger.info( + f"{base_msg}'s escalation snapshot has an empty escalation_policies_snapshots, skipping further validation" + ) + return + task_logger.info( + f"{base_msg}'s escalation snapshot has a populated escalation_policies_snapshots, continuing validation" + ) + + if escalation_snapshot.next_step_eta_is_valid() is False: + raise AlertGroupEscalationPolicyExecutionAuditException( + f"{base_msg}'s escalation snapshot does not have a valid next_step_eta: {escalation_snapshot.next_step_eta}" + ) + task_logger.info(f"{base_msg}'s escalation snapshot has a valid next_step_eta: {escalation_snapshot.next_step_eta}") + + executed_escalation_policy_snapshots = escalation_snapshot.executed_escalation_policy_snapshots + num_of_executed_escalation_policy_snapshots = len(executed_escalation_policy_snapshots) + + if 
num_of_executed_escalation_policy_snapshots == 0: + task_logger.info( + f"{base_msg}'s escalation snapshot does not have any executed escalation policies, skipping further validation" + ) + else: + task_logger.info( + f"{base_msg}'s escalation snapshot has {num_of_executed_escalation_policy_snapshots} executed escalation policies" + ) + + # TODO: consider adding the below checks later on. This is a bit trickier to properly audit as the + # number of log records can vary if there are any STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW or + # STEP_REPEAT_ESCALATION_N_TIMES escalation policy steps in the escalation chain + # see conversations in the original PR (https://github.com/grafana/oncall/pull/1266) for more context on this + # + # compare number of triggered/failed alert group log records to the number of executed + # escalation policy snapshot steps + # num_of_relevant_log_records = AlertGroupLogRecord.objects.filter( + # alert_group_id=alert_group_id, + # type__in=[AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, AlertGroupLogRecord.TYPE_ESCALATION_FAILED], + # ).count() + + # if num_of_relevant_log_records < num_of_executed_escalation_policy_snapshots: + # raise AlertGroupEscalationPolicyExecutionAuditException( + # f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is less " + # f"than the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})" + # ) + + # task_logger.info( + # f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is greater " + # f"than or equal to the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})" + # ) + + task_logger.info(f"{base_msg} passed the audit checks") + + +def get_auditable_alert_groups_started_at_range() -> typing.Tuple[datetime.datetime, datetime.datetime]: + """ + NOTE: this started_at__range is a bit of a hack.. 
+ we wanted to avoid performing a migration on the alerts_alertgroup table to update + alert groups where raw_escalation_snapshot was None. raw_escalation_snapshot being None is a legitimate case, + where the alert group's integration does not have an escalation chain associated with it. + + However, we wanted a way to be able to differentiate between "actually None" and "there was an error writing to + raw_escalation_snapshot" (as this is performed async by a celery task). + + This field was updated, in the commit that added this comment, to no longer be set to None by default. + As part of this celery task we do a check that this field is in fact not None, so if we were to check older + alert groups, whose integration did not have an escalation chain at the time the alert group was created + we would raise errors + """ + return (datetime.datetime(2023, 3, 25), timezone.now() - timezone.timedelta(days=2)) + + +# don't retry this task as the AlertGroup DB query is rather expensive +@shared_task def check_escalation_finished_task(): - """ - This task periodically checks if there are no alert groups with not finished escalations. 
- TODO: QA this properly, check if new type of escalations had been added - """ AlertGroup = apps.get_model("alerts", "AlertGroup") AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel") - CHECKING_TOLERANCE = timezone.timedelta(minutes=5) - CHECKING_TIME = timezone.now() - CHECKING_TOLERANCE - - alert_groups = AlertGroup.all_objects.filter( + alert_groups = AlertGroup.all_objects.using(get_random_readonly_database_key_if_present_otherwise_default()).filter( ~Q(channel__integration=AlertReceiveChannel.INTEGRATION_MAINTENANCE), ~Q(silenced=True, silenced_until__isnull=True), # filter silenced forever alert_groups is_escalation_finished=False, resolved=False, acknowledged=False, root_alert_group=None, - estimate_escalation_finish_time__lte=CHECKING_TIME, + started_at__range=get_auditable_alert_groups_started_at_range(), ) if not alert_groups.exists(): - return + task_logger.info("There are no alert groups to audit, everything is good :)") - exception_template = "Escalation for alert_group {} is not finished at expected time {}, now {}" + alert_group_ids_that_failed_audit: typing.List[str] = [] - now = timezone.now() - exception_text = "\n".join( - exception_template.format(alert_group.pk, alert_group.estimate_escalation_finish_time, now) - for alert_group in alert_groups - ) + for alert_group in alert_groups: + try: + audit_alert_group_escalation(alert_group) + except AlertGroupEscalationPolicyExecutionAuditException: + alert_group_ids_that_failed_audit.append(str(alert_group.id)) - ids = alert_groups.values_list("pk", flat=True) - task_logger.debug(ids) + if alert_group_ids_that_failed_audit: + raise AlertGroupEscalationPolicyExecutionAuditException( + f"The following alert group id(s) failed auditing: {', '.join(alert_group_ids_that_failed_audit)}" + ) - raise Exception(exception_text) + send_alert_group_escalation_auditor_task_heartbeat() diff --git a/engine/apps/alerts/tests/conftest.py b/engine/apps/alerts/tests/conftest.py index 
2550fdef..6038a091 100644 --- a/engine/apps/alerts/tests/conftest.py +++ b/engine/apps/alerts/tests/conftest.py @@ -1,6 +1,9 @@ +import datetime + import pytest from apps.alerts.incident_appearance.templaters import AlertSlackTemplater +from apps.alerts.models import EscalationPolicy @pytest.fixture() @@ -9,3 +12,51 @@ def mock_alert_renderer_render_for(monkeypatch): return "invalid_render_for" monkeypatch.setattr(AlertSlackTemplater, "_render_for", mock_render_for) + + +@pytest.fixture() +def escalation_snapshot_test_setup( + make_organization_and_user, + make_user_for_organization, + make_alert_receive_channel, + make_channel_filter, + make_escalation_chain, + make_escalation_policy, + make_alert_group, +): + organization, user_1 = make_organization_and_user() + user_2 = make_user_for_organization(organization) + + alert_receive_channel = make_alert_receive_channel(organization) + + escalation_chain = make_escalation_chain(organization) + channel_filter = make_channel_filter( + alert_receive_channel, + escalation_chain=escalation_chain, + notification_backends={"BACKEND": {"channel_id": "abc123"}}, + ) + + notify_to_multiple_users_step = make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS, + ) + notify_to_multiple_users_step.notify_to_users_queue.set([user_1, user_2]) + wait_step = make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_WAIT, + wait_delay=EscalationPolicy.FIFTEEN_MINUTES, + ) + # random time for test + from_time = datetime.time(10, 30) + to_time = datetime.time(18, 45) + notify_if_time_step = make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_NOTIFY_IF_TIME, + from_time=from_time, + to_time=to_time, + ) + + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + 
alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + alert_group.save() + return alert_group, notify_to_multiple_users_step, wait_step, notify_if_time_step diff --git a/engine/apps/alerts/tests/test_check_escalation_finished_task.py b/engine/apps/alerts/tests/test_check_escalation_finished_task.py index 2e2ce436..ad057102 100644 --- a/engine/apps/alerts/tests/test_check_escalation_finished_task.py +++ b/engine/apps/alerts/tests/test_check_escalation_finished_task.py @@ -1,45 +1,329 @@ +from unittest.mock import Mock, PropertyMock, patch + import pytest +import requests +from django.test import override_settings from django.utils import timezone -from apps.alerts.models import AlertReceiveChannel -from apps.alerts.tasks import check_escalation_finished_task +from apps.alerts.models import AlertGroup +from apps.alerts.tasks.check_escalation_finished import ( + AlertGroupEscalationPolicyExecutionAuditException, + audit_alert_group_escalation, + check_escalation_finished_task, + send_alert_group_escalation_auditor_task_heartbeat, +) + +MOCKED_HEARTBEAT_URL = "https://hello.com/lsdjjkf" + + +# def _get_relevant_log_record_type() -> int: +# return random.choice([AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, AlertGroupLogRecord.TYPE_ESCALATION_FAILED]) + + +def test_send_alert_group_escalation_auditor_task_heartbeat_does_not_call_the_heartbeat_url_if_one_is_not_configured(): + with patch("apps.alerts.tasks.check_escalation_finished.requests") as mock_requests: + send_alert_group_escalation_auditor_task_heartbeat() + mock_requests.get.assert_not_called() + + +@override_settings(ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL=MOCKED_HEARTBEAT_URL) +def test_send_alert_group_escalation_auditor_task_heartbeat_calls_the_heartbeat_url_if_one_is_configured(): + with patch("apps.alerts.tasks.check_escalation_finished.requests") as mock_requests: + send_alert_group_escalation_auditor_task_heartbeat() + + 
mock_requests.get.assert_called_once_with(MOCKED_HEARTBEAT_URL) + mock_requests.get.return_value.raise_for_status.assert_called_once_with() + + +@override_settings(ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL=MOCKED_HEARTBEAT_URL) +def test_send_alert_group_escalation_auditor_task_heartbeat_raises_an_exception_if_the_heartbeat_url_request_fails(): + with patch("apps.alerts.tasks.check_escalation_finished.requests") as mock_requests: + mock_response = Mock() + mock_response.status_code = 500 + mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError + + mock_requests.get.return_value = mock_response + + with pytest.raises(requests.exceptions.HTTPError): + send_alert_group_escalation_auditor_task_heartbeat() + + mock_requests.get.assert_called_once_with(MOCKED_HEARTBEAT_URL) + mock_requests.get.return_value.raise_for_status.assert_called_once_with() @pytest.mark.django_db -def test_check_escalation_finished_task( +def test_audit_alert_group_escalation_raises_exception_if_the_alert_group_does_not_have_an_escalation_snapshot( + escalation_snapshot_test_setup, +): + alert_group, _, _, _ = escalation_snapshot_test_setup + alert_group.escalation_snapshot = None + + with pytest.raises(AlertGroupEscalationPolicyExecutionAuditException): + audit_alert_group_escalation(alert_group) + + +@pytest.mark.django_db +def test_audit_alert_group_escalation_skips_further_validation_if_the_escalation_policies_snapshots_is_empty( + escalation_snapshot_test_setup, +): + alert_group, _, _, _ = escalation_snapshot_test_setup + + alert_group.escalation_snapshot.escalation_policies_snapshots = [] + audit_alert_group_escalation(alert_group) + + alert_group.escalation_snapshot.escalation_policies_snapshots = None + audit_alert_group_escalation(alert_group) + + +@pytest.mark.django_db +@pytest.mark.parametrize( + "next_step_eta_is_valid_return_value,raises_exception", + [ + (None, False), + (True, False), + (False, True), + ], +) +def 
test_audit_alert_group_escalation_next_step_eta_validation( + escalation_snapshot_test_setup, next_step_eta_is_valid_return_value, raises_exception +): + alert_group, _, _, _ = escalation_snapshot_test_setup + + with patch( + "apps.alerts.escalation_snapshot.snapshot_classes.escalation_snapshot.EscalationSnapshot.next_step_eta_is_valid" + ) as mock_next_step_eta_is_valid: + mock_next_step_eta_is_valid.return_value = next_step_eta_is_valid_return_value + + if raises_exception: + with pytest.raises(AlertGroupEscalationPolicyExecutionAuditException): + audit_alert_group_escalation(alert_group) + else: + try: + audit_alert_group_escalation(alert_group) + except AlertGroupEscalationPolicyExecutionAuditException: + pytest.fail() + + mock_next_step_eta_is_valid.assert_called_once_with() + + +@pytest.mark.django_db +def test_audit_alert_group_escalation_no_executed_escalation_policy_snapshots(escalation_snapshot_test_setup): + alert_group, _, _, _ = escalation_snapshot_test_setup + + with patch( + "apps.alerts.escalation_snapshot.snapshot_classes.escalation_snapshot.EscalationSnapshot.executed_escalation_policy_snapshots", + new_callable=PropertyMock, + ) as mock_executed_escalation_policy_snapshots: + mock_executed_escalation_policy_snapshots.return_value = [] + audit_alert_group_escalation(alert_group) + mock_executed_escalation_policy_snapshots.assert_called_once_with() + + +# # see TODO: comment in engine/apps/alerts/tasks/check_escalation_finished.py +# @pytest.mark.django_db +# def test_audit_alert_group_escalation_all_executed_escalation_policy_snapshots_have_triggered_log_records( +# escalation_snapshot_test_setup, +# make_organization_and_user, +# make_alert_group_log_record, +# ): +# _, user = make_organization_and_user() +# alert_group, _, _, _ = escalation_snapshot_test_setup +# escalation_policies_snapshots = alert_group.escalation_snapshot.escalation_policies_snapshots + +# for escalation_policy_snapshot in escalation_policies_snapshots: +# escalation_policy 
= EscalationPolicy.objects.get(id=escalation_policy_snapshot.id) +# log_record_type = _get_relevant_log_record_type() + +# make_alert_group_log_record(alert_group, log_record_type, user, escalation_policy=escalation_policy) + +# with patch( +# "apps.alerts.escalation_snapshot.snapshot_classes.escalation_snapshot.EscalationSnapshot.executed_escalation_policy_snapshots", +# new_callable=PropertyMock, +# ) as mock_executed_escalation_policy_snapshots: +# mock_executed_escalation_policy_snapshots.return_value = escalation_policies_snapshots +# audit_alert_group_escalation(alert_group) +# mock_executed_escalation_policy_snapshots.assert_called_once_with() + +# see TODO: comment in engine/apps/alerts/tasks/check_escalation_finished.py +# @pytest.mark.django_db +# def test_audit_alert_group_escalation_one_executed_escalation_policy_snapshot_does_not_have_a_triggered_log_record( +# escalation_snapshot_test_setup, +# make_organization_and_user, +# make_alert_group_log_record, +# ): +# _, user = make_organization_and_user() +# alert_group, _, _, _ = escalation_snapshot_test_setup +# escalation_policies_snapshots = alert_group.escalation_snapshot.escalation_policies_snapshots + +# # let's skip creating a relevant alert group log record for the first executed escalation policy +# for idx, escalation_policy_snapshot in enumerate(escalation_policies_snapshots): +# if idx != 0: +# escalation_policy = EscalationPolicy.objects.get(id=escalation_policy_snapshot.id) +# make_alert_group_log_record( +# alert_group, _get_relevant_log_record_type(), user, escalation_policy=escalation_policy +# ) + +# with patch( +# "apps.alerts.escalation_snapshot.snapshot_classes.escalation_snapshot.EscalationSnapshot.executed_escalation_policy_snapshots", +# new_callable=PropertyMock, +# ) as mock_executed_escalation_policy_snapshots: +# mock_executed_escalation_policy_snapshots.return_value = escalation_policies_snapshots + +# with pytest.raises(AlertGroupEscalationPolicyExecutionAuditException): +# 
audit_alert_group_escalation(alert_group) +# mock_executed_escalation_policy_snapshots.assert_called_once_with() + + +@pytest.mark.django_db +def test_check_escalation_finished_task_queries_doesnt_grab_alert_groups_outside_of_date_range( make_organization_and_user, make_alert_receive_channel, make_alert_group, ): - organization, user = make_organization_and_user() - alert_receive_channel = make_alert_receive_channel( - organization, integration=AlertReceiveChannel.INTEGRATION_GRAFANA - ) - alert_group = make_alert_group(alert_receive_channel) + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) now = timezone.now() + two_days_ago = now - timezone.timedelta(days=2) + two_days_in_future = now + timezone.timedelta(days=2) - # we don't have escalation finish time, seems we cannot calculate it due escalation chain snapshot has uncalculated - # steps or does not exist, no exception is raised - check_escalation_finished_task() + # we can't simply pass started_at to the fixture because started_at is being "auto-set" on the Model + alert_group1 = make_alert_group(alert_receive_channel) + alert_group1.started_at = now - # it's acceptable time for finish escalation, because we have tolerance time 5 min from now, no exception is raised - alert_group.estimate_escalation_finish_time = now - alert_group.save() - check_escalation_finished_task() + alert_group2 = make_alert_group(alert_receive_channel) + alert_group2.started_at = now - timezone.timedelta(days=5) - # it is acceptable time for finish escalation, so no exception is raised - alert_group.estimate_escalation_finish_time = now + timezone.timedelta(minutes=10) - alert_group.save() - check_escalation_finished_task() + alert_group3 = make_alert_group(alert_receive_channel) + alert_group3.started_at = now + timezone.timedelta(days=5) - # escalation is not finished yet and passed more than 5 minutes after estimate time, exception is raised - 
alert_group.estimate_escalation_finish_time = now - timezone.timedelta(minutes=10) - alert_group.save() - with pytest.raises(Exception): - check_escalation_finished_task() + AlertGroup.all_objects.bulk_update([alert_group1, alert_group2, alert_group3], ["started_at"]) - # escalation is finished and we don't care anymore about its finish time, so no exception is raised - alert_group.is_escalation_finished = True - alert_group.save() - check_escalation_finished_task() + with patch( + "apps.alerts.tasks.check_escalation_finished.get_auditable_alert_groups_started_at_range" + ) as mocked_get_auditable_alert_groups_started_at_range: + with patch( + "apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation" + ) as mocked_audit_alert_group_escalation: + with patch( + "apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat" + ) as mocked_send_alert_group_escalation_auditor_task_heartbeat: + mocked_get_auditable_alert_groups_started_at_range.return_value = (two_days_ago, two_days_in_future) + + check_escalation_finished_task() + + mocked_audit_alert_group_escalation.assert_called_once_with(alert_group1) + mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_called_once_with() + + +@pytest.mark.django_db +def test_check_escalation_finished_task_calls_audit_alert_group_escalation_for_every_alert_group( + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + + now = timezone.now() + two_days_ago = now - timezone.timedelta(days=2) + two_days_in_future = now + timezone.timedelta(days=2) + + # we can't simply pass started_at to the fixture because started_at is being "auto-set" on the Model + alert_group1 = make_alert_group(alert_receive_channel) + alert_group1.started_at = now + + alert_group2 = make_alert_group(alert_receive_channel) + alert_group2.started_at = now + 
+ alert_group3 = make_alert_group(alert_receive_channel) + alert_group3.started_at = now + + AlertGroup.all_objects.bulk_update([alert_group1, alert_group2, alert_group3], ["started_at"]) + + with patch( + "apps.alerts.tasks.check_escalation_finished.get_auditable_alert_groups_started_at_range" + ) as mocked_get_auditable_alert_groups_started_at_range: + with patch( + "apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation" + ) as mocked_audit_alert_group_escalation: + with patch( + "apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat" + ) as mocked_send_alert_group_escalation_auditor_task_heartbeat: + mocked_get_auditable_alert_groups_started_at_range.return_value = (two_days_ago, two_days_in_future) + + check_escalation_finished_task() + + mocked_audit_alert_group_escalation.assert_any_call(alert_group1) + mocked_audit_alert_group_escalation.assert_any_call(alert_group2) + mocked_audit_alert_group_escalation.assert_any_call(alert_group3) + mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_called_once_with() + + +@pytest.mark.django_db +def test_check_escalation_finished_task_simply_calls_heartbeat_when_no_alert_groups_found(): + with patch( + "apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation" + ) as mocked_audit_alert_group_escalation: + with patch( + "apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat" + ) as mocked_send_alert_group_escalation_auditor_task_heartbeat: + check_escalation_finished_task() + mocked_audit_alert_group_escalation.assert_not_called() + mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_called_once_with() + + +@pytest.mark.django_db +def test_check_escalation_finished_task_calls_audit_alert_group_escalation_for_every_alert_group_even_if_one_fails( + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + organization, _ = make_organization_and_user() + 
alert_receive_channel = make_alert_receive_channel(organization) + + now = timezone.now() + two_days_ago = now - timezone.timedelta(days=2) + two_days_in_future = now + timezone.timedelta(days=2) + + # we can't simply pass started_at to the fixture because started_at is being "auto-set" on the Model + alert_group1 = make_alert_group(alert_receive_channel) + alert_group1.started_at = now + + alert_group2 = make_alert_group(alert_receive_channel) + alert_group2.started_at = now + + alert_group3 = make_alert_group(alert_receive_channel) + alert_group3.started_at = now + + AlertGroup.all_objects.bulk_update([alert_group1, alert_group2, alert_group3], ["started_at"]) + + def _mocked_audit_alert_group_escalation(alert_group): + if not alert_group.id == alert_group3.id: + raise AlertGroupEscalationPolicyExecutionAuditException("asdfasdf") + + with patch( + "apps.alerts.tasks.check_escalation_finished.get_auditable_alert_groups_started_at_range" + ) as mocked_get_auditable_alert_groups_started_at_range: + with patch( + "apps.alerts.tasks.check_escalation_finished.audit_alert_group_escalation" + ) as mocked_audit_alert_group_escalation: + with patch( + "apps.alerts.tasks.check_escalation_finished.send_alert_group_escalation_auditor_task_heartbeat" + ) as mocked_send_alert_group_escalation_auditor_task_heartbeat: + mocked_get_auditable_alert_groups_started_at_range.return_value = (two_days_ago, two_days_in_future) + mocked_audit_alert_group_escalation.side_effect = _mocked_audit_alert_group_escalation + + with pytest.raises(AlertGroupEscalationPolicyExecutionAuditException) as exc: + check_escalation_finished_task() + + assert ( + str(exc.value) + == f"The following alert group id(s) failed auditing: {alert_group1.id}, {alert_group2.id}" + ) + + mocked_audit_alert_group_escalation.assert_any_call(alert_group1) + mocked_audit_alert_group_escalation.assert_any_call(alert_group2) + mocked_audit_alert_group_escalation.assert_any_call(alert_group3) + + 
mocked_send_alert_group_escalation_auditor_task_heartbeat.assert_not_called() diff --git a/engine/apps/alerts/tests/test_escalation_policy_snapshot.py b/engine/apps/alerts/tests/test_escalation_policy_snapshot.py index e84050b2..038a93f8 100644 --- a/engine/apps/alerts/tests/test_escalation_policy_snapshot.py +++ b/engine/apps/alerts/tests/test_escalation_policy_snapshot.py @@ -134,7 +134,7 @@ def test_escalation_step_notify_multiple_users( escalation_step_test_setup, make_escalation_policy, ): - organization, user, _, channel_filter, alert_group, reason = escalation_step_test_setup + _, user, _, channel_filter, alert_group, reason = escalation_step_test_setup notify_users_step = make_escalation_policy( escalation_chain=channel_filter.escalation_chain, @@ -386,7 +386,7 @@ def test_escalation_step_notify_if_num_alerts_in_window( ).exists() assert not mocked_execute_tasks.called - organization, user, _, channel_filter, alert_group, reason = escalation_step_test_setup + _, _, _, channel_filter, alert_group, reason = escalation_step_test_setup make_alert(alert_group=alert_group, raw_request_data={}) diff --git a/engine/apps/alerts/tests/test_escalation_snapshot.py b/engine/apps/alerts/tests/test_escalation_snapshot.py index 98aaad03..8ce55b8f 100644 --- a/engine/apps/alerts/tests/test_escalation_snapshot.py +++ b/engine/apps/alerts/tests/test_escalation_snapshot.py @@ -1,5 +1,3 @@ -import datetime - import pytest from django.utils import timezone @@ -11,54 +9,6 @@ from apps.alerts.escalation_snapshot.snapshot_classes import ( from apps.alerts.models import EscalationPolicy -@pytest.fixture() -def escalation_snapshot_test_setup( - make_organization_and_user, - make_user_for_organization, - make_alert_receive_channel, - make_channel_filter, - make_escalation_chain, - make_escalation_policy, - make_alert_group, -): - organization, user_1 = make_organization_and_user() - user_2 = make_user_for_organization(organization) - - alert_receive_channel = 
make_alert_receive_channel(organization) - - escalation_chain = make_escalation_chain(organization) - channel_filter = make_channel_filter( - alert_receive_channel, - escalation_chain=escalation_chain, - notification_backends={"BACKEND": {"channel_id": "abc123"}}, - ) - - notify_to_multiple_users_step = make_escalation_policy( - escalation_chain=channel_filter.escalation_chain, - escalation_policy_step=EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS, - ) - notify_to_multiple_users_step.notify_to_users_queue.set([user_1, user_2]) - wait_step = make_escalation_policy( - escalation_chain=channel_filter.escalation_chain, - escalation_policy_step=EscalationPolicy.STEP_WAIT, - wait_delay=EscalationPolicy.FIFTEEN_MINUTES, - ) - # random time for test - from_time = datetime.time(10, 30) - to_time = datetime.time(18, 45) - notify_if_time_step = make_escalation_policy( - escalation_chain=channel_filter.escalation_chain, - escalation_policy_step=EscalationPolicy.STEP_NOTIFY_IF_TIME, - from_time=from_time, - to_time=to_time, - ) - - alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) - alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() - alert_group.save() - return alert_group, notify_to_multiple_users_step, wait_step, notify_if_time_step - - @pytest.mark.django_db def test_raw_escalation_snapshot(escalation_snapshot_test_setup): alert_group, notify_to_multiple_users_step, wait_step, notify_if_time_step = escalation_snapshot_test_setup @@ -142,7 +92,7 @@ def test_raw_escalation_snapshot(escalation_snapshot_test_setup): @pytest.mark.django_db def test_serialized_escalation_snapshot(escalation_snapshot_test_setup): - alert_group, notify_to_multiple_users_step, wait_step, notify_if_time_step = escalation_snapshot_test_setup + alert_group, _, _, _ = escalation_snapshot_test_setup escalation_snapshot = alert_group.escalation_snapshot assert isinstance(escalation_snapshot, EscalationSnapshot) assert 
escalation_snapshot.channel_filter_snapshot is not None and isinstance( @@ -163,7 +113,7 @@ def test_serialized_escalation_snapshot(escalation_snapshot_test_setup): @pytest.mark.django_db def test_escalation_snapshot_with_deleted_channel_filter(escalation_snapshot_test_setup): - alert_group, notify_to_multiple_users_step, wait_step, notify_if_time_step = escalation_snapshot_test_setup + alert_group, _, _, _ = escalation_snapshot_test_setup alert_group.channel_filter.delete() escalation_snapshot = alert_group.escalation_snapshot @@ -174,7 +124,7 @@ def test_escalation_snapshot_with_deleted_channel_filter(escalation_snapshot_tes @pytest.mark.django_db def test_change_escalation_snapshot(escalation_snapshot_test_setup): - alert_group, notify_to_multiple_users_step, wait_step, notify_if_time_step = escalation_snapshot_test_setup + alert_group, _, _, _ = escalation_snapshot_test_setup new_active_order = 2 now = timezone.now() @@ -194,7 +144,7 @@ def test_change_escalation_snapshot(escalation_snapshot_test_setup): @pytest.mark.django_db def test_next_escalation_policy_snapshot(escalation_snapshot_test_setup): - alert_group, notify_to_multiple_users_step, wait_step, notify_if_time_step = escalation_snapshot_test_setup + alert_group, _, _, _ = escalation_snapshot_test_setup escalation_snapshot = alert_group.escalation_snapshot assert escalation_snapshot.last_active_escalation_policy_order is None @@ -226,3 +176,39 @@ def test_next_escalation_policy_snapshot(escalation_snapshot_test_setup): is escalation_snapshot.escalation_policies_snapshots[-1] ) assert escalation_snapshot.next_active_escalation_policy_snapshot is None + + +@pytest.mark.django_db +@pytest.mark.parametrize( + "next_step_eta,expected", + [ + (None, None), + (timezone.now() - timezone.timedelta(weeks=50), False), + (timezone.now() - timezone.timedelta(minutes=4), True), + (timezone.now() + timezone.timedelta(minutes=4), True), + ], +) +def test_next_step_eta_is_valid(escalation_snapshot_test_setup, 
next_step_eta, expected) -> None: + alert_group, _, _, _ = escalation_snapshot_test_setup + escalation_snapshot = alert_group.escalation_snapshot + + escalation_snapshot.next_step_eta = next_step_eta + + assert escalation_snapshot.next_step_eta_is_valid() is expected + + +@pytest.mark.django_db +def test_executed_escalation_policy_snapshots(escalation_snapshot_test_setup): + alert_group, _, _, _ = escalation_snapshot_test_setup + escalation_snapshot = alert_group.escalation_snapshot + + escalation_snapshot.last_active_escalation_policy_order = None + assert escalation_snapshot.executed_escalation_policy_snapshots == [] + + escalation_snapshot.last_active_escalation_policy_order = 0 + assert escalation_snapshot.executed_escalation_policy_snapshots == [ + escalation_snapshot.escalation_policies_snapshots[0] + ] + + escalation_snapshot.last_active_escalation_policy_order = len(escalation_snapshot.escalation_policies_snapshots) + assert escalation_snapshot.executed_escalation_policy_snapshots == escalation_snapshot.escalation_policies_snapshots diff --git a/engine/apps/alerts/tests/test_escalation_snapshot_mixin.py b/engine/apps/alerts/tests/test_escalation_snapshot_mixin.py new file mode 100644 index 00000000..4aa984e0 --- /dev/null +++ b/engine/apps/alerts/tests/test_escalation_snapshot_mixin.py @@ -0,0 +1,658 @@ +from unittest.mock import PropertyMock, patch + +import pytest +import pytz +from rest_framework.exceptions import ValidationError + +from apps.alerts.escalation_snapshot.snapshot_classes import EscalationSnapshot +from apps.alerts.models import EscalationPolicy + +MOCK_SLACK_CHANNEL_ID = "asdfljaskdf" +EMPTY_RAW_ESCALATION_SNAPSHOT = { + "channel_filter_snapshot": None, + "escalation_chain_snapshot": None, + "last_active_escalation_policy_order": None, + "escalation_policies_snapshots": [], + "slack_channel_id": None, + "pause_escalation": False, + "next_step_eta": None, +} + + +@patch("apps.alerts.models.alert_group.AlertGroup.slack_channel_id", 
new_callable=PropertyMock) +@pytest.mark.django_db +def test_build_raw_escalation_snapshot_escalation_chain_exists( + mock_alert_group_slack_channel_id, + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_escalation_chain, + make_escalation_policy, + make_alert_group, +): + mock_alert_group_slack_channel_id.return_value = MOCK_SLACK_CHANNEL_ID + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = make_escalation_chain(organization=organization) + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_WAIT, + wait_delay=EscalationPolicy.FIFTEEN_MINUTES, + ) + + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + expected_snapshot = EscalationSnapshot.serializer( + { + "channel_filter_snapshot": alert_group.channel_filter, + "escalation_chain_snapshot": alert_group.channel_filter.escalation_chain, + "escalation_policies_snapshots": alert_group.channel_filter.escalation_chain.escalation_policies.all(), + "slack_channel_id": MOCK_SLACK_CHANNEL_ID, + } + ) + + assert alert_group.build_raw_escalation_snapshot() == expected_snapshot.data + + +@patch( + "apps.alerts.escalation_snapshot.escalation_snapshot_mixin.EscalationSnapshotMixin.pause_escalation", + new_callable=PropertyMock, +) +@pytest.mark.django_db +def test_build_raw_escalation_snapshot_escalation_chain_does_not_exist_escalation_paused( + mocked_pause_escalation, + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_escalation_chain, + make_alert_group, +): + mocked_pause_escalation.return_value = True + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = 
make_escalation_chain(organization=organization) + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.build_raw_escalation_snapshot() == EMPTY_RAW_ESCALATION_SNAPSHOT + + +@pytest.mark.django_db +def test_build_raw_escalation_snapshot_escalation_chain_does_not_exist_no_channel_filter( + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + assert alert_group.build_raw_escalation_snapshot() == EMPTY_RAW_ESCALATION_SNAPSHOT + + +@pytest.mark.django_db +def test_build_raw_escalation_snapshot_escalation_chain_does_not_exist_no_channel_filter_escalation_chain( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.build_raw_escalation_snapshot() == EMPTY_RAW_ESCALATION_SNAPSHOT + + +@patch( + "apps.alerts.escalation_snapshot.escalation_snapshot_mixin.EscalationSnapshotMixin.channel_filter_snapshot", + new_callable=PropertyMock, +) +@pytest.mark.django_db +def test_channel_filter_with_respect_to_escalation_snapshot( + mock_channel_filter_snapshot, + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + channel_filter_snapshot = "asdfasdfadsfadsf" + mock_channel_filter_snapshot.return_value = channel_filter_snapshot + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + 
channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.channel_filter_with_respect_to_escalation_snapshot == channel_filter_snapshot + + +@patch( + "apps.alerts.escalation_snapshot.escalation_snapshot_mixin.EscalationSnapshotMixin.channel_filter_snapshot", + new_callable=PropertyMock, +) +@pytest.mark.django_db +def test_channel_filter_with_respect_to_escalation_snapshot_no_channel_filter_snapshot( + mock_channel_filter_snapshot, + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + mock_channel_filter_snapshot.return_value = None + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.channel_filter_with_respect_to_escalation_snapshot == channel_filter + + +@patch( + "apps.alerts.escalation_snapshot.escalation_snapshot_mixin.EscalationSnapshotMixin.escalation_chain_snapshot", + new_callable=PropertyMock, +) +@pytest.mark.django_db +def test_escalation_chain_with_respect_to_escalation_snapshot( + mock_escalation_chain_snapshot, + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + escalation_chain_snapshot = "asdfasdfadsfadsf" + mock_escalation_chain_snapshot.return_value = escalation_chain_snapshot + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.escalation_chain_with_respect_to_escalation_snapshot == escalation_chain_snapshot + + +@patch( + 
"apps.alerts.escalation_snapshot.escalation_snapshot_mixin.EscalationSnapshotMixin.escalation_chain_snapshot", + new_callable=PropertyMock, +) +@pytest.mark.django_db +def test_escalation_chain_with_respect_to_escalation_snapshot_no_escalation_chain_snapshot( + mock_escalation_chain_snapshot, + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_escalation_chain, + make_alert_group, +): + mock_escalation_chain_snapshot.return_value = None + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = make_escalation_chain(organization=organization) + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.escalation_chain_with_respect_to_escalation_snapshot == escalation_chain + + alert_group = make_alert_group(alert_receive_channel) + + assert alert_group.channel_filter is None + assert alert_group.escalation_chain_with_respect_to_escalation_snapshot is None + + +@patch( + "apps.alerts.escalation_snapshot.escalation_snapshot_mixin.EscalationSnapshotMixin.escalation_chain_snapshot", + new_callable=PropertyMock, +) +@pytest.mark.django_db +def test_escalation_chain_with_respect_to_escalation_snapshot_no_escalation_chain_snapshot_and_no_channel_filter( + mock_escalation_chain_snapshot, + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + mock_escalation_chain_snapshot.return_value = None + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + assert alert_group.escalation_chain_with_respect_to_escalation_snapshot is None + + +@pytest.mark.django_db +def test_channel_filter_snapshot( + make_organization_and_user, + make_alert_receive_channel, + 
make_channel_filter, + make_escalation_chain, + make_escalation_policy, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = make_escalation_chain(organization=organization) + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_WAIT, + wait_delay=EscalationPolicy.FIFTEEN_MINUTES, + ) + + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + assert alert_group.channel_filter_snapshot.id == channel_filter.id + + +@pytest.mark.django_db +def test_channel_filter_snapshot_no_escalation_chain_exists( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + assert alert_group.raw_escalation_snapshot["channel_filter_snapshot"] is None + assert alert_group.channel_filter_snapshot is None + + +@pytest.mark.django_db +def test_channel_filter_snapshot_no_alert_group_raw_escalation_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert 
alert_group.channel_filter_snapshot is None + + +@pytest.mark.django_db +def test_escalation_chain_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_escalation_chain, + make_escalation_policy, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = make_escalation_chain(organization=organization) + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_WAIT, + wait_delay=EscalationPolicy.FIFTEEN_MINUTES, + ) + + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + assert alert_group.escalation_chain_snapshot.id == escalation_chain.id + + +@pytest.mark.django_db +def test_escalation_chain_snapshot_no_escalation_chain_exists( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + assert alert_group.raw_escalation_snapshot["escalation_chain_snapshot"] is None + assert alert_group.escalation_chain_snapshot is None + + +@pytest.mark.django_db +def test_escalation_chain_snapshot_no_alert_group_raw_escalation_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + 
channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.escalation_chain_snapshot is None + + +@pytest.mark.django_db +def test_escalation_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + return_value = "asdfasdfasdf" + with patch( + "apps.alerts.escalation_snapshot.escalation_snapshot_mixin.EscalationSnapshotMixin._deserialize_escalation_snapshot", + return_value=return_value, + ): + assert alert_group.escalation_snapshot == return_value + + +@pytest.mark.django_db +def test_escalation_snapshot_validation_error( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + with patch( + "apps.alerts.escalation_snapshot.escalation_snapshot_mixin.EscalationSnapshotMixin._deserialize_escalation_snapshot", + side_effect=ValidationError("asdfasdf"), + ): + assert alert_group.escalation_snapshot is None + + +@pytest.mark.django_db +def test_escalation_snapshot_no_alert_group_raw_escalation_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + 
alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.escalation_snapshot is None + + +@pytest.mark.django_db +def test_escalation_snapshot_empty_escalation_policies_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + assert alert_group.raw_escalation_snapshot is not None + assert alert_group.has_escalation_policies_snapshots is False + + +@pytest.mark.django_db +def test_escalation_snapshot_nonempty_escalation_policies_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_escalation_chain, + make_escalation_policy, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = make_escalation_chain(organization=organization) + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_WAIT, + wait_delay=EscalationPolicy.FIFTEEN_MINUTES, + ) + + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + assert alert_group.raw_escalation_snapshot is not None + assert alert_group.has_escalation_policies_snapshots is True + + +@pytest.mark.django_db +def 
test_has_escalation_policies_snapshots_no_alert_group_raw_escalation_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.raw_escalation_snapshot is None + assert alert_group.has_escalation_policies_snapshots is False + + +@patch("apps.alerts.models.alert_group.AlertGroup.slack_channel_id", new_callable=PropertyMock) +@pytest.mark.django_db +def test_deserialize_escalation_snapshot( + mock_alert_group_slack_channel_id, + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_escalation_chain, + make_escalation_policy, + make_alert_group, +): + mock_alert_group_slack_channel_id.return_value = MOCK_SLACK_CHANNEL_ID + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = make_escalation_chain(organization=organization) + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + escalation_policy = make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_WAIT, + wait_delay=EscalationPolicy.FIFTEEN_MINUTES, + ) + + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + deserialized_escalation_snapshot = alert_group._deserialize_escalation_snapshot(alert_group.raw_escalation_snapshot) + + assert deserialized_escalation_snapshot.alert_group == alert_group + assert deserialized_escalation_snapshot.channel_filter_snapshot.id == channel_filter.id + assert 
deserialized_escalation_snapshot.escalation_chain_snapshot.id == escalation_chain.id + assert deserialized_escalation_snapshot.last_active_escalation_policy_order is None + assert len(deserialized_escalation_snapshot.escalation_policies_snapshots) == 1 + assert deserialized_escalation_snapshot.escalation_policies_snapshots[0].id == escalation_policy.id + assert deserialized_escalation_snapshot.slack_channel_id == MOCK_SLACK_CHANNEL_ID + assert deserialized_escalation_snapshot.pause_escalation is False + assert deserialized_escalation_snapshot.next_step_eta is None + assert deserialized_escalation_snapshot.stop_escalation is False + + +@pytest.mark.django_db +def test_escalation_chain_exists( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_escalation_chain, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = make_escalation_chain(organization=organization) + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.pause_escalation is False + assert alert_group.escalation_chain_exists is True + + +@patch( + "apps.alerts.escalation_snapshot.escalation_snapshot_mixin.EscalationSnapshotMixin.pause_escalation", + new_callable=PropertyMock, +) +@pytest.mark.django_db +def test_escalation_chain_exists_paused_escalation( + mocked_pause_escalation, + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + mocked_pause_escalation.return_value = True + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + assert alert_group.pause_escalation is True + assert alert_group.escalation_chain_exists is False + + +@pytest.mark.django_db +def 
test_escalation_chain_exists_no_channel_filter( + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + assert alert_group.pause_escalation is False + assert alert_group.channel_filter is None + assert alert_group.escalation_chain_exists is False + + +@pytest.mark.django_db +def test_escalation_chain_exists_no_channel_filter_escalation_chain( + make_organization_and_user, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + assert alert_group.pause_escalation is False + assert alert_group.channel_filter == channel_filter + assert alert_group.channel_filter.escalation_chain is None + assert alert_group.escalation_chain_exists is False + + +@pytest.mark.django_db +def test_pause_escalation_no_raw_escalation_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + assert alert_group.raw_escalation_snapshot is None + assert alert_group.pause_escalation is False + + +@pytest.mark.django_db +def test_pause_escalation_raw_escalation_snapshot_exists( + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + alert_group.raw_escalation_snapshot = 
alert_group.build_raw_escalation_snapshot() + + assert alert_group.raw_escalation_snapshot is not None + assert alert_group.raw_escalation_snapshot["pause_escalation"] is False + + alert_group.raw_escalation_snapshot["pause_escalation"] = True + + assert alert_group.pause_escalation is True + + +@pytest.mark.django_db +def test_next_step_eta_no_raw_escalation_snapshot( + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + assert alert_group.raw_escalation_snapshot is None + assert alert_group.next_step_eta is None + + +@pytest.mark.django_db +def test_next_step_eta_no_next_step_eta( + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + assert alert_group.raw_escalation_snapshot is not None + assert alert_group.raw_escalation_snapshot["next_step_eta"] is None + assert alert_group.next_step_eta is None + + +@patch("apps.alerts.escalation_snapshot.escalation_snapshot_mixin.parse") +@pytest.mark.django_db +def test_next_step_eta( + mock_dateutil_parser, + make_organization_and_user, + make_alert_receive_channel, + make_alert_group, +): + mocked_raw_date = "mcvnmcvmnvc" + mocked_parsed_date = "asdfasdfaf" + mock_dateutil_parser.return_value.replace.return_value = mocked_parsed_date + + organization, _ = make_organization_and_user() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + 
alert_group.raw_escalation_snapshot["next_step_eta"] = mocked_raw_date + + assert alert_group.raw_escalation_snapshot is not None + assert alert_group.raw_escalation_snapshot["next_step_eta"] is mocked_raw_date + assert alert_group.next_step_eta == mocked_parsed_date + + mock_dateutil_parser.assert_called_once_with(mocked_raw_date) + mock_dateutil_parser.return_value.replace.assert_called_once_with(tzinfo=pytz.UTC) diff --git a/engine/apps/api/serializers/webhook.py b/engine/apps/api/serializers/webhook.py index 5af1e5a2..f9160789 100644 --- a/engine/apps/api/serializers/webhook.py +++ b/engine/apps/api/serializers/webhook.py @@ -1,28 +1,26 @@ from collections import defaultdict -from http.client import responses from rest_framework import serializers from rest_framework.validators import UniqueTogetherValidator -from apps.webhooks.models import Webhook, WebhookLog +from apps.webhooks.models import Webhook, WebhookResponse from common.api_helpers.custom_fields import TeamPrimaryKeyRelatedField from common.api_helpers.utils import CurrentOrganizationDefault, CurrentTeamDefault, CurrentUserDefault from common.jinja_templater import apply_jinja_template from common.jinja_templater.apply_jinja_template import JinjaTemplateError, JinjaTemplateWarning -class WebhookLogSerializer(serializers.ModelSerializer): +class WebhookResponseSerializer(serializers.ModelSerializer): class Meta: - model = WebhookLog + model = WebhookResponse fields = [ - "last_run_at", - "input_data", + "timestamp", "url", - "trigger", - "headers", - "data", - "response_status", - "response", + "request_trigger", + "request_headers", + "request_data", + "status_code", + "content", ] @@ -34,7 +32,7 @@ class WebhookSerializer(serializers.ModelSerializer): last_run = serializers.SerializerMethodField() trigger_type = serializers.CharField(required=True) forward_all = serializers.BooleanField(allow_null=True, required=False) - last_status_log = serializers.SerializerMethodField() + last_response_log = 
serializers.SerializerMethodField() trigger_type_name = serializers.SerializerMethodField() class Meta: @@ -58,7 +56,7 @@ class WebhookSerializer(serializers.ModelSerializer): "trigger_type", "trigger_type_name", "last_run", - "last_status_log", + "last_response_log", ] extra_kwargs = { "authorization_header": {"write_only": True}, @@ -106,16 +104,12 @@ class WebhookSerializer(serializers.ModelSerializer): def get_last_run(self, obj): last_run = "" - last_log = obj.logs.all().last() - if last_log: - last_run = last_log.last_run_at.strftime("%Y-%m-%dT%H:%M:%SZ") - if last_log.response_status: - reason = responses[int(last_log.response_status)] - last_run += " ({} {})".format(last_log.response_status, reason) + if last_log := obj.responses.all().last(): + last_run = last_log.timestamp.strftime("%Y-%m-%dT%H:%M:%SZ") return last_run - def get_last_status_log(self, obj): - return WebhookLogSerializer(obj.logs.all().last()).data + def get_last_response_log(self, obj): + return WebhookResponseSerializer(obj.responses.all().last()).data def get_trigger_type_name(self, obj): trigger_type_name = "" diff --git a/engine/apps/api/tests/test_webhooks.py b/engine/apps/api/tests/test_webhooks.py index 7d163181..e91f1102 100644 --- a/engine/apps/api/tests/test_webhooks.py +++ b/engine/apps/api/tests/test_webhooks.py @@ -48,14 +48,13 @@ def test_get_list_webhooks(webhook_internal_api_setup, make_user_auth_headers): "headers": None, "http_method": "POST", "last_run": "", - "last_status_log": { - "data": "", - "headers": "", - "input_data": None, - "last_run_at": None, - "response": "", - "response_status": "", - "trigger": "", + "last_response_log": { + "request_data": "", + "request_headers": "", + "timestamp": None, + "content": "", + "status_code": None, + "request_trigger": "", "url": "", }, "trigger_template": None, @@ -86,14 +85,13 @@ def test_get_detail_webhook(webhook_internal_api_setup, make_user_auth_headers): "headers": None, "http_method": "POST", "last_run": "", - 
"last_status_log": { - "data": "", - "headers": "", - "input_data": None, - "last_run_at": None, - "response": "", - "response_status": "", - "trigger": "", + "last_response_log": { + "request_data": "", + "request_headers": "", + "timestamp": None, + "content": "", + "status_code": None, + "request_trigger": "", "url": "", }, "trigger_template": None, @@ -129,14 +127,13 @@ def test_create_webhook(mocked_check_webhooks_2_enabled, webhook_internal_api_se "headers": None, "http_method": "POST", "last_run": "", - "last_status_log": { - "data": "", - "headers": "", - "input_data": None, - "last_run_at": None, - "response": "", - "response_status": "", - "trigger": "", + "last_response_log": { + "request_data": "", + "request_headers": "", + "timestamp": None, + "content": "", + "status_code": None, + "request_trigger": "", "url": "", }, "trigger_template": None, @@ -185,14 +182,13 @@ def test_create_valid_templated_field( "data": None, "http_method": "POST", "last_run": "", - "last_status_log": { - "data": "", - "headers": "", - "input_data": None, - "last_run_at": None, - "response": "", - "response_status": "", - "trigger": "", + "last_response_log": { + "request_data": "", + "request_headers": "", + "timestamp": None, + "content": "", + "status_code": None, + "request_trigger": "", "url": "", }, "trigger_template": None, diff --git a/engine/apps/api/views/auth.py b/engine/apps/api/views/auth.py index 2282c50e..582918b6 100644 --- a/engine/apps/api/views/auth.py +++ b/engine/apps/api/views/auth.py @@ -1,6 +1,7 @@ import logging from urllib.parse import urljoin +from django.conf import settings from django.contrib.auth import REDIRECT_FIELD_NAME from django.http import HttpResponseRedirect from django.views.decorators.cache import never_cache @@ -25,6 +26,11 @@ def overridden_login_slack_auth(request, backend): # We can't just redirect frontend here because we need to make a API call and pass tokens to this view from JS. # So frontend can't follow our redirect. 
# So wrapping and returning URL to redirect as a string. + if settings.SLACK_INTEGRATION_MAINTENANCE_ENABLED: + return Response( + "Grafana OnCall is temporary unable to connect your slack account or install OnCall to your slack workspace", + status=400, + ) url_to_redirect_to = do_auth(request.backend, redirect_name=REDIRECT_FIELD_NAME).url return Response(url_to_redirect_to, 200) diff --git a/engine/apps/api/views/escalation_chain.py b/engine/apps/api/views/escalation_chain.py index 81cd1ecb..7cbc1f93 100644 --- a/engine/apps/api/views/escalation_chain.py +++ b/engine/apps/api/views/escalation_chain.py @@ -46,7 +46,7 @@ class EscalationChainViewSet( } filter_backends = [SearchFilter] - search_fields = ("^name",) + search_fields = ("name",) serializer_class = EscalationChainSerializer list_serializer_class = EscalationChainListSerializer diff --git a/engine/apps/api/views/webhooks.py b/engine/apps/api/views/webhooks.py index 03dffe5f..f4e21062 100644 --- a/engine/apps/api/views/webhooks.py +++ b/engine/apps/api/views/webhooks.py @@ -32,7 +32,7 @@ class WebhooksView(TeamFilteringMixin, PublicPrimaryKeyMixin, ModelViewSet): queryset = Webhook.objects.filter( organization=self.request.auth.organization, team=self.request.user.current_team, - ).prefetch_related("logs") + ).prefetch_related("responses") return queryset def get_object(self): diff --git a/engine/apps/slack/models/slack_message.py b/engine/apps/slack/models/slack_message.py index 047c82ef..2d576378 100644 --- a/engine/apps/slack/models/slack_message.py +++ b/engine/apps/slack/models/slack_message.py @@ -116,7 +116,7 @@ class SlackMessage(models.Model): slack_user_identity = user.slack_user_identity if slack_user_identity is None: - text = "{}\nTried to invite {} to look at incident. Unfortunately {} is not in slack.".format( + text = "{}\nTried to invite {} to look at the alert group. 
Unfortunately {} is not in slack.".format( alert_group.long_verbose_name, user_verbal, user_verbal ) @@ -131,7 +131,7 @@ class SlackMessage(models.Model): notification_error_code=UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_IN_SLACK_USER_NOT_IN_SLACK, ).save() else: - text = "{}\nInviting {} to look at incident.".format(alert_group.long_verbose_name, user_verbal) + text = "{}\nInviting {} to look at the alert group.".format(alert_group.long_verbose_name, user_verbal) blocks = [ { diff --git a/engine/apps/slack/scenarios/escalation_delivery.py b/engine/apps/slack/scenarios/escalation_delivery.py index d0dc4c3c..43aad105 100644 --- a/engine/apps/slack/scenarios/escalation_delivery.py +++ b/engine/apps/slack/scenarios/escalation_delivery.py @@ -33,4 +33,4 @@ class EscalationDeliveryStep(scenario_step.ScenarioStep): # Don't mention if asked to notify somehow else but drop a note for colleagues user_mention_as = user_verbal notify_by = " by {}".format(UserNotificationPolicy.NotificationChannel(notification_channel).label) - return "Inviting {}{} to look at incident.".format(user_mention_as, notify_by) + return "Inviting {}{} to look at the alert group.".format(user_mention_as, notify_by) diff --git a/engine/apps/slack/scenarios/resolution_note.py b/engine/apps/slack/scenarios/resolution_note.py index 02bff31b..94c2a8df 100644 --- a/engine/apps/slack/scenarios/resolution_note.py +++ b/engine/apps/slack/scenarios/resolution_note.py @@ -362,9 +362,8 @@ class UpdateResolutionNoteStep(scenario_step.ScenarioStep): "type": "context", "elements": [ { - "type": "plain_text", + "type": "mrkdwn", "text": f"{author_verbal} resolution note from {resolution_note.get_source_display()}.", - "emoji": True, } ], } diff --git a/engine/apps/slack/views.py b/engine/apps/slack/views.py index 3d91d01b..8b016271 100644 --- a/engine/apps/slack/views.py +++ b/engine/apps/slack/views.py @@ -540,20 +540,25 @@ class ResetSlackView(APIView): } def post(self, request): - organization = 
request.auth.organization - slack_team_identity = organization.slack_team_identity - if slack_team_identity is not None: - clean_slack_integration_leftovers.apply_async((organization.pk,)) - if settings.FEATURE_MULTIREGION_ENABLED: - delete_slack_connector_async.apply_async((slack_team_identity.slack_id,)) - write_chatops_insight_log( - author=request.user, - event_name=ChatOpsEvent.WORKSPACE_DISCONNECTED, - chatops_type=ChatOpsType.SLACK, + if settings.SLACK_INTEGRATION_MAINTENANCE_ENABLED: + response = Response( + "Grafana OnCall is temporarily unable to connect your Slack account or install OnCall to your Slack workspace", + status=400, ) - unpopulate_slack_user_identities(organization.pk, True) - response = Response(status=200) else: - response = Response(status=400) - + organization = request.auth.organization + slack_team_identity = organization.slack_team_identity + if slack_team_identity is not None: + clean_slack_integration_leftovers.apply_async((organization.pk,)) + if settings.FEATURE_MULTIREGION_ENABLED: + delete_slack_connector_async.apply_async((slack_team_identity.slack_id,)) + write_chatops_insight_log( + author=request.user, + event_name=ChatOpsEvent.WORKSPACE_DISCONNECTED, + chatops_type=ChatOpsType.SLACK, + ) + unpopulate_slack_user_identities(organization.pk, True) + response = Response(status=200) + else: + response = Response(status=400) return response diff --git a/engine/apps/webhooks/listeners.py b/engine/apps/webhooks/listeners.py index fad28096..31c83637 100644 --- a/engine/apps/webhooks/listeners.py +++ b/engine/apps/webhooks/listeners.py @@ -13,7 +13,8 @@ def on_alert_created(**kwargs): alert_pk = kwargs["alert"] alert = Alert.objects.get(pk=alert_pk) - alert_group_created.apply_async((alert.group_id,)) + if alert.is_the_first_alert_in_group: + alert_group_created.apply_async((alert.group_id,)) def on_action_triggered(**kwargs): diff --git a/engine/apps/webhooks/migrations/0002_auto_20230320_1604.py 
b/engine/apps/webhooks/migrations/0002_auto_20230320_1604.py new file mode 100644 index 00000000..78a6dba4 --- /dev/null +++ b/engine/apps/webhooks/migrations/0002_auto_20230320_1604.py @@ -0,0 +1,38 @@ +# Generated by Django 3.2.18 on 2023-03-20 16:04 + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone + +import django_migration_linter as linter + + +class Migration(migrations.Migration): + + dependencies = [ + ('alerts', '0010_channelfilter_filtering_term_type'), + ('webhooks', '0001_initial'), + ] + + operations = [ + linter.IgnoreMigration(), + migrations.CreateModel( + name='WebhookResponse', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('trigger_type', models.IntegerField(choices=[(0, 'Escalation step'), (1, 'Triggered'), (2, 'Acknowledged'), (3, 'Resolved'), (4, 'Silenced'), (5, 'Unsilenced'), (6, 'Unresolved')])), + ('timestamp', models.DateTimeField(default=django.utils.timezone.now)), + ('request_trigger', models.TextField(default=None, null=True)), + ('request_headers', models.TextField(default=None, null=True)), + ('request_data', models.TextField(default=None, null=True)), + ('url', models.TextField(default=None, null=True)), + ('status_code', models.IntegerField(default=None, null=True)), + ('content', models.TextField(default=None, null=True)), + ('alert_group', models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, related_name='webhook_responses', to='alerts.alertgroup')), + ('webhook', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='responses', to='webhooks.webhook')), + ], + ), + migrations.DeleteModel( + name='WebhookLog', + ), + ] diff --git a/engine/apps/webhooks/models/__init__.py b/engine/apps/webhooks/models/__init__.py index 8b05fae9..d276a7cc 100644 --- a/engine/apps/webhooks/models/__init__.py +++ b/engine/apps/webhooks/models/__init__.py @@ -1 +1 @@ 
-from .webhook import Webhook, WebhookLog # noqa: F401 +from .webhook import Webhook, WebhookResponse # noqa: F401 diff --git a/engine/apps/webhooks/models/webhook.py b/engine/apps/webhooks/models/webhook.py index 7ff9c097..ad6b679f 100644 --- a/engine/apps/webhooks/models/webhook.py +++ b/engine/apps/webhooks/models/webhook.py @@ -244,19 +244,27 @@ class Webhook(models.Model): return result -class WebhookLog(models.Model): - last_run_at = models.DateTimeField(blank=True, null=True) - input_data = models.JSONField(default=None) - url = models.TextField(null=True, default=None) - trigger = models.TextField(null=True, default=None) - headers = models.TextField(null=True, default=None) - data = models.TextField(null=True, default=None) - response_status = models.CharField(max_length=100, null=True, default=None) - response = models.TextField(null=True, default=None) - webhook = models.ForeignKey( - to="webhooks.Webhook", +class WebhookResponse(models.Model): + alert_group = models.ForeignKey( + "alerts.AlertGroup", on_delete=models.CASCADE, - related_name="logs", - blank=False, - null=False, + null=True, + related_name="webhook_responses", ) + webhook = models.ForeignKey( + "webhooks.Webhook", + on_delete=models.SET_NULL, + null=True, + related_name="responses", + ) + trigger_type = models.IntegerField(choices=Webhook.TRIGGER_TYPES) + timestamp = models.DateTimeField(default=timezone.now) + request_trigger = models.TextField(null=True, default=None) + request_headers = models.TextField(null=True, default=None) + request_data = models.TextField(null=True, default=None) + url = models.TextField(null=True, default=None) + status_code = models.IntegerField(default=None, null=True) + content = models.TextField(null=True, default=None) + + def json(self): + return json.loads(self.content) diff --git a/engine/apps/webhooks/tasks/trigger_webhook.py b/engine/apps/webhooks/tasks/trigger_webhook.py index 6dffb655..dd9a4bf3 100644 --- 
a/engine/apps/webhooks/tasks/trigger_webhook.py +++ b/engine/apps/webhooks/tasks/trigger_webhook.py @@ -5,11 +5,10 @@ from json import JSONDecodeError from celery.utils.log import get_task_logger from django.apps import apps from django.conf import settings -from django.utils import timezone from apps.alerts.models import AlertGroup from apps.user_management.models import User -from apps.webhooks.models import Webhook, WebhookLog +from apps.webhooks.models import Webhook, WebhookResponse from apps.webhooks.utils import ( InvalidWebhookData, InvalidWebhookHeaders, @@ -23,6 +22,16 @@ logger = get_task_logger(__name__) logger.setLevel(logging.DEBUG) +TRIGGER_TYPE_TO_LABEL = { + Webhook.TRIGGER_NEW: "firing", + Webhook.TRIGGER_ACKNOWLEDGE: "acknowledge", + Webhook.TRIGGER_RESOLVE: "resolve", + Webhook.TRIGGER_SILENCE: "silence", + Webhook.TRIGGER_UNSILENCE: "unsilence", + Webhook.TRIGGER_UNRESOLVE: "unresolve", +} + + @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None ) @@ -38,45 +47,36 @@ def _isoformat_date(date_value): return date_value.isoformat() if date_value else None -def _build_payload(trigger_type, alert_group_id, user_id): +def _build_payload(trigger_type, alert_group, user_id): user = None - try: - alert_group = AlertGroup.unarchived_objects.get(pk=alert_group_id) - if user_id is not None: - user = User.objects.filter(pk=user_id).first() - except AlertGroup.DoesNotExist: - return + if user_id is not None: + user = User.objects.filter(pk=user_id).first() + event = { + "type": TRIGGER_TYPE_TO_LABEL[trigger_type], + } if trigger_type == Webhook.TRIGGER_NEW: - event = { - "type": "Firing", - "time": _isoformat_date(alert_group.started_at), - } + event["time"] = _isoformat_date(alert_group.started_at) elif trigger_type == Webhook.TRIGGER_ACKNOWLEDGE: - event = { - "type": "Acknowledge", - "time": _isoformat_date(alert_group.acknowledged_at), - } + event["time"] = 
_isoformat_date(alert_group.acknowledged_at) elif trigger_type == Webhook.TRIGGER_RESOLVE: - event = { - "type": "Resolve", - "time": _isoformat_date(alert_group.resolved_at), - } + event["time"] = _isoformat_date(alert_group.resolved_at) elif trigger_type == Webhook.TRIGGER_SILENCE: - event = { - "type": "Silence", - "time": _isoformat_date(alert_group.silenced_at), - "until": _isoformat_date(alert_group.silenced_until), - } - elif trigger_type == Webhook.TRIGGER_UNSILENCE: - event = { - "type": "Unsilence", - } - elif trigger_type == Webhook.TRIGGER_UNRESOLVE: - event = { - "type": "Unresolve", - } - data = serialize_event(event, alert_group, user) + event["time"] = _isoformat_date(alert_group.silenced_at) + event["until"] = _isoformat_date(alert_group.silenced_until) + + # include latest response data per trigger in the event input data + responses_data = {} + responses = alert_group.webhook_responses.all().order_by("timestamp") + for r in responses: + try: + response_data = r.json() + except JSONDecodeError: + response_data = r.content + responses_data[TRIGGER_TYPE_TO_LABEL[r.trigger_type]] = response_data + + data = serialize_event(event, alert_group, user, responses_data) + return data @@ -91,54 +91,60 @@ def execute_webhook(webhook_pk, alert_group_id, user_id): logger.warn(f"Webhook {webhook_pk} does not exist") return - data = _build_payload(webhook.trigger_type, alert_group_id, user_id) + try: + alert_group = AlertGroup.unarchived_objects.get(pk=alert_group_id) + except AlertGroup.DoesNotExist: + return + + data = _build_payload(webhook.trigger_type, alert_group, user_id) status = { - "last_run_at": timezone.now(), - "input_data": data, "url": None, - "trigger": None, - "headers": None, - "data": None, - "response_status": None, - "response": None, + "request_trigger": None, + "request_headers": None, + "request_data": data, + "status_code": None, + "content": None, + "webhook": webhook, } exception = None try: - triggered, status["trigger"] = 
webhook.check_trigger(data) + triggered, status["request_trigger"] = webhook.check_trigger(data) if triggered: status["url"] = webhook.build_url(data) request_kwargs = webhook.build_request_kwargs(data, raise_data_errors=True) - status["headers"] = json.dumps(request_kwargs.get("headers", {})) - if webhook.forward_all: - status["data"] = "All input_data forwarded as payload" - elif "json" in request_kwargs: - status["data"] = json.dumps(request_kwargs["json"]) + status["request_headers"] = json.dumps(request_kwargs.get("headers", {})) + if "json" in request_kwargs: + status["request_data"] = json.dumps(request_kwargs["json"]) else: - status["data"] = request_kwargs.get("data") + status["request_data"] = request_kwargs.get("data") response = webhook.make_request(status["url"], request_kwargs) - status["response_status"] = response.status_code + status["status_code"] = response.status_code try: - status["response"] = json.dumps(response.json()) + status["content"] = json.dumps(response.json()) except JSONDecodeError: - status["response"] = response.content.decode("utf-8") + status["content"] = response.content.decode("utf-8") else: # do not add a log entry if the webhook is not triggered return except InvalidWebhookUrl as e: status["url"] = e.message except InvalidWebhookTrigger as e: - status["trigger"] = e.message + status["request_trigger"] = e.message except InvalidWebhookHeaders as e: - status["headers"] = e.message + status["request_headers"] = e.message except InvalidWebhookData as e: - status["data"] = e.message + status["request_data"] = e.message except Exception as e: - status["response"] = str(e) + status["content"] = str(e) exception = e - # create/update log entry - WebhookLog.objects.update_or_create(webhook_id=webhook_pk, defaults=status) + # create response entry + WebhookResponse.objects.create( + alert_group=alert_group, + trigger_type=webhook.trigger_type, + **status, + ) if exception: raise exception diff --git 
a/engine/apps/webhooks/tests/factories.py b/engine/apps/webhooks/tests/factories.py index 7ff6ec40..b456d41a 100644 --- a/engine/apps/webhooks/tests/factories.py +++ b/engine/apps/webhooks/tests/factories.py @@ -1,6 +1,6 @@ import factory -from apps.webhooks.models import Webhook +from apps.webhooks.models import Webhook, WebhookResponse from common.utils import UniqueFaker @@ -11,3 +11,10 @@ class CustomWebhookFactory(factory.DjangoModelFactory): class Meta: model = Webhook + + +class WebhookResponseFactory(factory.DjangoModelFactory): + timestamp = factory.Faker("date_time") + + class Meta: + model = WebhookResponse diff --git a/engine/apps/webhooks/tests/test_trigger_webhook.py b/engine/apps/webhooks/tests/test_trigger_webhook.py index ba41d11d..4a8408ab 100644 --- a/engine/apps/webhooks/tests/test_trigger_webhook.py +++ b/engine/apps/webhooks/tests/test_trigger_webhook.py @@ -100,12 +100,43 @@ def test_execute_webhook_ok( ) assert mock_requests.post.call_args == expected_call # check logs - log = webhook.logs.all()[0] - assert log.response_status == "200" - assert log.response == json.dumps(mock_response.json()) + log = webhook.responses.all()[0] + assert log.status_code == 200 + assert log.content == json.dumps(mock_response.json()) + assert log.request_data == json.dumps({"value": alert_group.public_primary_key}) + assert log.request_headers == json.dumps({"some-header": alert_group.public_primary_key}) + assert log.url == "https://something/{}/".format(alert_group.public_primary_key) + + +@pytest.mark.django_db +def test_execute_webhook_ok_forward_all( + make_organization, make_user_for_organization, make_alert_receive_channel, make_alert_group, make_custom_webhook +): + organization = make_organization() + user = make_user_for_organization(organization) + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group( + alert_receive_channel, acknowledged_at=timezone.now(), acknowledged=True, acknowledged_by=user.pk + ) + 
webhook = make_custom_webhook( + organization=organization, + url="https://something/{{ alert_group_id }}/", + http_method="POST", + trigger_type=Webhook.TRIGGER_ACKNOWLEDGE, + forward_all=True, + ) + + mock_response = MockResponse() + with patch("apps.webhooks.utils.socket.gethostbyname") as mock_gethostbyname: + mock_gethostbyname.return_value = "8.8.8.8" + with patch("apps.webhooks.models.webhook.requests") as mock_requests: + mock_requests.post.return_value = mock_response + execute_webhook(webhook.pk, alert_group.pk, user.pk) + + assert mock_requests.post.called expected_data = { "event": { - "type": "Acknowledge", + "type": "acknowledge", "time": alert_group.acknowledged_at.isoformat(), }, "user": user.username, @@ -113,12 +144,78 @@ def test_execute_webhook_ok( "alert_group_id": alert_group.public_primary_key, "alert_payload": "", } - assert log.input_data == expected_data - assert log.data == json.dumps({"value": alert_group.public_primary_key}) - assert log.headers == json.dumps({"some-header": alert_group.public_primary_key}) + expected_call = call( + "https://something/{}/".format(alert_group.public_primary_key), + timeout=10, + headers={}, + json=expected_data, + ) + assert mock_requests.post.call_args == expected_call + # check logs + log = webhook.responses.all()[0] + assert log.status_code == 200 + assert log.content == json.dumps(mock_response.json()) + assert json.loads(log.request_data) == expected_data assert log.url == "https://something/{}/".format(alert_group.public_primary_key) +@pytest.mark.django_db +def test_execute_webhook_using_responses_data( + make_organization, + make_user_for_organization, + make_alert_receive_channel, + make_alert_group, + make_custom_webhook, + make_webhook_response, +): + organization = make_organization() + user = make_user_for_organization(organization) + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group( + alert_receive_channel, acknowledged_at=timezone.now(), 
acknowledged=True, acknowledged_by=user.pk + ) + webhook = make_custom_webhook( + organization=organization, + url="https://something/{{ responses.firing.id }}/", + http_method="POST", + trigger_type=Webhook.TRIGGER_RESOLVE, + data='{"value": "{{ responses.acknowledge.status }}"}', + forward_all=False, + ) + # add previous webhook responses for the related alert group + make_webhook_response( + alert_group=alert_group, trigger_type=Webhook.TRIGGER_NEW, content=json.dumps({"id": "third-party-id"}) + ) + make_webhook_response( + alert_group=alert_group, + trigger_type=Webhook.TRIGGER_ACKNOWLEDGE, + content=json.dumps({"id": "third-party-id", "status": "updated"}), + ) + + mock_response = MockResponse() + with patch("apps.webhooks.utils.socket.gethostbyname") as mock_gethostbyname: + mock_gethostbyname.return_value = "8.8.8.8" + with patch("apps.webhooks.models.webhook.requests") as mock_requests: + mock_requests.post.return_value = mock_response + execute_webhook(webhook.pk, alert_group.pk, user.pk) + + assert mock_requests.post.called + expected_data = {"value": "updated"} + expected_call = call( + "https://something/third-party-id/", + timeout=10, + headers={}, + json=expected_data, + ) + assert mock_requests.post.call_args == expected_call + # check logs + log = webhook.responses.all()[0] + assert log.status_code == 200 + assert log.content == json.dumps(mock_response.json()) + assert json.loads(log.request_data) == expected_data + assert log.url == "https://something/third-party-id/" + + @pytest.mark.django_db def test_execute_webhook_trigger_false( make_organization, make_alert_receive_channel, make_alert_group, make_custom_webhook @@ -139,7 +236,7 @@ def test_execute_webhook_trigger_false( assert not mock_requests.post.called # check no logs - assert webhook.logs.count() == 0 + assert webhook.responses.count() == 0 @pytest.mark.django_db @@ -155,11 +252,16 @@ def test_execute_webhook_trigger_false( ( "trigger_template", "{{ }}", - "trigger", + 
"request_trigger", "Trigger - Template Error: Expected an expression, got 'end of print statement'", ), - ("headers", '"{{foo|invalid}}"', "headers", "Headers - Template Error: No filter named 'invalid'."), - ("data", "{{ }}", "data", "Data - Template Error: Expected an expression, got 'end of print statement'"), + ("headers", '"{{foo|invalid}}"', "request_headers", "Headers - Template Error: No filter named 'invalid'."), + ( + "data", + "{{ }}", + "request_data", + "Data - Template Error: Expected an expression, got 'end of print statement'", + ), ], ) def test_execute_webhook_errors( @@ -194,19 +296,8 @@ def test_execute_webhook_errors( execute_webhook(webhook.pk, alert_group.pk, None) assert not mock_requests.post.called - log = webhook.logs.all()[0] - assert log.response_status is None - assert log.response is None - expected_data = { - "event": { - "type": "Resolve", - "time": alert_group.resolved_at.isoformat(), - }, - "user": None, - "alert_group": IncidentSerializer(alert_group).data, - "alert_group_id": alert_group.public_primary_key, - "alert_payload": "", - } - assert log.input_data == expected_data + log = webhook.responses.all()[0] + assert log.status_code is None + assert log.content is None error = getattr(log, log_field_name) assert error == expected_error diff --git a/engine/apps/webhooks/utils.py b/engine/apps/webhooks/utils.py index f9014990..092c169c 100644 --- a/engine/apps/webhooks/utils.py +++ b/engine/apps/webhooks/utils.py @@ -113,7 +113,7 @@ class EscapeDoubleQuotesDict(dict): return original_str -def serialize_event(event, alert_group, user): +def serialize_event(event, alert_group, user, responses=None): from apps.public_api.serializers import IncidentSerializer alert_payload = alert_group.alerts.first() @@ -128,4 +128,7 @@ def serialize_event(event, alert_group, user): "alert_group_id": alert_group.public_primary_key, "alert_payload": alert_payload_raw, } + if responses: + data["responses"] = responses + return data diff --git 
a/engine/common/database.py b/engine/common/database.py index dd40a39e..d2ca1929 100644 --- a/engine/common/database.py +++ b/engine/common/database.py @@ -5,11 +5,11 @@ from django.conf import settings def get_random_readonly_database_key_if_present_otherwise_default() -> str: """ - This function returns a string, representing a key in the DATABASES django settings. - If settings.READONLY_DATABASES is set, and non-empty, it randomly chooses one of the read-only databases, + This function returns a string, representing a key in the `DATABASES` django settings. + If `settings.READONLY_DATABASES` is set, and non-empty, it randomly chooses one of the read-only databases, otherwise it falls back to "default". - This is primarily intended to be used for django's QuerySet.using() function + This is primarily intended to be used for django's `QuerySet.using()` function """ using_db = "default" if hasattr(settings, "READONLY_DATABASES") and len(settings.READONLY_DATABASES) > 0: diff --git a/engine/conftest.py b/engine/conftest.py index b4795ded..a79e18b0 100644 --- a/engine/conftest.py +++ b/engine/conftest.py @@ -81,7 +81,7 @@ from apps.telegram.tests.factories import ( from apps.twilioapp.tests.factories import PhoneCallFactory, SMSFactory from apps.user_management.models.user import User, listen_for_user_model_save from apps.user_management.tests.factories import OrganizationFactory, RegionFactory, TeamFactory, UserFactory -from apps.webhooks.tests.factories import CustomWebhookFactory +from apps.webhooks.tests.factories import CustomWebhookFactory, WebhookResponseFactory register(OrganizationFactory) register(UserFactory) @@ -626,6 +626,15 @@ def make_custom_webhook(): return _make_custom_webhook +@pytest.fixture +def make_webhook_response(): + def _make_webhook_response(**kwargs): + webhook_response = WebhookResponseFactory(**kwargs) + return webhook_response + + return _make_webhook_response + + @pytest.fixture def make_slack_user_group(): def 
_make_slack_user_group(slack_team_identity, **kwargs): diff --git a/engine/engine/management/commands/restart_escalation.py b/engine/engine/management/commands/restart_escalation.py index 6c4c0823..8ceaf01d 100644 --- a/engine/engine/management/commands/restart_escalation.py +++ b/engine/engine/management/commands/restart_escalation.py @@ -49,9 +49,6 @@ class Command(BaseCommand): alert_group.unsilence_task_uuid = task_id escalation_start_time = max(now, alert_group.silenced_until) - alert_group.estimate_escalation_finish_time = alert_group.calculate_eta_for_finish_escalation( - start_time=escalation_start_time, - ) alert_groups_to_update.append(alert_group) tasks.append( @@ -65,9 +62,6 @@ class Command(BaseCommand): # otherwise start escalate_alert_group task else: if alert_group.escalation_snapshot: - alert_group.estimate_escalation_finish_time = alert_group.calculate_eta_for_finish_escalation( - escalation_started=True, - ) alert_group.active_escalation_id = task_id alert_groups_to_update.append(alert_group) @@ -82,7 +76,7 @@ class Command(BaseCommand): AlertGroup.all_objects.bulk_update( alert_groups_to_update, - ["estimate_escalation_finish_time", "active_escalation_id", "unsilence_task_uuid"], + ["active_escalation_id", "unsilence_task_uuid"], batch_size=5000, ) diff --git a/engine/engine/management/commands/start_celery.py b/engine/engine/management/commands/start_celery.py index e61a9430..23ced0ec 100644 --- a/engine/engine/management/commands/start_celery.py +++ b/engine/engine/management/commands/start_celery.py @@ -1,9 +1,12 @@ +import os import shlex import subprocess from django.core.management.base import BaseCommand from django.utils import autoreload +from common.utils import getenv_boolean + WORKER_ID = 0 @@ -11,8 +14,18 @@ def restart_celery(*args, **kwargs): global WORKER_ID kill_worker_cmd = "celery -A engine control shutdown" subprocess.call(shlex.split(kill_worker_cmd)) - start_worker_cmd = "celery -A engine worker -l info --concurrency=3 -Q 
celery,retry -n {}".format(WORKER_ID) - subprocess.call(shlex.split(start_worker_cmd)) + + queues = os.environ.get("CELERY_WORKER_QUEUE", "celery,retry") + max_tasks_per_child = os.environ.get("CELERY_WORKER_MAX_TASKS_PER_CHILD", 100) + concurrency = os.environ.get("CELERY_WORKER_CONCURRENCY", 3) + log_level = "debug" if getenv_boolean("CELERY_WORKER_DEBUG_LOGS", False) else "info" + + celery_args = f"-A engine worker -l {log_level} --concurrency={concurrency} -Q {queues} --max-tasks-per-child={max_tasks_per_child} -n {WORKER_ID}" + + if getenv_boolean("CELERY_WORKER_BEAT_ENABLED", False): + celery_args += " --beat" + + subprocess.call(shlex.split(f"celery {celery_args}")) WORKER_ID = 1 + WORKER_ID diff --git a/engine/requirements.txt b/engine/requirements.txt index 5a8a1837..bc2769f4 100644 --- a/engine/requirements.txt +++ b/engine/requirements.txt @@ -51,3 +51,4 @@ pyroscope-io==0.8.1 django-dbconn-retry==0.1.7 django-ipware==4.0.2 django-anymail==8.6 +django-deprecate-fields==0.1.1 diff --git a/engine/settings/base.py b/engine/settings/base.py index dc611872..c647fb51 100644 --- a/engine/settings/base.py +++ b/engine/settings/base.py @@ -395,6 +395,10 @@ CELERY_MAX_TASKS_PER_CHILD = 1 CELERY_WORKER_SEND_TASK_EVENTS = True CELERY_TASK_SEND_SENT_EVENT = True +ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL = os.getenv( + "ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_URL", None +) + CELERY_BEAT_SCHEDULE = { "restore_heartbeat_tasks": { "task": "apps.heartbeat.tasks.restore_heartbeat_tasks", @@ -403,7 +407,11 @@ CELERY_BEAT_SCHEDULE = { }, "check_escalations": { "task": "apps.alerts.tasks.check_escalation_finished.check_escalation_finished_task", - "schedule": 10 * 60, + # the task should be executed a minute or two less than the integration's configured interval + # + # ex. 
if the integration is configured to expect a heartbeat every 15 minutes then this value should be set + # to something like 13 * 60 (every 13 minutes) + "schedule": getenv_integer("ALERT_GROUP_ESCALATION_AUDITOR_CELERY_TASK_HEARTBEAT_INTERVAL", 13 * 60), "args": (), }, "start_refresh_ical_files": { @@ -509,6 +517,9 @@ SLACK_CLIENT_OAUTH_SECRET = os.environ.get("SLACK_CLIENT_OAUTH_SECRET") SLACK_SLASH_COMMAND_NAME = os.environ.get("SLACK_SLASH_COMMAND_NAME", "/oncall") SLACK_DIRECT_PAGING_SLASH_COMMAND = os.environ.get("SLACK_DIRECT_PAGING_SLASH_COMMAND", "/escalate") +# Controls if slack integration can be installed/uninstalled. +SLACK_INTEGRATION_MAINTENANCE_ENABLED = os.environ.get("SLACK_INTEGRATION_MAINTENANCE_ENABLED", False) + SOCIAL_AUTH_SLACK_LOGIN_KEY = SLACK_CLIENT_OAUTH_ID SOCIAL_AUTH_SLACK_LOGIN_SECRET = SLACK_CLIENT_OAUTH_SECRET diff --git a/grafana-plugin/integration-tests/escalationChains/searching.test.ts b/grafana-plugin/integration-tests/escalationChains/searching.test.ts new file mode 100644 index 00000000..e9eaba5e --- /dev/null +++ b/grafana-plugin/integration-tests/escalationChains/searching.test.ts @@ -0,0 +1,36 @@ +import { test, expect, Page } from '@playwright/test'; +import { configureOnCallPlugin } from '../utils/configurePlugin'; +import { generateRandomValue } from '../utils/forms'; +import { createEscalationChain } from '../utils/escalationChain'; + +test.beforeEach(async ({ page }) => { + await configureOnCallPlugin(page); +}); + +const assertEscalationChainSearchWorks = async ( + page: Page, + searchTerm: string, + escalationChainFullName: string +): Promise => { + await page.getByTestId('escalation-chain-search-input').fill(searchTerm); + + // wait for the API call(s) to finish + await page.waitForLoadState('networkidle'); + + await expect(page.getByTestId('escalation-chains-list')).toHaveText(escalationChainFullName); +}; + +test('searching allows case-insensitive partial matches', async ({ page }) => { + const escalationChainName 
= `${generateRandomValue()} ${generateRandomValue()}`; + const [firstHalf, secondHalf] = escalationChainName.split(' '); + + await createEscalationChain(page, escalationChainName); + + await assertEscalationChainSearchWorks(page, firstHalf, escalationChainName); + await assertEscalationChainSearchWorks(page, firstHalf.toUpperCase(), escalationChainName); + await assertEscalationChainSearchWorks(page, firstHalf.toLowerCase(), escalationChainName); + + await assertEscalationChainSearchWorks(page, secondHalf, escalationChainName); + await assertEscalationChainSearchWorks(page, secondHalf.toUpperCase(), escalationChainName); + await assertEscalationChainSearchWorks(page, secondHalf.toLowerCase(), escalationChainName); +}); diff --git a/grafana-plugin/integration-tests/integrations/uniqueIntegrationNames.test.ts b/grafana-plugin/integration-tests/integrations/uniqueIntegrationNames.test.ts new file mode 100644 index 00000000..06fc9284 --- /dev/null +++ b/grafana-plugin/integration-tests/integrations/uniqueIntegrationNames.test.ts @@ -0,0 +1,16 @@ +import { test, expect } from '@playwright/test'; +import { configureOnCallPlugin } from '../utils/configurePlugin'; +import { openCreateIntegrationModal } from '../utils/integrations'; + +test.beforeEach(async ({ page }) => { + await configureOnCallPlugin(page); +}); + +test('integrations have unique names', async ({ page }) => { + await openCreateIntegrationModal(page); + + const integrationNames = await page.getByTestId('integration-display-name').allInnerTexts(); + const uniqueLowercasedIntegrationNames = new Set(integrationNames.map((elem) => elem.toLowerCase())); + + expect(uniqueLowercasedIntegrationNames.size).toEqual(integrationNames.length); +}); diff --git a/grafana-plugin/integration-tests/utils/alertGroup.ts b/grafana-plugin/integration-tests/utils/alertGroup.ts index f729f1a7..7f9b39fc 100644 --- a/grafana-plugin/integration-tests/utils/alertGroup.ts +++ b/grafana-plugin/integration-tests/utils/alertGroup.ts @@ 
-15,7 +15,7 @@ const incidentTimelineContainsStep = async (page: Page, triggeredStepText: strin return Promise.resolve(false); } - if (!page.locator('div[data-testid="incident-timeline-list"]').getByText(triggeredStepText)) { + if (!page.getByTestId('incident-timeline-list').getByText(triggeredStepText)) { await page.reload({ waitUntil: 'networkidle' }); return incidentTimelineContainsStep(page, triggeredStepText, (retryNum += 1)); } diff --git a/grafana-plugin/integration-tests/utils/escalationChain.ts b/grafana-plugin/integration-tests/utils/escalationChain.ts index 3a4263ce..f2bec766 100644 --- a/grafana-plugin/integration-tests/utils/escalationChain.ts +++ b/grafana-plugin/integration-tests/utils/escalationChain.ts @@ -16,8 +16,8 @@ const escalationStepValuePlaceholder: Record = { export const createEscalationChain = async ( page: Page, escalationChainName: string, - escalationStep: EscalationStep | null, - escalationStepValue: string | null + escalationStep?: EscalationStep, + escalationStepValue?: string ): Promise => { // go to the escalation chains page await goToOnCallPage(page, 'escalations'); @@ -32,7 +32,7 @@ export const createEscalationChain = async ( await clickButton({ page, buttonText: 'Create' }); await page.waitForSelector(`text=${escalationChainName}`); - if (!escalationStep) { + if (!escalationStep || !escalationStepValue) { return; } diff --git a/grafana-plugin/integration-tests/utils/integrations.ts b/grafana-plugin/integration-tests/utils/integrations.ts index eb93ceff..8e54eaa1 100644 --- a/grafana-plugin/integration-tests/utils/integrations.ts +++ b/grafana-plugin/integration-tests/utils/integrations.ts @@ -2,19 +2,28 @@ import { Page } from '@playwright/test'; import { clickButton, fillInInput, selectDropdownValue } from './forms'; import { goToOnCallPage } from './navigation'; -export const createIntegrationAndSendDemoAlert = async ( - page: Page, - integrationName: string, - escalationChainName: string -): Promise => { +const 
CREATE_INTEGRATION_MODAL_TEST_ID_SELECTOR = 'div[data-testid="create-integration-modal"]'; + +export const openCreateIntegrationModal = async (page: Page): Promise => { // go to the integrations page await goToOnCallPage(page, 'integrations'); // open the create integration modal (await page.waitForSelector('text=New integration to receive alerts')).click(); + // wait for it to pop up + await page.waitForSelector(CREATE_INTEGRATION_MODAL_TEST_ID_SELECTOR); +}; + +export const createIntegrationAndSendDemoAlert = async ( + page: Page, + integrationName: string, + escalationChainName: string +): Promise => { + await openCreateIntegrationModal(page); + // create a webhook integration - (await page.waitForSelector('div[data-testid="create-integration-modal"] >> text=Webhook')).click(); + (await page.waitForSelector(`${CREATE_INTEGRATION_MODAL_TEST_ID_SELECTOR} >> text=Webhook`)).click(); // wait for the integrations settings modal to open up... and then close it await clickButton({ page, buttonText: 'Open Escalations Settings' }); @@ -24,7 +33,7 @@ export const createIntegrationAndSendDemoAlert = async ( await fillInInput(page, 'div[data-testid="edit-integration-name-modal"] >> input', integrationName); await clickButton({ page, buttonText: 'Update' }); - const integrationSettingsElement = page.locator('div[data-testid="integration-settings"]'); + const integrationSettingsElement = page.getByTestId('integration-settings'); // assign the escalation chain to the integration await selectDropdownValue({ diff --git a/grafana-plugin/src/components/AlertTemplates/AlertTemplatesForm.module.css b/grafana-plugin/src/components/AlertTemplates/AlertTemplatesForm.module.css index f5da34f8..924951cf 100644 --- a/grafana-plugin/src/components/AlertTemplates/AlertTemplatesForm.module.css +++ b/grafana-plugin/src/components/AlertTemplates/AlertTemplatesForm.module.css @@ -46,10 +46,6 @@ margin-top: 8px; } -.payloadExample { - margin-top: 24px; -} - .autoresolve-condition section { 
border: 1px solid var(--primary-text-link); } diff --git a/grafana-plugin/src/components/AlertTemplates/AlertTemplatesForm.tsx b/grafana-plugin/src/components/AlertTemplates/AlertTemplatesForm.tsx index 9cef300f..7f6637cf 100644 --- a/grafana-plugin/src/components/AlertTemplates/AlertTemplatesForm.tsx +++ b/grafana-plugin/src/components/AlertTemplates/AlertTemplatesForm.tsx @@ -18,7 +18,7 @@ import { AlertReceiveChannel } from 'models/alert_receive_channel/alert_receive_ import { Alert } from 'models/alertgroup/alertgroup.types'; import { makeRequest } from 'network'; import LocationHelper from 'utils/LocationHelper'; -import { UserActions } from 'utils/authorization'; +import { UserActions, isUserActionAllowed } from 'utils/authorization'; import styles from './AlertTemplatesForm.module.css'; @@ -254,29 +254,29 @@ const AlertTemplatesForm = (props: AlertTemplatesFormProps) => { {templates?.payload_example ? ( - - - - - {groups[activeGroup].map((template) => ( - - ))} - - -
+ + {isUserActionAllowed(UserActions.IntegrationsTest) && ( - - {JSON.stringify(templates?.payload_example, null, 4)} + + + {groups[activeGroup].map((template) => ( + + ))} + -
+ )} + + + {JSON.stringify(templates?.payload_example, null, 4)} +
) : ( sendDemoAlertBlock diff --git a/grafana-plugin/src/components/EscalationsFilters/EscalationsFilters.tsx b/grafana-plugin/src/components/EscalationsFilters/EscalationsFilters.tsx index bcbf3134..f2387261 100644 --- a/grafana-plugin/src/components/EscalationsFilters/EscalationsFilters.tsx +++ b/grafana-plugin/src/components/EscalationsFilters/EscalationsFilters.tsx @@ -39,6 +39,7 @@ const EscalationsFilters: FC = (props) => {
} placeholder="Search escalations..." diff --git a/grafana-plugin/src/components/Policy/NotificationPolicy.tsx b/grafana-plugin/src/components/Policy/NotificationPolicy.tsx index bc97f132..28b34aa9 100644 --- a/grafana-plugin/src/components/Policy/NotificationPolicy.tsx +++ b/grafana-plugin/src/components/Policy/NotificationPolicy.tsx @@ -13,6 +13,8 @@ import { NotificationPolicyType, prepareNotificationPolicy } from 'models/notifi import { NotifyBy } from 'models/notify_by'; import { User } from 'models/user/user.types'; import { WaitDelay } from 'models/wait_delay'; +import { RootStore } from 'state'; +import { AppFeature } from 'state/features'; import { UserAction } from 'utils/authorization'; import DragHandle from './DragHandle'; @@ -41,6 +43,7 @@ export interface NotificationPolicyProps { color: string; number: number; userAction: UserAction; + store: RootStore; } export class NotificationPolicy extends React.Component { @@ -149,7 +152,11 @@ export class NotificationPolicy extends React.ComponentTelegram is connected diff --git a/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.module.css b/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.module.css deleted file mode 100644 index 6dc02ce6..00000000 --- a/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.module.css +++ /dev/null @@ -1,31 +0,0 @@ -.root { - font-size: 12px; - line-height: 16px; - padding: 3px 4px; -} - -.root__type_link { - background: rgba(27, 133, 94, 0.15); - border: 1px solid var(--tag-border-success); - border-radius: 2px; -} - -.root__type_warning { - background: rgba(245, 183, 61, 0.18); - border: 1px solid var(--tag-border-warning); - border-radius: 2px; -} - -.text__type_link, -.icon__type_link { - color: var(--tag-text-success); -} - -.text__type_warning, -.icon__type_warning { - color: var(--tag-text-warning); -} - -.tooltip { - width: auto; -} diff --git a/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.module.scss 
b/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.module.scss new file mode 100644 index 00000000..2958c9f6 --- /dev/null +++ b/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.module.scss @@ -0,0 +1,35 @@ +.element { + font-size: 12px; + line-height: 16px; + padding: 3px 4px; + + &--link { + background: rgba(27, 133, 94, 0.15); + border: 1px solid var(--tag-border-success); + border-radius: 2px; + } + + &--warning { + background: rgba(245, 183, 61, 0.18); + border: 1px solid var(--tag-border-warning); + border-radius: 2px; + } + + &--padding { + padding: 3px 10px; + } +} + +.element__text--link, +.element__icon--link { + color: var(--tag-text-success); +} + +.element__text--warning, +.element__icon--warning { + color: var(--tag-text-warning); +} + +.tooltip { + width: auto; +} diff --git a/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.tsx b/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.tsx index 4762d5f8..e9688769 100644 --- a/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.tsx +++ b/grafana-plugin/src/components/ScheduleCounter/ScheduleCounter.tsx @@ -5,7 +5,7 @@ import cn from 'classnames/bind'; import Text, { TextType } from 'components/Text/Text'; -import styles from './ScheduleCounter.module.css'; +import styles from './ScheduleCounter.module.scss'; interface ScheduleCounterProps { type: Partial; @@ -31,7 +31,7 @@ const ScheduleCounter: FC = (props) => { placement="bottom-start" interactive content={ -
+
{tooltipTitle} {tooltipContent} @@ -39,10 +39,16 @@ const ScheduleCounter: FC = (props) => {
} > -
+
- - {count} + + {count}
diff --git a/grafana-plugin/src/components/ScheduleQuality/ScheduleQuality.module.scss b/grafana-plugin/src/components/ScheduleQuality/ScheduleQuality.module.scss index 1ca938e9..1cfd657b 100644 --- a/grafana-plugin/src/components/ScheduleQuality/ScheduleQuality.module.scss +++ b/grafana-plugin/src/components/ScheduleQuality/ScheduleQuality.module.scss @@ -13,10 +13,6 @@ $score-danger: rgba(209, 14, 92, 0.15); line-height: 16px; } -.link { - text-decoration: none !important; -} - .tag { font-size: 12px; padding: 4px 10px 3px 10px; diff --git a/grafana-plugin/src/components/ScheduleQuality/ScheduleQuality.tsx b/grafana-plugin/src/components/ScheduleQuality/ScheduleQuality.tsx index 6deecb69..59fa3759 100644 --- a/grafana-plugin/src/components/ScheduleQuality/ScheduleQuality.tsx +++ b/grafana-plugin/src/components/ScheduleQuality/ScheduleQuality.tsx @@ -49,7 +49,7 @@ const ScheduleQuality: FC = ({ schedule, lastUpdated }) => {relatedEscalationChains.map((escalationChain) => (
- + {escalationChain.name}
diff --git a/grafana-plugin/src/components/Text/Text.tsx b/grafana-plugin/src/components/Text/Text.tsx index a6c87f0f..ba7664f0 100644 --- a/grafana-plugin/src/components/Text/Text.tsx +++ b/grafana-plugin/src/components/Text/Text.tsx @@ -52,6 +52,7 @@ const Text: TextInterface = (props) => { hidden = false, editModalTitle = 'New value', style, + ...rest } = props; const [isEditMode, setIsEditMode] = useState(false); @@ -88,6 +89,7 @@ const Text: TextInterface = (props) => { keyboard, })} style={style} + {...rest} > {hidden ? PLACEHOLDER : children} {editable && ( diff --git a/grafana-plugin/src/components/UserGroups/UserGroups.tsx b/grafana-plugin/src/components/UserGroups/UserGroups.tsx index 7dfb9dcc..c91ef3a8 100644 --- a/grafana-plugin/src/components/UserGroups/UserGroups.tsx +++ b/grafana-plugin/src/components/UserGroups/UserGroups.tsx @@ -110,7 +110,7 @@ const UserGroups = (props: UserGroupsProps) => { key={items.length} showSearch placeholder="Add user" - href="/users/?filters=true" + href="/users/?roles=0&roles=1&filters=true" value={null} onChange={handleUserAdd} showError={showError} diff --git a/grafana-plugin/src/containers/AlertRules/parts/index.tsx b/grafana-plugin/src/containers/AlertRules/parts/index.tsx index 76d51bcc..5f861b4f 100644 --- a/grafana-plugin/src/containers/AlertRules/parts/index.tsx +++ b/grafana-plugin/src/containers/AlertRules/parts/index.tsx @@ -6,6 +6,7 @@ import Timeline from 'components/Timeline/Timeline'; import SlackConnector from 'containers/AlertRules/parts/connectors/SlackConnector'; import TelegramConnector from 'containers/AlertRules/parts/connectors/TelegramConnector'; import { ChannelFilter } from 'models/channel_filter'; +import { AppFeature } from 'state/features'; import { useStore } from 'state/useStore'; import { getVar } from 'utils/DOM'; @@ -20,7 +21,8 @@ export const ChatOpsConnectors = (props: ChatOpsConnectorsProps) => { const { telegramChannelStore } = store; const isSlackInstalled = 
Boolean(store.teamStore.currentTeam?.slack_team_identity); - const isTelegramInstalled = Boolean(telegramChannelStore.currentTeamToTelegramChannel?.length > 0); + const isTelegramInstalled = + store.hasFeature(AppFeature.Telegram) && telegramChannelStore.currentTeamToTelegramChannel?.length > 0; if (!isSlackInstalled && !isTelegramInstalled) { return null; diff --git a/grafana-plugin/src/containers/CreateAlertReceiveChannelContainer/CreateAlertReceiveChannelContainer.tsx b/grafana-plugin/src/containers/CreateAlertReceiveChannelContainer/CreateAlertReceiveChannelContainer.tsx index e0d3b179..900b51c0 100644 --- a/grafana-plugin/src/containers/CreateAlertReceiveChannelContainer/CreateAlertReceiveChannelContainer.tsx +++ b/grafana-plugin/src/containers/CreateAlertReceiveChannelContainer/CreateAlertReceiveChannelContainer.tsx @@ -79,7 +79,9 @@ const CreateAlertReceiveChannelContainer = observer((props: CreateAlertReceiveCh
- {alertReceiveChannelChoice.display_name} + + {alertReceiveChannelChoice.display_name} + {alertReceiveChannelChoice.short_description} diff --git a/grafana-plugin/src/containers/HeartbeatModal/HeartbeatForm.tsx b/grafana-plugin/src/containers/HeartbeatModal/HeartbeatForm.tsx index 936afea5..cd48a4ad 100644 --- a/grafana-plugin/src/containers/HeartbeatModal/HeartbeatForm.tsx +++ b/grafana-plugin/src/containers/HeartbeatModal/HeartbeatForm.tsx @@ -85,11 +85,11 @@ const HeartbeatForm = observer(({ alertReceveChannelId, onUpdate }: HeartBeatMod

- A heartbeat acts as a healthcheck for incident monitoring. You can configure OnCall to regularly send alerts to - the heartbeat endpoint. If you don't receive one of these alerts, OnCall will issue an incident. + A heartbeat acts as a healthcheck for alert group monitoring. You can configure OnCall to regularly send alerts + to the heartbeat endpoint. If you don't receive one of these alerts, OnCall will issue an alert group.

- OnCall will issue an incident if no alert is received every + OnCall will issue an alert group if no alert is received every