From 758c12790dc6119748be9a6e09bb1a2c6c93bdf4 Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Wed, 31 Jan 2024 13:42:52 -0500 Subject: [PATCH 1/5] fix slack API rate limit errors in `send_message_to_thread_if_bot_not_in_channel` task (#3803) # What this PR does See [this conversation](https://raintank-corp.slack.com/archives/C04JCU51NF8/p1706722752735009) for more context. Additionally, improves logging for this task + adds unit tests. ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) --- CHANGELOG.md | 5 + engine/apps/slack/tasks.py | 33 ++++++- engine/apps/slack/tests/tasks/__init__.py | 0 ...message_to_thread_if_bot_not_in_channel.py | 99 +++++++++++++++++++ 4 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 engine/apps/slack/tests/tasks/__init__.py create mode 100644 engine/apps/slack/tests/tasks/test_send_message_to_thread_if_bot_not_in_channel.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 670760f1..9e1c2490 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Fixed + +- Address `SlackAPIRatelimitError` exceptions in `apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel` task + by @joeyorlando ([#3803](https://github.com/grafana/oncall/pull/3803)) + ## v1.3.96 (2024-01-31) ### Added diff --git a/engine/apps/slack/tasks.py b/engine/apps/slack/tasks.py index 0a81c889..fe87f424 100644 --- a/engine/apps/slack/tasks.py +++ b/engine/apps/slack/tasks.py @@ -3,6 +3,7 @@ import random from typing import Optional from celery import uuid as celery_uuid +from celery.exceptions import Retry from celery.utils.log import get_task_logger from django.conf import settings from django.core.cache import cache @@ -142,8 +143,15 @@ def 
check_slack_message_exists_before_post_message_to_thread( ).save() -@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=1) -def send_message_to_thread_if_bot_not_in_channel(alert_group_pk, slack_team_identity_pk, channel_id): +@shared_dedicated_queue_retry_task( + autoretry_for=(Exception,), + dont_autoretry_for=(Retry,), + retry_backoff=True, + max_retries=1, +) +def send_message_to_thread_if_bot_not_in_channel( + alert_group_pk: int, slack_team_identity_pk: int, channel_id: int +) -> None: """ Send message to alert group's thread if bot is not in current channel """ @@ -151,6 +159,11 @@ def send_message_to_thread_if_bot_not_in_channel(alert_group_pk, slack_team_iden from apps.alerts.models import AlertGroup from apps.slack.models import SlackTeamIdentity + logger.info( + f"Starting send_message_to_thread_if_bot_not_in_channel alert_group={alert_group_pk} " + f"slack_team_identity={slack_team_identity_pk} channel_id={channel_id}" + ) + slack_team_identity = SlackTeamIdentity.objects.get(pk=slack_team_identity_pk) alert_group = AlertGroup.objects.get(pk=alert_group_pk) @@ -159,8 +172,20 @@ def send_message_to_thread_if_bot_not_in_channel(alert_group_pk, slack_team_iden bot_user_id = slack_team_identity.bot_user_id members = slack_team_identity.get_conversation_members(sc, channel_id) if bot_user_id not in members: - text = f"Please invite <@{bot_user_id}> to this channel to make all features " f"available :wink:" - AlertGroupSlackService(slack_team_identity, sc).publish_message_to_alert_group_thread(alert_group, text=text) + text = f"Please invite <@{bot_user_id}> to this channel to make all features available :wink:" + + try: + logger.info("Attempting to send message to thread in Slack") + + AlertGroupSlackService(slack_team_identity, sc).publish_message_to_alert_group_thread( + alert_group, text=text + ) + except SlackAPIRatelimitError as e: + logger.warning(f"Slack API rate limit error: {e}, retrying task") + + raise 
send_message_to_thread_if_bot_not_in_channel.retry( + (alert_group_pk, slack_team_identity_pk, channel_id), countdown=e.retry_after, exc=e + ) @shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=0) diff --git a/engine/apps/slack/tests/tasks/__init__.py b/engine/apps/slack/tests/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/engine/apps/slack/tests/tasks/test_send_message_to_thread_if_bot_not_in_channel.py b/engine/apps/slack/tests/tasks/test_send_message_to_thread_if_bot_not_in_channel.py new file mode 100644 index 00000000..ea7b8186 --- /dev/null +++ b/engine/apps/slack/tests/tasks/test_send_message_to_thread_if_bot_not_in_channel.py @@ -0,0 +1,99 @@ +from unittest.mock import ANY, patch + +import pytest +from celery import exceptions + +from apps.slack.errors import SlackAPIRatelimitError +from apps.slack.tasks import send_message_to_thread_if_bot_not_in_channel +from apps.slack.tests.conftest import build_slack_response + +BOT_USER_ID = "U12345678" +TEXT = f"Please invite <@{BOT_USER_ID}> to this channel to make all features available :wink:" + + +@pytest.mark.parametrize("channel_members", [["U0909090"], [BOT_USER_ID]]) +@patch("apps.slack.models.SlackTeamIdentity.get_conversation_members") +@patch("apps.slack.tasks.SlackClient") +@patch("apps.slack.tasks.AlertGroupSlackService") +@pytest.mark.django_db +def test_send_message_to_thread_if_bot_not_in_channel( + MockAlertGroupSlackService, + MockSlackClient, + mock_get_conversation_members, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, + channel_members, +): + mock_get_conversation_members.return_value = channel_members + + slack_team_identity = make_slack_team_identity(bot_user_id=BOT_USER_ID) + slack_channel = make_slack_channel(slack_team_identity) + + organization = make_organization(slack_team_identity=slack_team_identity) + + alert_receive_channel = 
make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + send_message_to_thread_if_bot_not_in_channel(alert_group.pk, slack_team_identity.pk, slack_channel.pk) + + MockSlackClient.assert_called_once_with(slack_team_identity, enable_ratelimit_retry=True) + mock_get_conversation_members.assert_called_once_with(MockSlackClient.return_value, slack_channel.pk) + + if BOT_USER_ID not in channel_members: + MockAlertGroupSlackService.assert_called_once_with(slack_team_identity, MockSlackClient.return_value) + + MockAlertGroupSlackService.return_value.publish_message_to_alert_group_thread.assert_called_once_with( + alert_group, text=TEXT + ) + else: + MockAlertGroupSlackService.return_value.publish_message_to_alert_group_thread.assert_not_called() + + +@patch("apps.slack.models.SlackTeamIdentity.get_conversation_members") +@patch("apps.slack.tasks.SlackClient") +@patch("apps.slack.tasks.AlertGroupSlackService") +@patch("apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel.retry") +@pytest.mark.django_db +def test_send_message_to_thread_if_bot_not_in_channel_slack_api_rate_limit_error( + mock_send_message_to_thread_if_bot_not_in_channel_retry, + MockAlertGroupSlackService, + MockSlackClient, + mock_get_conversation_members, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, +): + mock_send_message_to_thread_if_bot_not_in_channel_retry.side_effect = exceptions.Retry() + mock_get_conversation_members.return_value = ["U0909090"] + + RETRY_AFTER = 42 + + MockAlertGroupSlackService.return_value.publish_message_to_alert_group_thread.side_effect = SlackAPIRatelimitError( + response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": RETRY_AFTER}) + ) + + slack_team_identity = make_slack_team_identity(bot_user_id=BOT_USER_ID) + slack_channel = make_slack_channel(slack_team_identity) + + organization = 
make_organization(slack_team_identity=slack_team_identity) + + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + with pytest.raises(exceptions.Retry): + send_message_to_thread_if_bot_not_in_channel(alert_group.pk, slack_team_identity.pk, slack_channel.pk) + + MockAlertGroupSlackService.assert_called_once_with(slack_team_identity, MockSlackClient.return_value) + + MockAlertGroupSlackService.return_value.publish_message_to_alert_group_thread.assert_called_once_with( + alert_group, text=TEXT + ) + + mock_send_message_to_thread_if_bot_not_in_channel_retry.assert_called_once_with( + (alert_group.pk, slack_team_identity.pk, slack_channel.pk), countdown=RETRY_AFTER, exc=ANY + ) From 8427953fad7e4d9a1bb4cf17173211f92092d35a Mon Sep 17 00:00:00 2001 From: Michael Derynck Date: Wed, 31 Jan 2024 11:52:20 -0700 Subject: [PATCH 2/5] Fix Incident plugin status sync (#3802) # What this PR does - Handle case where key exists for jsonData but explicitly set to None - Disable incident if plugin disabled after or in the case it was removed completely from the Grafana instance ## Which issue(s) this PR fixes ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) --- CHANGELOG.md | 1 + engine/apps/user_management/sync.py | 5 ++++- engine/apps/user_management/tests/test_sync.py | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e1c2490..666b54a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Address `SlackAPIRatelimitError` exceptions in `apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel` task by @joeyorlando ([#3803](https://github.com/grafana/oncall/pull/3803)) +- Fix 
exception when parsing incident plugin config by @mderynck ([#3802](https://github.com/grafana/oncall/pull/3802)) ## v1.3.96 (2024-01-31) diff --git a/engine/apps/user_management/sync.py b/engine/apps/user_management/sync.py index fd8256e1..df0ce19b 100644 --- a/engine/apps/user_management/sync.py +++ b/engine/apps/user_management/sync.py @@ -111,9 +111,12 @@ def _sync_grafana_incident_plugin(organization: Organization, grafana_api_client It intended to use only inside _sync_organization. It mutates, but not saves org, it's saved in _sync_organization. """ grafana_incident_settings, _ = grafana_api_client.get_grafana_incident_plugin_settings() + organization.is_grafana_incident_enabled = False + organization.grafana_incident_backend_url = None + if grafana_incident_settings is not None: organization.is_grafana_incident_enabled = grafana_incident_settings["enabled"] - organization.grafana_incident_backend_url = grafana_incident_settings.get("jsonData", {}).get( + organization.grafana_incident_backend_url = (grafana_incident_settings.get("jsonData") or {}).get( GrafanaAPIClient.GRAFANA_INCIDENT_PLUGIN_BACKEND_URL_KEY ) diff --git a/engine/apps/user_management/tests/test_sync.py b/engine/apps/user_management/tests/test_sync.py index 158f1378..96a86bae 100644 --- a/engine/apps/user_management/tests/test_sync.py +++ b/engine/apps/user_management/tests/test_sync.py @@ -541,6 +541,7 @@ class TestSyncGrafanaIncidentParams: MOCK_GRAFANA_INCIDENT_BACKEND_URL, ), TestSyncGrafanaIncidentParams(({"enabled": True}, None), True, None), + TestSyncGrafanaIncidentParams(({"enabled": True, "jsonData": None}, None), True, None), # missing jsonData (sometimes this is what we get back from the Grafana API) TestSyncGrafanaIncidentParams(({"enabled": False}, None), False, None), # plugin is disabled for some reason ], From bc0f51c0711d0eeb8209b65fb6e29b7b2c4de68e Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Wed, 31 Jan 2024 15:48:00 -0500 Subject: [PATCH 3/5] update tests for
going_oncall_notification --- docker-compose-developer.yml | 3 + .../tasks/test_going_oncall_notification.py | 166 ++++++++++-------- 2 files changed, 93 insertions(+), 76 deletions(-) diff --git a/docker-compose-developer.yml b/docker-compose-developer.yml index f6d62e8f..e7189dcc 100644 --- a/docker-compose-developer.yml +++ b/docker-compose-developer.yml @@ -9,6 +9,9 @@ x-oncall-build: &oncall-build-args context: ./engine target: ${ONCALL_IMAGE_TARGET:-dev} labels: *oncall-labels + cache_from: + - grafana/oncall:latest + - grafana/oncall:dev x-oncall-volumes: &oncall-volumes - ./engine:/etc/app diff --git a/engine/apps/mobile_app/tests/tasks/test_going_oncall_notification.py b/engine/apps/mobile_app/tests/tasks/test_going_oncall_notification.py index 67b51628..c01ec261 100644 --- a/engine/apps/mobile_app/tests/tasks/test_going_oncall_notification.py +++ b/engine/apps/mobile_app/tests/tasks/test_going_oncall_notification.py @@ -56,86 +56,60 @@ def test_shift_starts_within_range(timing_window_lower, timing_window_upper, sec assert _shift_starts_within_range(timing_window_lower, timing_window_upper, seconds_until_shift_starts) == expected +@pytest.mark.parametrize( + "seconds_until_going_oncall,humanized_time_until_going_oncall", + [ + (600, "10 minutes"), + # TODO: right now this returns 11 hours but it should probably round up to 12 hours instead.. 
+ # 11 hours and 53 minutes + ((60 * 60 * 11) + (53 * 60), "11 hours"), + # 12 hours and 10 minutes + ((60 * 60 * 12) + (10 * 60), "12 hours"), + ], +) @pytest.mark.django_db -def test_get_notification_title(make_organization_and_user, make_user, make_schedule): - schedule_name = "asdfasdfasdfasdf" - - organization, user = make_organization_and_user() - user2 = make_user(organization=organization) - schedule = make_schedule(organization, name=schedule_name, schedule_class=OnCallScheduleWeb) - shift_pk = "mncvmnvc" - user_pk = user.public_primary_key - user_locale = "fr_CA" - seconds_until_going_oncall = 600 - humanized_time_until_going_oncall = "10 minutes" - - same_day_shift_start = timezone.datetime(2023, 7, 8, 9, 0, 0) - same_day_shift_end = timezone.datetime(2023, 7, 8, 17, 0, 0) - - multiple_day_shift_start = timezone.datetime(2023, 7, 8, 9, 0, 0) - multiple_day_shift_end = timezone.datetime(2023, 7, 12, 17, 0, 0) - - same_day_shift = _create_schedule_event( - same_day_shift_start, - same_day_shift_end, - shift_pk, - [ - { - "pk": user_pk, - }, - ], - ) - - multiple_day_shift = _create_schedule_event( - multiple_day_shift_start, - multiple_day_shift_end, - shift_pk, - [ - { - "pk": user_pk, - }, - ], - ) - - maus = MobileAppUserSettings.objects.create(user=user, locale=user_locale) - maus_no_locale = MobileAppUserSettings.objects.create(user=user2) - - ################## - # same day shift - ################## - same_day_shift_title = _get_notification_title(seconds_until_going_oncall) - same_day_shift_subtitle = _get_notification_subtitle(schedule, same_day_shift, maus) - same_day_shift_no_locale_subtitle = _get_notification_subtitle(schedule, same_day_shift, maus_no_locale) - - assert same_day_shift_title == f"Your on-call shift starts in {humanized_time_until_going_oncall}" - assert same_day_shift_subtitle == f"09 h 00 - 17 h 00\nSchedule {schedule_name}" - assert same_day_shift_no_locale_subtitle == f"9:00\u202fAM - 5:00\u202fPM\nSchedule {schedule_name}" 
- - ################## - # multiple day shift - ################## - multiple_day_shift_title = _get_notification_title(seconds_until_going_oncall) - multiple_day_shift_subtitle = _get_notification_subtitle(schedule, multiple_day_shift, maus) - multiple_day_shift_no_locale_subtitle = _get_notification_subtitle(schedule, multiple_day_shift, maus_no_locale) - - assert multiple_day_shift_title == f"Your on-call shift starts in {humanized_time_until_going_oncall}" - assert multiple_day_shift_subtitle == f"2023-07-08 09 h 00 - 2023-07-12 17 h 00\nSchedule {schedule_name}" +def test_get_notification_title(seconds_until_going_oncall, humanized_time_until_going_oncall): assert ( - multiple_day_shift_no_locale_subtitle - == f"7/8/23, 9:00\u202fAM - 7/12/23, 5:00\u202fPM\nSchedule {schedule_name}" + _get_notification_title(seconds_until_going_oncall) + == f"Your on-call shift starts in {humanized_time_until_going_oncall}" ) @pytest.mark.parametrize( - "user_timezone,expected_shift_times", + "user_timezone,shift_start_time,shift_end_time,expected", [ - ("UTC", "9:00 AM - 5:00 PM"), - ("Europe/Amsterdam", "11:00 AM - 7:00 PM"), + # same day shift + ("UTC", timezone.datetime(2023, 7, 8, 9, 0, 0), timezone.datetime(2023, 7, 8, 17, 0, 0), "9:00 AM - 5:00 PM"), + ( + "Europe/Amsterdam", + timezone.datetime(2023, 7, 8, 9, 0, 0), + timezone.datetime(2023, 7, 8, 17, 0, 0), + "11:00 AM - 7:00 PM", + ), + # multi-day shift + ( + "UTC", + timezone.datetime(2023, 7, 8, 9, 0, 0), + timezone.datetime(2023, 7, 12, 17, 0, 0), + "7/8/23, 9:00 AM - 7/12/23, 5:00 PM", + ), + ( + "Europe/Amsterdam", + timezone.datetime(2023, 7, 8, 9, 0, 0), + timezone.datetime(2023, 7, 12, 17, 0, 0), + "7/8/23, 11:00 AM - 7/12/23, 7:00 PM", + ), ], ) @pytest.mark.django_db def test_get_notification_subtitle( - make_organization, make_user_for_organization, make_schedule, user_timezone, expected_shift_times + make_organization, + make_user_for_organization, + make_schedule, + user_timezone, + shift_start_time, + 
shift_end_time, + expected, ): schedule_name = "asdfasdfasdfasdf" @@ -143,15 +117,11 @@ def test_get_notification_subtitle( user = make_user_for_organization(organization) user_pk = user.public_primary_key maus = MobileAppUserSettings.objects.create(user=user, time_zone=user_timezone) - schedule = make_schedule(organization, name=schedule_name, schedule_class=OnCallScheduleWeb) - shift_start = timezone.datetime(2023, 7, 8, 9, 0, 0) - shift_end = timezone.datetime(2023, 7, 8, 17, 0, 0) - shift = _create_schedule_event( - shift_start, - shift_end, + shift_start_time, + shift_end_time, "asdfasdfasdf", [ { @@ -160,7 +130,51 @@ def test_get_notification_subtitle( ], ) - assert _get_notification_subtitle(schedule, shift, maus) == f"{expected_shift_times}\nSchedule {schedule_name}" + assert _get_notification_subtitle(schedule, shift, maus) == f"{expected}\nSchedule {schedule_name}" + + +@pytest.mark.parametrize( + "shift_start_time,shift_end_time,expected", + [ + # same day shift + (timezone.datetime(2023, 7, 8, 9, 0, 0), timezone.datetime(2023, 7, 8, 17, 0, 0), "9:00 AM - 5:00 PM"), + # multi-day shift + ( + timezone.datetime(2023, 7, 8, 9, 0, 0), + timezone.datetime(2023, 7, 12, 17, 0, 0), + "7/8/23, 9:00 AM - 7/12/23, 5:00 PM", + ), + ], +) +@pytest.mark.django_db +def test_get_notification_subtitle_no_locale( + make_organization, + make_user_for_organization, + make_schedule, + shift_start_time, + shift_end_time, + expected, +): + schedule_name = "asdfasdfasdfasdf" + + organization = make_organization() + user = make_user_for_organization(organization) + user_pk = user.public_primary_key + maus = MobileAppUserSettings.objects.create(user=user) + schedule = make_schedule(organization, name=schedule_name, schedule_class=OnCallScheduleWeb) + + shift = _create_schedule_event( + shift_start_time, + shift_end_time, + "asdfasdfasdf", + [ + { + "pk": user_pk, + }, + ], + ) + + assert _get_notification_subtitle(schedule, shift, maus) == f"{expected}\nSchedule {schedule_name}" 
@mock.patch("apps.mobile_app.tasks.going_oncall_notification._get_notification_subtitle") From 2a466a0c4f7209ffe22b300639b1377cdc84189d Mon Sep 17 00:00:00 2001 From: Michael Derynck Date: Wed, 31 Jan 2024 15:54:50 -0700 Subject: [PATCH 4/5] Add transaction on_commit before signals for alert group actions (#3731) # What this PR does Add transactions around log record creation and check transaction on_commit before sending signals passing DB id of alert group log records. In cases for delete we can then assume any missing IDs on tasks are from intentionally deleted alert groups and we can stop tasks from retrying endlessly. ## Which issue(s) this PR fixes ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) --- CHANGELOG.md | 4 + engine/apps/alerts/models/alert_group.py | 431 ++++++++---------- .../alerts/models/alert_group_log_record.py | 6 + engine/apps/alerts/tasks/__init__.py | 2 + .../apps/alerts/tasks/acknowledge_reminder.py | 1 + .../apps/alerts/tasks/delete_alert_group.py | 33 +- .../alerts/tasks/send_alert_group_signal.py | 4 +- engine/apps/alerts/tests/test_alert_group.py | 82 +++- .../tests/test_update_metrics_cache.py | 4 +- .../alert_group_representative.py | 32 +- .../telegram/alert_group_representative.py | 7 +- engine/apps/telegram/tasks.py | 20 +- engine/apps/webhooks/listeners.py | 6 +- engine/settings/celery_task_routes.py | 2 + 14 files changed, 352 insertions(+), 282 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 666b54a1..77134072 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Changed + +- Ensure alert group log records are committed to DB before signalling about them @mderynck([#3731](https://github.com/grafana/oncall/pull/3731)) + 
### Fixed - Address `SlackAPIRatelimitError` exceptions in `apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel` task diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index d3d2479d..a663b571 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -20,8 +20,13 @@ from apps.alerts.escalation_snapshot.escalation_snapshot_mixin import START_ESCA from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE from apps.alerts.incident_appearance.renderers.slack_renderer import AlertGroupSlackRenderer from apps.alerts.incident_log_builder import IncidentLogBuilder -from apps.alerts.signals import alert_group_action_triggered_signal, alert_group_created_signal -from apps.alerts.tasks import acknowledge_reminder_task, send_alert_group_signal, unsilence_task +from apps.alerts.signals import alert_group_created_signal +from apps.alerts.tasks import ( + acknowledge_reminder_task, + send_alert_group_signal, + send_alert_group_signal_for_delete, + unsilence_task, +) from apps.metrics_exporter.tasks import update_metrics_for_alert_group from apps.slack.slack_formatter import SlackFormatter from apps.user_management.models import User @@ -639,20 +644,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
self.stop_escalation() self.start_ack_reminder_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_ACK, author=user, action_source=action_source + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.acknowledge_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.acknowledge_by_user(user, action_source=action_source) @@ -679,18 +681,15 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" + ) - alert_group_action_triggered_signal.send( - sender=self.acknowledge_by_source, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.acknowledge_by_source() @@ -707,20 +706,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
if self.is_root_alert_group: self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_ACK, author=user, action_source=action_source + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_acknowledge_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_acknowledge_by_user(user, action_source=action_source) @@ -745,20 +741,18 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
# Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) self.stop_escalation() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source - ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_RESOLVED, author=user, action_source=action_source + ) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_user(user, action_source=action_source) @@ -782,18 +776,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_source, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: alert" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_source() @@ -809,18 +801,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state ) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_last_step, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', action source: resolve step" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_last_step() @@ -830,19 +820,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
self.resolve(resolved_by=AlertGroup.DISABLE_MAINTENANCE) self.stop_escalation() - log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: disable maintenance" - ) + with transaction.atomic(): + log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED) - alert_group_action_triggered_signal.send( - sender=self.resolve_by_disable_maintenance, - log_record=log_record.pk, - action_source=None, - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: disable maintenance" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.resolve_by_disable_maintenance() @@ -856,24 +844,21 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
# Update alert group state metric cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, action_source=action_source + ) - if self.is_root_alert_group: - self.start_escalation_if_needed() + if self.is_root_alert_group: + self.start_escalation_if_needed() - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_resolve_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_resolve_by_user(user, action_source=action_source) @@ -898,25 +883,22 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
if not root_alert_group.silenced and self.silenced: self.un_silence_by_user(user, action_source=action_source) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_ATTACHED, - author=user, - root_alert_group=root_alert_group, - reason="Attach dropdown", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_ATTACHED, + author=user, + root_alert_group=root_alert_group, + reason="Attach dropdown", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) log_record_for_root_incident = root_alert_group.log_records.create( type=AlertGroupLogRecord.TYPE_ATTACHED, @@ -932,11 +914,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. f"'{log_record_for_root_incident.get_type_display()}', action source: {action_source}" ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record_for_root_incident.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk)) else: log_record = self.log_records.create( @@ -953,11 +931,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
f"action source: {action_source}" ) - alert_group_action_triggered_signal.send( - sender=self.attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) def un_attach_by_user(self, user: User, action_source: typing.Optional[ActionSource] = None) -> None: from apps.alerts.models import AlertGroupLogRecord @@ -968,45 +942,38 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - author=user, - root_alert_group=root_alert_group, - reason="Unattach button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + author=user, + root_alert_group=root_alert_group, + reason="Unattach button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_user, - log_record=log_record.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - log_record_for_root_incident = root_alert_group.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - author=user, - dependent_alert_group=self, - reason="Unattach dropdown", - action_source=action_source, - ) + log_record_for_root_incident = root_alert_group.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + author=user, + dependent_alert_group=self, + 
reason="Unattach dropdown", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, " - f"log record {log_record_for_root_incident.pk} " - f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {root_alert_group.pk}, " + f"log record {log_record_for_root_incident.pk} " + f"with type '{log_record_for_root_incident.get_type_display()}', action source: {action_source}" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_user, - log_record=log_record_for_root_incident.pk, - action_source=action_source, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record_for_root_incident.pk)) def un_attach_by_delete(self): from apps.alerts.models import AlertGroupLogRecord @@ -1016,22 +983,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UNATTACHED, - reason="Unattach by deleting root incident", - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UNATTACHED, + reason="Unattach by deleting root incident", + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: delete" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: delete" + ) - alert_group_action_triggered_signal.send( - sender=self.un_attach_by_delete, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) def silence_by_user( self, user: User, silence_delay: typing.Optional[int], action_source: typing.Optional[ActionSource] = None @@ -1086,25 +1050,23 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
# Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_SILENCE, - author=user, - silence_delay=silence_delay_timedelta, - reason="Silence button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_SILENCE, + author=user, + silence_delay=silence_delay_timedelta, + reason="Silence button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - alert_group_action_triggered_signal.send( - sender=self.silence_by_user, - log_record=log_record.pk, - action_source=action_source, - ) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.silence_by_user(user, silence_delay, action_source) @@ -1120,26 +1082,24 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
if self.is_root_alert_group: self.start_escalation_if_needed() - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_UN_SILENCE, - author=user, - silence_delay=None, - # 2.Look like some time ago there was no TYPE_UN_SILENCE - reason="Unsilence button", - action_source=action_source, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_UN_SILENCE, + author=user, + silence_delay=None, + # 2.Look like some time ago there was no TYPE_UN_SILENCE + reason="Unsilence button", + action_source=action_source, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: {action_source}" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: {action_source}" + ) + + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) - alert_group_action_triggered_signal.send( - sender=self.un_silence_by_user, - log_record=log_record.pk, - action_source=action_source, - ) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.un_silence_by_user(user, action_source=action_source) @@ -1169,22 +1129,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
# Update alert group state and response time metrics cache self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state) - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_WIPED, - author=user, - ) + with transaction.atomic(): + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_WIPED, + author=user, + ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: wipe" - ) + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: wipe" + ) - alert_group_action_triggered_signal.send( - sender=self.wipe_by_user, - log_record=log_record.pk, - action_source=None, - ) + transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) for dependent_alert_group in self.dependent_alert_groups.all(): dependent_alert_group.wipe_by_user(user) @@ -1193,31 +1150,27 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. 
from apps.alerts.models import AlertGroupLogRecord self.stop_escalation() - # prevent creating multiple logs - # filter instead of get_or_create cause it can be multiple logs of this type due deleting error - log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last() - if not log_record: - log_record = self.log_records.create( - type=AlertGroupLogRecord.TYPE_DELETED, - author=user, + with transaction.atomic(): + # prevent creating multiple logs + # filter instead of get_or_create cause it can be multiple logs of this type due deleting error + log_record = self.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).last() + + if not log_record: + log_record = self.log_records.create( + type=AlertGroupLogRecord.TYPE_DELETED, + author=user, + ) + + logger.debug( + f"send alert_group_action_triggered_signal for alert_group {self.pk}, " + f"log record {log_record.pk} with type '{log_record.get_type_display()}', " + f"action source: delete" ) - logger.debug( - f"send alert_group_action_triggered_signal for alert_group {self.pk}, " - f"log record {log_record.pk} with type '{log_record.get_type_display()}', " - f"action source: delete" - ) - - alert_group_action_triggered_signal.send( - sender=self.delete_by_user, - log_record=log_record.pk, - action_source=None, # TODO: Action source is none - it is suspicious - # this flag forces synchrony call for action handler in representatives - # (for now it is actual only for Slack representative) - force_sync=True, - ) + transaction.on_commit(partial(send_alert_group_signal_for_delete.delay, self.pk, log_record.pk)) + def finish_delete_by_user(self): dependent_alerts = list(self.dependent_alert_groups.all()) self.hard_delete() diff --git a/engine/apps/alerts/models/alert_group_log_record.py b/engine/apps/alerts/models/alert_group_log_record.py index dfcabfa7..3187ebb7 100644 --- a/engine/apps/alerts/models/alert_group_log_record.py +++ b/engine/apps/alerts/models/alert_group_log_record.py @@ -594,6 
+594,12 @@ class AlertGroupLogRecord(models.Model): step_specific_info = json.loads(self.step_specific_info) return step_specific_info + def delete(self): + logger.debug( + f"alert_group_log_record for alert_group deleted " f"alert_group={self.alert_group.pk} log_id={self.pk}" + ) + super().delete() + @receiver(post_save, sender=AlertGroupLogRecord) def listen_for_alertgrouplogrecord(sender, instance, created, *args, **kwargs): diff --git a/engine/apps/alerts/tasks/__init__.py b/engine/apps/alerts/tasks/__init__.py index 0dce9b3a..a736c921 100644 --- a/engine/apps/alerts/tasks/__init__.py +++ b/engine/apps/alerts/tasks/__init__.py @@ -7,6 +7,8 @@ from .check_escalation_finished import check_escalation_finished_task # noqa: F from .custom_button_result import custom_button_result # noqa: F401 from .custom_webhook_result import custom_webhook_result # noqa: F401 from .delete_alert_group import delete_alert_group # noqa: F401 +from .delete_alert_group import finish_delete_alert_group # noqa: F401 +from .delete_alert_group import send_alert_group_signal_for_delete # noqa: F401 from .distribute_alert import distribute_alert # noqa: F401 from .escalate_alert_group import escalate_alert_group # noqa: F401 from .invite_user_to_join_incident import invite_user_to_join_incident # noqa: F401 diff --git a/engine/apps/alerts/tasks/acknowledge_reminder.py b/engine/apps/alerts/tasks/acknowledge_reminder.py index e0e41500..73d86491 100644 --- a/engine/apps/alerts/tasks/acknowledge_reminder.py +++ b/engine/apps/alerts/tasks/acknowledge_reminder.py @@ -76,6 +76,7 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str log_record = alert_group.log_records.create( type=AlertGroupLogRecord.TYPE_ACK_REMINDER_TRIGGERED, author=alert_group.acknowledged_by_user ) + task_logger.info(f"created log record {log_record.pk}, sending signal...") transaction.on_commit(partial(send_alert_group_signal.delay, log_record.pk)) diff --git 
a/engine/apps/alerts/tasks/delete_alert_group.py b/engine/apps/alerts/tasks/delete_alert_group.py index e97ce515..cd4d5807 100644 --- a/engine/apps/alerts/tasks/delete_alert_group.py +++ b/engine/apps/alerts/tasks/delete_alert_group.py @@ -1,6 +1,7 @@ from celery.utils.log import get_task_logger from django.conf import settings +from apps.alerts.signals import alert_group_action_triggered_signal from apps.slack.errors import SlackAPIRatelimitError from common.custom_celery_tasks import shared_dedicated_queue_retry_task @@ -10,7 +11,7 @@ logger = get_task_logger(__name__) @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None ) -def delete_alert_group(alert_group_pk, user_pk): +def delete_alert_group(alert_group_pk: int, user_pk: int) -> None: from apps.alerts.models import AlertGroup from apps.user_management.models import User @@ -25,9 +26,35 @@ def delete_alert_group(alert_group_pk, user_pk): return logger.debug(f"User {user} is deleting alert group {alert_group} (channel: {alert_group.channel})") + alert_group.delete_by_user(user) + +@shared_dedicated_queue_retry_task( + autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None +) +def send_alert_group_signal_for_delete(alert_group_pk: int, log_record_pk: int) -> None: try: - alert_group.delete_by_user(user) + alert_group_action_triggered_signal.send( + sender=send_alert_group_signal_for_delete, + log_record=log_record_pk, + force_sync=True, + ) except SlackAPIRatelimitError as e: # Handle Slack API ratelimit raised in apps.slack.scenarios.distribute_alerts.DeleteGroupStep.process_signal - delete_alert_group.apply_async((alert_group_pk, user_pk), countdown=e.retry_after) + send_alert_group_signal_for_delete.apply_async((alert_group_pk, log_record_pk), countdown=e.retry_after) + return + + finish_delete_alert_group.apply_async((alert_group_pk,)) + + +@shared_dedicated_queue_retry_task( + 
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None +) +def finish_delete_alert_group(alert_group_pk: int) -> None: + from apps.alerts.models import AlertGroup + + alert_group = AlertGroup.objects.filter(pk=alert_group_pk).first() + if not alert_group: + logger.debug(f"Alert group id={alert_group_pk} not found, already deleted") + return + alert_group.finish_delete_by_user() diff --git a/engine/apps/alerts/tasks/send_alert_group_signal.py b/engine/apps/alerts/tasks/send_alert_group_signal.py index be632a2b..65f0b8f2 100644 --- a/engine/apps/alerts/tasks/send_alert_group_signal.py +++ b/engine/apps/alerts/tasks/send_alert_group_signal.py @@ -5,13 +5,15 @@ from django.conf import settings from apps.alerts.signals import alert_group_action_triggered_signal from common.custom_celery_tasks import shared_dedicated_queue_retry_task +from .task_logger import task_logger + @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=0 if settings.DEBUG else None ) def send_alert_group_signal(log_record_id): start_time = time.time() - + task_logger.info(f"sending signal for log record {log_record_id}") alert_group_action_triggered_signal.send(sender=send_alert_group_signal, log_record=log_record_id) print("--- %s seconds ---" % (time.time() - start_time)) diff --git a/engine/apps/alerts/tests/test_alert_group.py b/engine/apps/alerts/tests/test_alert_group.py index d2ba04d2..0d7166c6 100644 --- a/engine/apps/alerts/tests/test_alert_group.py +++ b/engine/apps/alerts/tests/test_alert_group.py @@ -6,7 +6,11 @@ from apps.alerts.constants import ActionSource from apps.alerts.incident_appearance.renderers.phone_call_renderer import AlertGroupPhoneCallRenderer from apps.alerts.models import AlertGroup, AlertGroupLogRecord from apps.alerts.tasks import wipe -from apps.alerts.tasks.delete_alert_group import delete_alert_group +from apps.alerts.tasks.delete_alert_group import ( + delete_alert_group, + 
finish_delete_alert_group, + send_alert_group_signal_for_delete, +) from apps.slack.client import SlackClient from apps.slack.errors import SlackAPIMessageNotFoundError, SlackAPIRatelimitError from apps.slack.models import SlackMessage @@ -85,9 +89,9 @@ def test_delete( make_alert, make_slack_message, make_resolution_note_slack_message, + django_capture_on_commit_callbacks, ): """test alert group deleting""" - organization, slack_team_identity = make_organization_with_slack_team_identity() user = make_user(organization=organization) @@ -119,7 +123,20 @@ def test_delete( assert alert_group.slack_messages.count() == 1 assert alert_group.resolution_note_slack_messages.count() == 2 - delete_alert_group(alert_group.pk, user.pk) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 + + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + assert mock_finish_delete_alert_group.call_count == 1 + + finish_delete_alert_group(alert_group.pk) assert not alert_group.alerts.exists() assert not alert_group.slack_messages.exists() @@ -140,10 +157,10 @@ def test_delete( @pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"]) -@patch.object(delete_alert_group, "apply_async") +@patch.object(send_alert_group_signal_for_delete, "apply_async") @pytest.mark.django_db def test_delete_slack_ratelimit( - mock_delete_alert_group, + mock_send_alert_group_signal_for_delete, api_method, make_organization_with_slack_team_identity, make_user, @@ -152,6 +169,7 @@ def test_delete_slack_ratelimit( make_alert, make_slack_message, 
make_resolution_note_slack_message, + django_capture_on_commit_callbacks, ): organization, slack_team_identity = make_organization_with_slack_team_identity() user = make_user(organization=organization) @@ -180,17 +198,31 @@ def test_delete_slack_ratelimit( ts="test2_ts", ) - with patch.object( - SlackClient, - api_method, - side_effect=SlackAPIRatelimitError( - response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42}) - ), - ): - delete_alert_group(alert_group.pk, user.pk) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 + + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + with patch.object( + SlackClient, + api_method, + side_effect=SlackAPIRatelimitError( + response=build_slack_response({"ok": False, "error": "ratelimited"}, headers={"Retry-After": 42}) + ), + ): + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + + assert mock_finish_delete_alert_group.call_count == 0 # Check task is retried gracefully - mock_delete_alert_group.assert_called_once_with((alert_group.pk, user.pk), countdown=42) + mock_send_alert_group_signal_for_delete.assert_called_once_with( + mock_send_alert_group_signal.call_args.args, countdown=42 + ) @pytest.mark.parametrize("api_method", ["reactions_remove", "chat_delete"]) @@ -582,7 +614,7 @@ def test_filter_active_alert_groups( @patch("apps.alerts.models.AlertGroup.hard_delete") @patch("apps.alerts.models.AlertGroup.un_attach_by_delete") @patch("apps.alerts.models.AlertGroup.stop_escalation") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal") 
+@patch("apps.alerts.tasks.delete_alert_group.alert_group_action_triggered_signal") @pytest.mark.django_db def test_delete_by_user( mock_alert_group_action_triggered_signal, @@ -592,6 +624,7 @@ def test_delete_by_user( make_organization_and_user, make_alert_receive_channel, make_alert_group, + django_capture_on_commit_callbacks, ): organization, user = make_organization_and_user() alert_receive_channel = make_alert_receive_channel(organization) @@ -603,20 +636,31 @@ def test_delete_by_user( assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 0 - alert_group.delete_by_user(user) + with patch( + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete.delay", return_value=None + ) as mock_send_alert_group_signal: + with django_capture_on_commit_callbacks(execute=True): + delete_alert_group(alert_group.pk, user.pk) + assert mock_send_alert_group_signal.call_count == 1 assert alert_group.log_records.filter(type=AlertGroupLogRecord.TYPE_DELETED).count() == 1 deleted_log_record = alert_group.log_records.get(type=AlertGroupLogRecord.TYPE_DELETED) - alert_group.stop_escalation.assert_called_once_with() + with patch( + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group.apply_async", return_value=None + ) as mock_finish_delete_alert_group: + send_alert_group_signal_for_delete(*mock_send_alert_group_signal.call_args.args) + assert mock_finish_delete_alert_group.call_count == 1 + mock_alert_group_action_triggered_signal.send.assert_called_once_with( - sender=alert_group.delete_by_user, + sender=send_alert_group_signal_for_delete, log_record=deleted_log_record.pk, - action_source=None, force_sync=True, ) + finish_delete_alert_group(alert_group.pk) + alert_group.hard_delete.assert_called_once_with() for dependent_alert_group in dependent_alert_groups: diff --git a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py index 
29fb52f7..fa05949b 100644 --- a/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py +++ b/engine/apps/metrics_exporter/tests/test_update_metrics_cache.py @@ -31,7 +31,7 @@ def mock_apply_async(monkeypatch): @patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send") +@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") @pytest.mark.django_db @override_settings(CELERY_TASK_ALWAYS_EAGER=True) def test_update_metric_alert_groups_total_cache_on_action( @@ -142,7 +142,7 @@ def test_update_metric_alert_groups_total_cache_on_action( @patch("apps.alerts.models.alert_group_log_record.tasks.send_update_log_report_signal.apply_async") -@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send") +@patch("apps.alerts.tasks.send_alert_group_signal.alert_group_action_triggered_signal.send") @pytest.mark.django_db @override_settings(CELERY_TASK_ALWAYS_EAGER=True) def test_update_metric_alert_groups_response_time_cache_on_action( diff --git a/engine/apps/slack/representatives/alert_group_representative.py b/engine/apps/slack/representatives/alert_group_representative.py index 1ef8b5c5..67fd4ce6 100644 --- a/engine/apps/slack/representatives/alert_group_representative.py +++ b/engine/apps/slack/representatives/alert_group_representative.py @@ -2,6 +2,7 @@ import logging from celery.utils.log import get_task_logger from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from apps.alerts.constants import ActionSource from apps.alerts.representative import AlertGroupAbstractRepresentative @@ -49,14 +50,20 @@ def on_create_alert_slack_representative_async(alert_pk): @shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None + autoretry_for=(Exception,), + retry_backoff=True, + 
dont_autoretry_for=(ObjectDoesNotExist,), + max_retries=1 if settings.DEBUG else None, ) def on_alert_group_action_triggered_async(log_record_id): from apps.alerts.models import AlertGroupLogRecord - logger.debug(f"SLACK representative: get log record {log_record_id}") + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) + except AlertGroupLogRecord.DoesNotExist as e: + logger.warning(f"SLACK representative: log record {log_record_id} never created or has been deleted") + raise e - log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) alert_group_id = log_record.alert_group_id logger.debug(f"Start on_alert_group_action_triggered for alert_group {alert_group_id}, log record {log_record_id}") instance = AlertGroupSlackRepresentative(log_record) @@ -145,16 +152,25 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative): from apps.alerts.models import AlertGroupLogRecord log_record = kwargs["log_record"] - action_source = kwargs.get("action_source") force_sync = kwargs.get("force_sync", False) if isinstance(log_record, AlertGroupLogRecord): log_record_id = log_record.pk else: log_record_id = log_record - if action_source == ActionSource.SLACK or force_sync: + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) + except AlertGroupLogRecord.DoesNotExist: + logger.warning( + f"on_alert_group_action_triggered: log record {log_record_id} never created or has been deleted" + ) + return + + if log_record.action_source == ActionSource.SLACK or force_sync: + logger.debug(f"SLACK on_alert_group_action_triggered: sync {log_record_id} {force_sync}") on_alert_group_action_triggered_async(log_record_id) else: + logger.debug(f"SLACK on_alert_group_action_triggered: async {log_record_id} {force_sync}") on_alert_group_action_triggered_async.apply_async((log_record_id,)) @classmethod @@ -167,7 +183,11 @@ class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative): alert_group_id = alert_group.pk else: 
alert_group_id = alert_group - alert_group = AlertGroup.objects.get(pk=alert_group_id) + try: + alert_group = AlertGroup.objects.get(pk=alert_group_id) + except AlertGroup.DoesNotExist as e: + logger.warning(f"SLACK update log report: alert group {alert_group_id} has been deleted") + raise e logger.debug( f"Received alert_group_update_log_report signal in SLACK representative for alert_group {alert_group_id}" diff --git a/engine/apps/telegram/alert_group_representative.py b/engine/apps/telegram/alert_group_representative.py index 635420c6..ef3ae992 100644 --- a/engine/apps/telegram/alert_group_representative.py +++ b/engine/apps/telegram/alert_group_representative.py @@ -64,8 +64,13 @@ class AlertGroupTelegramRepresentative(AlertGroupAbstractRepresentative): def on_alert_group_update_log_report(cls, **kwargs): logger.info("AlertGroupTelegramRepresentative UPDATE LOG REPORT SIGNAL") alert_group = kwargs["alert_group"] + if not isinstance(alert_group, AlertGroup): - alert_group = AlertGroup.objects.get(pk=alert_group) + try: + alert_group = AlertGroup.objects.get(pk=alert_group) + except AlertGroup.DoesNotExist as e: + logger.warning(f"Telegram update log report: alert group {alert_group} has been deleted") + raise e messages_to_edit = alert_group.telegram_messages.filter( message_type__in=( diff --git a/engine/apps/telegram/tasks.py b/engine/apps/telegram/tasks.py index 41073848..5aeb9948 100644 --- a/engine/apps/telegram/tasks.py +++ b/engine/apps/telegram/tasks.py @@ -3,6 +3,7 @@ import logging from celery import uuid as celery_uuid from celery.utils.log import get_task_logger from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from telegram import error from apps.alerts.models import Alert, AlertGroup @@ -230,7 +231,10 @@ def on_create_alert_telegram_representative_async(self, alert_pk): @shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None + 
autoretry_for=(Exception,), + retry_backoff=True, + dont_autoretry_for=(ObjectDoesNotExist,), + max_retries=1 if settings.DEBUG else None, ) def on_alert_group_action_triggered_async(log_record_id): from apps.alerts.models import AlertGroupLogRecord @@ -239,18 +243,14 @@ def on_alert_group_action_triggered_async(log_record_id): logger.info(f"AlertGroupTelegramRepresentative ACTION SIGNAL, log record {log_record_id}") # temporary solution to handle cases when alert group and related log records were deleted + try: log_record = AlertGroupLogRecord.objects.get(pk=log_record_id) except AlertGroupLogRecord.DoesNotExist as e: - retries_count = on_alert_group_action_triggered_async.request.retries - if retries_count >= 10: - logger.error( - f"AlertGroupTelegramRepresentative: was not able to get AlertGroupLogRecord, probably alert group " - f"was deleted. log record {log_record_id}, retries: {retries_count}" - ) - return - else: - raise e + logger.warning( + f"AlertGroupTelegramRepresentative: log record {log_record_id} never created or has been deleted" + ) + raise e instance = AlertGroupTelegramRepresentative(log_record) if instance.is_applicable(): diff --git a/engine/apps/webhooks/listeners.py b/engine/apps/webhooks/listeners.py index 5ff9b192..ebbed55b 100644 --- a/engine/apps/webhooks/listeners.py +++ b/engine/apps/webhooks/listeners.py @@ -15,5 +15,9 @@ def on_action_triggered(**kwargs): log_record = kwargs["log_record"] if not isinstance(log_record, AlertGroupLogRecord): - log_record = AlertGroupLogRecord.objects.get(pk=log_record) + try: + log_record = AlertGroupLogRecord.objects.get(pk=log_record) + except AlertGroupLogRecord.DoesNotExist as e: + logger.warning(f"Webhook action triggered: log record {log_record} never created or has been deleted") + raise e alert_group_status_change.apply_async((log_record.type, log_record.alert_group_id, log_record.author_id)) diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index 
0a6f319c..73dea53d 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -4,6 +4,8 @@ CELERY_TASK_ROUTES = { "queue": "default" }, "apps.alerts.tasks.delete_alert_group.delete_alert_group": {"queue": "default"}, + "apps.alerts.tasks.delete_alert_group.send_alert_group_signal_for_delete": {"queue": "default"}, + "apps.alerts.tasks.delete_alert_group.finish_delete_alert_group": {"queue": "default"}, "apps.alerts.tasks.invalidate_web_cache_for_alert_group.invalidate_web_cache_for_alert_group": {"queue": "default"}, "apps.alerts.tasks.send_alert_group_signal.send_alert_group_signal": {"queue": "default"}, "apps.alerts.tasks.wipe.wipe": {"queue": "default"}, From 1c2a6bcdb4aa74f72e5d9d092e838a023fe422d7 Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Wed, 31 Jan 2024 19:51:31 -0500 Subject: [PATCH 5/5] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77134072..7bab77f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## v1.3.97 (2024-01-31) + ### Changed - Ensure alert group log records are committed to DB before signalling about them @mderynck([#3731](https://github.com/grafana/oncall/pull/3731))