From e053eb084d84419159c0376be4de3a11a867e5ab Mon Sep 17 00:00:00 2001 From: Matias Bordese Date: Wed, 6 Dec 2023 09:20:03 -0300 Subject: [PATCH] Track alert received timestamp on alert group creation (#3513) Keep record of the timestamp when the alert group creation task is triggered, allowing to track the delta time between alert received datetime and alert group creation timestamp. Related to https://github.com/grafana/oncall-private/issues/2347 --- CHANGELOG.md | 4 ++ .../migrations/0042_alertgroup_received_at.py | 18 ++++++ engine/apps/alerts/models/alert.py | 2 + engine/apps/alerts/models/alert_group.py | 9 ++- .../alerts/models/alert_receive_channel.py | 2 + engine/apps/alerts/tests/test_alert.py | 23 ++++++++ engine/apps/email/inbound.py | 3 + engine/apps/heartbeat/tasks.py | 4 ++ engine/apps/integrations/tasks.py | 8 ++- engine/apps/integrations/tests/test_views.py | 56 +++++++++++++------ engine/apps/integrations/views.py | 24 ++++++-- 11 files changed, 129 insertions(+), 24 deletions(-) create mode 100644 engine/apps/alerts/migrations/0042_alertgroup_received_at.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6716f7ac..711082b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added + +- Track alert received timestamp on alert group creation ([#3513](https://github.com/grafana/oncall/pull/3513)) + ## v1.3.72 (2023-12-05) ### Fixed diff --git a/engine/apps/alerts/migrations/0042_alertgroup_received_at.py b/engine/apps/alerts/migrations/0042_alertgroup_received_at.py new file mode 100644 index 00000000..9d693b9c --- /dev/null +++ b/engine/apps/alerts/migrations/0042_alertgroup_received_at.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.7 on 2023-12-05 18:06 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('alerts', '0041_alertreceivechannel_unique_direct_paging_integration_per_team'), + ] + + operations = [ + migrations.AddField( + model_name='alertgroup', + name='received_at', + field=models.DateTimeField(blank=True, default=None, null=True), + ), + ] diff --git a/engine/apps/alerts/models/alert.py b/engine/apps/alerts/models/alert.py index 203f208a..a3b5573e 100644 --- a/engine/apps/alerts/models/alert.py +++ b/engine/apps/alerts/models/alert.py @@ -90,6 +90,7 @@ class Alert(models.Model): is_demo=False, channel_filter=None, force_route_id=None, + received_at=None, ): """ Creates an alert and a group if needed. @@ -105,6 +106,7 @@ class Alert(models.Model): channel=alert_receive_channel, channel_filter=channel_filter, group_data=group_data, + received_at=received_at, ) if group_created: diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index 4a91a902..cb1036d0 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -101,7 +101,7 @@ class AlertGroupQuerySet(models.QuerySet): inside_organization_number = AlertGroupCounter.objects.get_value(organization=organization) + 1 return super().create(**kwargs, inside_organization_number=inside_organization_number) - def get_or_create_grouping(self, channel, channel_filter, group_data): + def get_or_create_grouping(self, channel, channel_filter, group_data, received_at=None): """ This method is similar to default Django QuerySet.get_or_create(), please see the original get_or_create method. The difference is that this method is trying to get an object using multiple queries with different filters. @@ -131,7 +131,10 @@ class AlertGroupQuerySet(models.QuerySet): # Create a new group if we couldn't group it to any existing ones try: alert_group = self.create( - **search_params, is_open_for_grouping=True, web_title_cache=group_data.web_title_cache + **search_params, + is_open_for_grouping=True, + web_title_cache=group_data.web_title_cache, + received_at=received_at, ) alert_group_created_signal.send(sender=self.__class__, alert_group=alert_group) return (alert_group, True) @@ -334,6 +337,8 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. response_time = models.DurationField(null=True, default=None) + received_at = models.DateTimeField(blank=True, null=True, default=None) + @property def is_silenced_forever(self): return self.silenced and self.silenced_until is None diff --git a/engine/apps/alerts/models/alert_receive_channel.py b/engine/apps/alerts/models/alert_receive_channel.py index ec2bae66..3a7973f4 100644 --- a/engine/apps/alerts/models/alert_receive_channel.py +++ b/engine/apps/alerts/models/alert_receive_channel.py @@ -650,6 +650,7 @@ class AlertReceiveChannel(IntegrationOptionsMixin, MaintainableObject): for alert in alerts: create_alertmanager_alerts.delay(alert_receive_channel_pk=self.pk, alert=alert, is_demo=True) else: + timestamp = timezone.now().isoformat() create_alert.delay( title="Demo alert", message="Demo alert", @@ -659,6 +660,7 @@ class AlertReceiveChannel(IntegrationOptionsMixin, MaintainableObject): integration_unique_data=None, raw_request_data=payload, is_demo=True, + received_at=timestamp, ) @property diff --git a/engine/apps/alerts/tests/test_alert.py b/engine/apps/alerts/tests/test_alert.py index 1e33abe4..cb2a2d68 100644 --- a/engine/apps/alerts/tests/test_alert.py +++ b/engine/apps/alerts/tests/test_alert.py @@ -1,6 +1,7 @@ from unittest.mock import PropertyMock, patch import pytest +from django.utils import timezone from apps.alerts.models import Alert, EscalationPolicy from apps.alerts.tasks import distribute_alert, escalate_alert_group @@ -56,6 +57,28 @@ def test_alert_create_custom_channel_filter(make_organization, make_alert_receiv assert alert.group.channel_filter == other_channel_filter +@pytest.mark.django_db +def test_alert_create_track_received_at_timestamp(make_organization, make_alert_receive_channel): + organization = make_organization() + alert_receive_channel = make_alert_receive_channel(organization) + + now = timezone.now() + alert = Alert.create( + title="the title", + message="the message", + alert_receive_channel=alert_receive_channel, + raw_request_data={}, + integration_unique_data={}, + image_url=None, + link_to_upstream_details=None, + received_at=now.isoformat(), + ) + + alert_group = alert.group + alert_group.refresh_from_db() + assert alert_group.received_at == now + + @pytest.mark.django_db def test_distribute_alert_escalate_alert_group( make_organization, diff --git a/engine/apps/email/inbound.py b/engine/apps/email/inbound.py index 6a4a91bf..baa89905 100644 --- a/engine/apps/email/inbound.py +++ b/engine/apps/email/inbound.py @@ -6,6 +6,7 @@ from anymail.inbound import AnymailInboundMessage from anymail.signals import AnymailInboundEvent from anymail.webhooks import amazon_ses, mailgun, mailjet, mandrill, postal, postmark, sendgrid, sparkpost from django.http import HttpResponse, HttpResponseNotAllowed +from django.utils import timezone from rest_framework import status from rest_framework.request import Request from rest_framework.response import Response @@ -61,6 +62,7 @@ class InboundEmailWebhookView(AlertChannelDefiningMixin, APIView): return super().dispatch(request, alert_channel_key=integration_token) def post(self, request): + timestamp = timezone.now().isoformat() for message in self.get_messages_from_esp_request(request): payload = self.get_alert_payload_from_email_message(message) create_alert.delay( @@ -71,6 +73,7 @@ class InboundEmailWebhookView(AlertChannelDefiningMixin, APIView): link_to_upstream_details=None, integration_unique_data=None, raw_request_data=payload, + received_at=timestamp, ) return Response("OK", status=status.HTTP_200_OK) diff --git a/engine/apps/heartbeat/tasks.py b/engine/apps/heartbeat/tasks.py index 8c918d9f..7939290e 100644 --- a/engine/apps/heartbeat/tasks.py +++ b/engine/apps/heartbeat/tasks.py @@ -56,6 +56,7 @@ def check_heartbeats() -> str: .select_related("alert_receive_channel") ) # Schedule alert creation for each expired heartbeat after transaction commit + timestamp = timezone.now().isoformat() for heartbeat in expired_heartbeats: transaction.on_commit( partial( @@ -68,6 +69,7 @@ def check_heartbeats() -> str: "alert_receive_channel_pk": heartbeat.alert_receive_channel.pk, "integration_unique_data": {}, "raw_request_data": heartbeat.alert_receive_channel.heartbeat_expired_payload, + "received_at": timestamp, }, ) ) @@ -82,6 +84,7 @@ def check_heartbeats() -> str: last_heartbeat_time__gte=F("period_start"), previous_alerted_state_was_life=False ) # Schedule auto-resolve alert creation for each expired heartbeat after transaction commit + timestamp = timezone.now().isoformat() for heartbeat in restored_heartbeats: transaction.on_commit( partial( @@ -94,6 +97,7 @@ def check_heartbeats() -> str: "alert_receive_channel_pk": heartbeat.alert_receive_channel.pk, "integration_unique_data": {}, "raw_request_data": heartbeat.alert_receive_channel.heartbeat_restored_payload, + "received_at": timestamp, }, ) ) diff --git a/engine/apps/integrations/tasks.py b/engine/apps/integrations/tasks.py index 46101b93..476e9c58 100644 --- a/engine/apps/integrations/tasks.py +++ b/engine/apps/integrations/tasks.py @@ -23,7 +23,7 @@ logger.setLevel(logging.DEBUG) retry_backoff=True, max_retries=1 if settings.DEBUG else None, ) -def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, force_route_id=None): +def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, force_route_id=None, received_at=None): from apps.alerts.models import Alert, AlertReceiveChannel alert_receive_channel = AlertReceiveChannel.objects_with_deleted.get(pk=alert_receive_channel_pk) @@ -46,6 +46,7 @@ def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, f enable_autoresolve=False, is_demo=is_demo, force_route_id=force_route_id, + received_at=received_at, ) except ConcurrentUpdateError: # This error is raised when there are concurrent updates on AlertGroupCounter due to optimistic lock on it. @@ -83,6 +84,7 @@ def create_alert( raw_request_data, is_demo=False, force_route_id=None, + received_at=None, ): from apps.alerts.models import Alert, AlertReceiveChannel @@ -105,6 +107,7 @@ def create_alert( raw_request_data=raw_request_data, force_route_id=force_route_id, is_demo=is_demo, + received_at=received_at, ) logger.info( f"Created alert alert_id={alert.pk} alert_group_id={alert.group.pk} channel_id={alert_receive_channel.pk}" @@ -123,6 +126,9 @@ def create_alert( integration_unique_data, raw_request_data, ), + kwargs={ + "received_at": received_at, + }, countdown=countdown, ) logger.warning(f"Retrying the task gracefully in {countdown} seconds due to ConcurrentUpdateError") diff --git a/engine/apps/integrations/tests/test_views.py b/engine/apps/integrations/tests/test_views.py index 97fe1a59..7cfaa61a 100644 --- a/engine/apps/integrations/tests/test_views.py +++ b/engine/apps/integrations/tests/test_views.py @@ -4,6 +4,7 @@ import pytest from django.core.files.uploadedfile import SimpleUploadedFile from django.db import OperationalError from django.urls import reverse +from django.utils import timezone from pytest_django.plugin import _DatabaseBlocker from rest_framework import status from rest_framework.test import APIClient @@ -98,7 +99,10 @@ def test_integration_universal_endpoint( ) data = {"foo": "bar"} - response = client.post(url, data, format="json") + now = timezone.now() + with patch("django.utils.timezone.now") as mock_now: + mock_now.return_value = now + response = client.post(url, data, format="json") assert response.status_code == status.HTTP_200_OK mock_create_alert.apply_async.assert_called_once_with( @@ -111,6 +115,7 @@ def test_integration_universal_endpoint( "alert_receive_channel_pk": alert_receive_channel.pk, "integration_unique_data": None, "raw_request_data": data, + "received_at": now.isoformat(), }, ) @@ -165,13 +170,16 @@ def test_integration_grafana_endpoint_has_alerts( }, ] } - response = client.post(url, data, format="json") + now = timezone.now() + with patch("django.utils.timezone.now") as mock_now: + mock_now.return_value = now + response = client.post(url, data, format="json") assert response.status_code == status.HTTP_200_OK mock_create_alertmanager_alerts.apply_async.assert_has_calls( [ - call((alert_receive_channel.pk, data["alerts"][0])), - call((alert_receive_channel.pk, data["alerts"][1])), + call((alert_receive_channel.pk, data["alerts"][0]), kwargs={"received_at": now.isoformat()}), + call((alert_receive_channel.pk, data["alerts"][1]), kwargs={"received_at": now.isoformat()}), ] ) @@ -239,10 +247,13 @@ def test_integration_universal_endpoint_works_without_db( # populate cache AlertChannelDefiningMixin().update_alert_receive_channel_cache() - # disable DB access - with DatabaseBlocker().block(): - data = {"foo": "bar"} - response = client.post(url, data, format="json") + now = timezone.now() + with patch("django.utils.timezone.now") as mock_now: + mock_now.return_value = now + # disable DB access + with DatabaseBlocker().block(): + data = {"foo": "bar"} + response = client.post(url, data, format="json") assert response.status_code == status.HTTP_200_OK @@ -256,6 +267,7 @@ def test_integration_universal_endpoint_works_without_db( "alert_receive_channel_pk": alert_receive_channel.pk, "integration_unique_data": None, "raw_request_data": data, + "received_at": now.isoformat(), }, ) @@ -292,16 +304,19 @@ def test_integration_grafana_endpoint_without_db_has_alerts( # populate cache AlertChannelDefiningMixin().update_alert_receive_channel_cache() - # disable DB access - with DatabaseBlocker().block(): - response = client.post(url, data, format="json") + now = timezone.now() + with patch("django.utils.timezone.now") as mock_now: + mock_now.return_value = now + # disable DB access + with DatabaseBlocker().block(): + response = client.post(url, data, format="json") assert response.status_code == status.HTTP_200_OK mock_create_alertmanager_alerts.apply_async.assert_has_calls( [ - call((alert_receive_channel.pk, data["alerts"][0])), - call((alert_receive_channel.pk, data["alerts"][1])), + call((alert_receive_channel.pk, data["alerts"][0]), kwargs={"received_at": now.isoformat()}), + call((alert_receive_channel.pk, data["alerts"][1]), kwargs={"received_at": now.isoformat()}), ] ) @@ -339,7 +354,10 @@ def test_integration_universal_endpoint_works_without_cache( kwargs={"integration_type": integration_type, "alert_channel_key": alert_receive_channel.token}, ) data = {"foo": "bar"} - response = client.post(url, data, format="json") + now = timezone.now() + with patch("django.utils.timezone.now") as mock_now: + mock_now.return_value = now + response = client.post(url, data, format="json") assert response.status_code == status.HTTP_200_OK @@ -353,6 +371,7 @@ def test_integration_universal_endpoint_works_without_cache( "alert_receive_channel_pk": alert_receive_channel.pk, "integration_unique_data": None, "raw_request_data": data, + "received_at": now.isoformat(), }, ) @@ -387,13 +406,16 @@ def test_integration_grafana_endpoint_without_cache_has_alerts( }, ] } - response = client.post(url, data, format="json") + now = timezone.now() + with patch("django.utils.timezone.now") as mock_now: + mock_now.return_value = now + response = client.post(url, data, format="json") assert response.status_code == status.HTTP_200_OK mock_create_alertmanager_alerts.apply_async.assert_has_calls( [ - call((alert_receive_channel.pk, data["alerts"][0])), - call((alert_receive_channel.pk, data["alerts"][1])), + call((alert_receive_channel.pk, data["alerts"][0]), kwargs={"received_at": now.isoformat()}), + call((alert_receive_channel.pk, data["alerts"][1]), kwargs={"received_at": now.isoformat()}), ] ) diff --git a/engine/apps/integrations/views.py b/engine/apps/integrations/views.py index 61e51bea..05ea76f3 100644 --- a/engine/apps/integrations/views.py +++ b/engine/apps/integrations/views.py @@ -4,6 +4,7 @@ import logging from django.conf import settings from django.core.exceptions import PermissionDenied from django.http import HttpResponseBadRequest, JsonResponse +from django.utils import timezone from django.utils.decorators import method_decorator from django.views.decorators.csrf import csrf_exempt from django_sns_view.views import SNSEndpoint @@ -75,6 +76,7 @@ class AmazonSNS(BrowsableInstructionMixin, AlertChannelDefiningMixin, Integratio link_to_upstream = None raw_request_data = {"message": message} + timestamp = timezone.now().isoformat() create_alert.apply_async( [], { @@ -85,6 +87,7 @@ class AmazonSNS(BrowsableInstructionMixin, AlertChannelDefiningMixin, Integratio "alert_receive_channel_pk": alert_receive_channel.pk, "integration_unique_data": None, "raw_request_data": raw_request_data, + "received_at": timestamp, }, ) @@ -119,16 +122,19 @@ class AlertManagerAPIView( """ process_v1 creates alerts from each alert in incoming AlertManager payload. """ + now = timezone.now() for alert in request.data.get("alerts", []): if settings.DEBUG: - create_alertmanager_alerts(alert_receive_channel.pk, alert) + create_alertmanager_alerts(alert_receive_channel.pk, alert, received_at=now.isoformat()) else: self.execute_rate_limit_with_notification_logic() if self.request.limited and not is_ratelimit_ignored(alert_receive_channel): return self.get_ratelimit_http_response() - create_alertmanager_alerts.apply_async((alert_receive_channel.pk, alert)) + create_alertmanager_alerts.apply_async( + (alert_receive_channel.pk, alert), kwargs={"received_at": now.isoformat()} + ) def process_v2(self, request, alert_receive_channel): """ @@ -143,6 +149,7 @@ class AlertManagerAPIView( num_resolved = len(list(filter(lambda a: a.get("status", "") == "resolved", alerts))) data = {**request.data, "numFiring": num_firing, "numResolved": num_resolved} + timestamp = timezone.now().isoformat() create_alert.apply_async( [], { @@ -153,6 +160,7 @@ class AlertManagerAPIView( "alert_receive_channel_pk": alert_receive_channel.pk, "integration_unique_data": None, "raw_request_data": data, + "received_at": timestamp, }, ) @@ -191,16 +199,19 @@ class GrafanaAPIView( # Grafana Alerting 9 has the same payload structure as AlertManager if "alerts" in request.data: + now = timezone.now() for alert in request.data.get("alerts", []): if settings.DEBUG: - create_alertmanager_alerts(alert_receive_channel.pk, alert) + create_alertmanager_alerts(alert_receive_channel.pk, alert, received_at=now.isoformat()) else: self.execute_rate_limit_with_notification_logic() if self.request.limited and not is_ratelimit_ignored(alert_receive_channel): return self.get_ratelimit_http_response() - create_alertmanager_alerts.apply_async((alert_receive_channel.pk, alert)) + create_alertmanager_alerts.apply_async( + (alert_receive_channel.pk, alert), kwargs={"received_at": now.isoformat()} + ) return Response("Ok.") """ @@ -254,6 +265,7 @@ class GrafanaAPIView( """ attachment = request.data["attachments"][0] + timestamp = timezone.now().isoformat() create_alert.apply_async( [], { @@ -273,6 +285,7 @@ class GrafanaAPIView( } ), "raw_request_data": request.data, + "received_at": timestamp, }, ) else: @@ -286,6 +299,7 @@ class GrafanaAPIView( "alert_receive_channel_pk": alert_receive_channel.pk, "integration_unique_data": json.dumps({"evalMatches": request.data.get("evalMatches", [])}), "raw_request_data": request.data, + "received_at": timestamp, }, ) return Response("Ok.") @@ -305,6 +319,7 @@ class UniversalAPIView(BrowsableInstructionMixin, AlertChannelDefiningMixin, Int f"This url is for integration with {alert_receive_channel.config.title}." f"Key is for {alert_receive_channel.get_integration_display()}" ) + timestamp = timezone.now().isoformat() create_alert.apply_async( [], { @@ -315,6 +330,7 @@ class UniversalAPIView(BrowsableInstructionMixin, AlertChannelDefiningMixin, Int "alert_receive_channel_pk": alert_receive_channel.pk, "integration_unique_data": None, "raw_request_data": request.data, + "received_at": timestamp, }, ) return Response("Ok.")