Track alert received timestamp on alert group creation (#3513)
Keep a record of the timestamp at which the alert group creation task is triggered, allowing tracking of the delta between the alert received datetime and the alert group creation timestamp. Related to https://github.com/grafana/oncall-private/issues/2347
This commit is contained in:
parent
147503eea8
commit
e053eb084d
11 changed files with 129 additions and 24 deletions
|
|
@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## Unreleased
|
||||
|
||||
### Added
|
||||
|
||||
- Track alert received timestamp on alert group creation ([#3513](https://github.com/grafana/oncall/pull/3513))
|
||||
|
||||
## v1.3.72 (2023-12-05)
|
||||
|
||||
### Fixed
|
||||
|
|
|
|||
18
engine/apps/alerts/migrations/0042_alertgroup_received_at.py
Normal file
18
engine/apps/alerts/migrations/0042_alertgroup_received_at.py
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
# Generated by Django 4.2.7 on 2023-12-05 18:06

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the nullable ``received_at`` timestamp column to ``AlertGroup``.

    The column records when the alert that created the group was received,
    so the delta between alert receipt and alert group creation can be
    tracked.
    """

    # Must apply after the unique-direct-paging-integration migration.
    dependencies = [
        ('alerts', '0041_alertreceivechannel_unique_direct_paging_integration_per_team'),
    ]

    operations = [
        # Nullable with default None: existing rows need no backfill and
        # callers that do not supply a timestamp are unaffected.
        migrations.AddField(
            model_name='alertgroup',
            name='received_at',
            field=models.DateTimeField(blank=True, default=None, null=True),
        ),
    ]
|
||||
|
|
@ -90,6 +90,7 @@ class Alert(models.Model):
|
|||
is_demo=False,
|
||||
channel_filter=None,
|
||||
force_route_id=None,
|
||||
received_at=None,
|
||||
):
|
||||
"""
|
||||
Creates an alert and a group if needed.
|
||||
|
|
@ -105,6 +106,7 @@ class Alert(models.Model):
|
|||
channel=alert_receive_channel,
|
||||
channel_filter=channel_filter,
|
||||
group_data=group_data,
|
||||
received_at=received_at,
|
||||
)
|
||||
|
||||
if group_created:
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ class AlertGroupQuerySet(models.QuerySet):
|
|||
inside_organization_number = AlertGroupCounter.objects.get_value(organization=organization) + 1
|
||||
return super().create(**kwargs, inside_organization_number=inside_organization_number)
|
||||
|
||||
def get_or_create_grouping(self, channel, channel_filter, group_data):
|
||||
def get_or_create_grouping(self, channel, channel_filter, group_data, received_at=None):
|
||||
"""
|
||||
This method is similar to default Django QuerySet.get_or_create(), please see the original get_or_create method.
|
||||
The difference is that this method is trying to get an object using multiple queries with different filters.
|
||||
|
|
@ -131,7 +131,10 @@ class AlertGroupQuerySet(models.QuerySet):
|
|||
# Create a new group if we couldn't group it to any existing ones
|
||||
try:
|
||||
alert_group = self.create(
|
||||
**search_params, is_open_for_grouping=True, web_title_cache=group_data.web_title_cache
|
||||
**search_params,
|
||||
is_open_for_grouping=True,
|
||||
web_title_cache=group_data.web_title_cache,
|
||||
received_at=received_at,
|
||||
)
|
||||
alert_group_created_signal.send(sender=self.__class__, alert_group=alert_group)
|
||||
return (alert_group, True)
|
||||
|
|
@ -334,6 +337,8 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
|
|||
|
||||
response_time = models.DurationField(null=True, default=None)
|
||||
|
||||
received_at = models.DateTimeField(blank=True, null=True, default=None)
|
||||
|
||||
@property
|
||||
def is_silenced_forever(self):
|
||||
return self.silenced and self.silenced_until is None
|
||||
|
|
|
|||
|
|
@ -650,6 +650,7 @@ class AlertReceiveChannel(IntegrationOptionsMixin, MaintainableObject):
|
|||
for alert in alerts:
|
||||
create_alertmanager_alerts.delay(alert_receive_channel_pk=self.pk, alert=alert, is_demo=True)
|
||||
else:
|
||||
timestamp = timezone.now().isoformat()
|
||||
create_alert.delay(
|
||||
title="Demo alert",
|
||||
message="Demo alert",
|
||||
|
|
@ -659,6 +660,7 @@ class AlertReceiveChannel(IntegrationOptionsMixin, MaintainableObject):
|
|||
integration_unique_data=None,
|
||||
raw_request_data=payload,
|
||||
is_demo=True,
|
||||
received_at=timestamp,
|
||||
)
|
||||
|
||||
@property
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
from unittest.mock import PropertyMock, patch
|
||||
|
||||
import pytest
|
||||
from django.utils import timezone
|
||||
|
||||
from apps.alerts.models import Alert, EscalationPolicy
|
||||
from apps.alerts.tasks import distribute_alert, escalate_alert_group
|
||||
|
|
@ -56,6 +57,28 @@ def test_alert_create_custom_channel_filter(make_organization, make_alert_receiv
|
|||
assert alert.group.channel_filter == other_channel_filter
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_alert_create_track_received_at_timestamp(make_organization, make_alert_receive_channel):
    """Alert.create propagates the received_at timestamp onto the alert group."""
    organization = make_organization()
    alert_receive_channel = make_alert_receive_channel(organization)

    received_ts = timezone.now()
    # received_at travels as an ISO-8601 string (the task payload format);
    # Django parses it back into a datetime when the group row is saved.
    alert = Alert.create(
        title="the title",
        message="the message",
        alert_receive_channel=alert_receive_channel,
        raw_request_data={},
        integration_unique_data={},
        image_url=None,
        link_to_upstream_details=None,
        received_at=received_ts.isoformat(),
    )

    group = alert.group
    group.refresh_from_db()
    assert group.received_at == received_ts
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_distribute_alert_escalate_alert_group(
|
||||
make_organization,
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from anymail.inbound import AnymailInboundMessage
|
|||
from anymail.signals import AnymailInboundEvent
|
||||
from anymail.webhooks import amazon_ses, mailgun, mailjet, mandrill, postal, postmark, sendgrid, sparkpost
|
||||
from django.http import HttpResponse, HttpResponseNotAllowed
|
||||
from django.utils import timezone
|
||||
from rest_framework import status
|
||||
from rest_framework.request import Request
|
||||
from rest_framework.response import Response
|
||||
|
|
@ -61,6 +62,7 @@ class InboundEmailWebhookView(AlertChannelDefiningMixin, APIView):
|
|||
return super().dispatch(request, alert_channel_key=integration_token)
|
||||
|
||||
def post(self, request):
|
||||
timestamp = timezone.now().isoformat()
|
||||
for message in self.get_messages_from_esp_request(request):
|
||||
payload = self.get_alert_payload_from_email_message(message)
|
||||
create_alert.delay(
|
||||
|
|
@ -71,6 +73,7 @@ class InboundEmailWebhookView(AlertChannelDefiningMixin, APIView):
|
|||
link_to_upstream_details=None,
|
||||
integration_unique_data=None,
|
||||
raw_request_data=payload,
|
||||
received_at=timestamp,
|
||||
)
|
||||
|
||||
return Response("OK", status=status.HTTP_200_OK)
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@ def check_heartbeats() -> str:
|
|||
.select_related("alert_receive_channel")
|
||||
)
|
||||
# Schedule alert creation for each expired heartbeat after transaction commit
|
||||
timestamp = timezone.now().isoformat()
|
||||
for heartbeat in expired_heartbeats:
|
||||
transaction.on_commit(
|
||||
partial(
|
||||
|
|
@ -68,6 +69,7 @@ def check_heartbeats() -> str:
|
|||
"alert_receive_channel_pk": heartbeat.alert_receive_channel.pk,
|
||||
"integration_unique_data": {},
|
||||
"raw_request_data": heartbeat.alert_receive_channel.heartbeat_expired_payload,
|
||||
"received_at": timestamp,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
|
@ -82,6 +84,7 @@ def check_heartbeats() -> str:
|
|||
last_heartbeat_time__gte=F("period_start"), previous_alerted_state_was_life=False
|
||||
)
|
||||
# Schedule auto-resolve alert creation for each expired heartbeat after transaction commit
|
||||
timestamp = timezone.now().isoformat()
|
||||
for heartbeat in restored_heartbeats:
|
||||
transaction.on_commit(
|
||||
partial(
|
||||
|
|
@ -94,6 +97,7 @@ def check_heartbeats() -> str:
|
|||
"alert_receive_channel_pk": heartbeat.alert_receive_channel.pk,
|
||||
"integration_unique_data": {},
|
||||
"raw_request_data": heartbeat.alert_receive_channel.heartbeat_restored_payload,
|
||||
"received_at": timestamp,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ logger.setLevel(logging.DEBUG)
|
|||
retry_backoff=True,
|
||||
max_retries=1 if settings.DEBUG else None,
|
||||
)
|
||||
def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, force_route_id=None):
|
||||
def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, force_route_id=None, received_at=None):
|
||||
from apps.alerts.models import Alert, AlertReceiveChannel
|
||||
|
||||
alert_receive_channel = AlertReceiveChannel.objects_with_deleted.get(pk=alert_receive_channel_pk)
|
||||
|
|
@ -46,6 +46,7 @@ def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, f
|
|||
enable_autoresolve=False,
|
||||
is_demo=is_demo,
|
||||
force_route_id=force_route_id,
|
||||
received_at=received_at,
|
||||
)
|
||||
except ConcurrentUpdateError:
|
||||
# This error is raised when there are concurrent updates on AlertGroupCounter due to optimistic lock on it.
|
||||
|
|
@ -83,6 +84,7 @@ def create_alert(
|
|||
raw_request_data,
|
||||
is_demo=False,
|
||||
force_route_id=None,
|
||||
received_at=None,
|
||||
):
|
||||
from apps.alerts.models import Alert, AlertReceiveChannel
|
||||
|
||||
|
|
@ -105,6 +107,7 @@ def create_alert(
|
|||
raw_request_data=raw_request_data,
|
||||
force_route_id=force_route_id,
|
||||
is_demo=is_demo,
|
||||
received_at=received_at,
|
||||
)
|
||||
logger.info(
|
||||
f"Created alert alert_id={alert.pk} alert_group_id={alert.group.pk} channel_id={alert_receive_channel.pk}"
|
||||
|
|
@ -123,6 +126,9 @@ def create_alert(
|
|||
integration_unique_data,
|
||||
raw_request_data,
|
||||
),
|
||||
kwargs={
|
||||
"received_at": received_at,
|
||||
},
|
||||
countdown=countdown,
|
||||
)
|
||||
logger.warning(f"Retrying the task gracefully in {countdown} seconds due to ConcurrentUpdateError")
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import pytest
|
|||
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||
from django.db import OperationalError
|
||||
from django.urls import reverse
|
||||
from django.utils import timezone
|
||||
from pytest_django.plugin import _DatabaseBlocker
|
||||
from rest_framework import status
|
||||
from rest_framework.test import APIClient
|
||||
|
|
@ -98,7 +99,10 @@ def test_integration_universal_endpoint(
|
|||
)
|
||||
|
||||
data = {"foo": "bar"}
|
||||
response = client.post(url, data, format="json")
|
||||
now = timezone.now()
|
||||
with patch("django.utils.timezone.now") as mock_now:
|
||||
mock_now.return_value = now
|
||||
response = client.post(url, data, format="json")
|
||||
assert response.status_code == status.HTTP_200_OK
|
||||
|
||||
mock_create_alert.apply_async.assert_called_once_with(
|
||||
|
|
@ -111,6 +115,7 @@ def test_integration_universal_endpoint(
|
|||
"alert_receive_channel_pk": alert_receive_channel.pk,
|
||||
"integration_unique_data": None,
|
||||
"raw_request_data": data,
|
||||
"received_at": now.isoformat(),
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -165,13 +170,16 @@ def test_integration_grafana_endpoint_has_alerts(
|
|||
},
|
||||
]
|
||||
}
|
||||
response = client.post(url, data, format="json")
|
||||
now = timezone.now()
|
||||
with patch("django.utils.timezone.now") as mock_now:
|
||||
mock_now.return_value = now
|
||||
response = client.post(url, data, format="json")
|
||||
assert response.status_code == status.HTTP_200_OK
|
||||
|
||||
mock_create_alertmanager_alerts.apply_async.assert_has_calls(
|
||||
[
|
||||
call((alert_receive_channel.pk, data["alerts"][0])),
|
||||
call((alert_receive_channel.pk, data["alerts"][1])),
|
||||
call((alert_receive_channel.pk, data["alerts"][0]), kwargs={"received_at": now.isoformat()}),
|
||||
call((alert_receive_channel.pk, data["alerts"][1]), kwargs={"received_at": now.isoformat()}),
|
||||
]
|
||||
)
|
||||
|
||||
|
|
@ -239,10 +247,13 @@ def test_integration_universal_endpoint_works_without_db(
|
|||
# populate cache
|
||||
AlertChannelDefiningMixin().update_alert_receive_channel_cache()
|
||||
|
||||
# disable DB access
|
||||
with DatabaseBlocker().block():
|
||||
data = {"foo": "bar"}
|
||||
response = client.post(url, data, format="json")
|
||||
now = timezone.now()
|
||||
with patch("django.utils.timezone.now") as mock_now:
|
||||
mock_now.return_value = now
|
||||
# disable DB access
|
||||
with DatabaseBlocker().block():
|
||||
data = {"foo": "bar"}
|
||||
response = client.post(url, data, format="json")
|
||||
|
||||
assert response.status_code == status.HTTP_200_OK
|
||||
|
||||
|
|
@ -256,6 +267,7 @@ def test_integration_universal_endpoint_works_without_db(
|
|||
"alert_receive_channel_pk": alert_receive_channel.pk,
|
||||
"integration_unique_data": None,
|
||||
"raw_request_data": data,
|
||||
"received_at": now.isoformat(),
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -292,16 +304,19 @@ def test_integration_grafana_endpoint_without_db_has_alerts(
|
|||
# populate cache
|
||||
AlertChannelDefiningMixin().update_alert_receive_channel_cache()
|
||||
|
||||
# disable DB access
|
||||
with DatabaseBlocker().block():
|
||||
response = client.post(url, data, format="json")
|
||||
now = timezone.now()
|
||||
with patch("django.utils.timezone.now") as mock_now:
|
||||
mock_now.return_value = now
|
||||
# disable DB access
|
||||
with DatabaseBlocker().block():
|
||||
response = client.post(url, data, format="json")
|
||||
|
||||
assert response.status_code == status.HTTP_200_OK
|
||||
|
||||
mock_create_alertmanager_alerts.apply_async.assert_has_calls(
|
||||
[
|
||||
call((alert_receive_channel.pk, data["alerts"][0])),
|
||||
call((alert_receive_channel.pk, data["alerts"][1])),
|
||||
call((alert_receive_channel.pk, data["alerts"][0]), kwargs={"received_at": now.isoformat()}),
|
||||
call((alert_receive_channel.pk, data["alerts"][1]), kwargs={"received_at": now.isoformat()}),
|
||||
]
|
||||
)
|
||||
|
||||
|
|
@ -339,7 +354,10 @@ def test_integration_universal_endpoint_works_without_cache(
|
|||
kwargs={"integration_type": integration_type, "alert_channel_key": alert_receive_channel.token},
|
||||
)
|
||||
data = {"foo": "bar"}
|
||||
response = client.post(url, data, format="json")
|
||||
now = timezone.now()
|
||||
with patch("django.utils.timezone.now") as mock_now:
|
||||
mock_now.return_value = now
|
||||
response = client.post(url, data, format="json")
|
||||
|
||||
assert response.status_code == status.HTTP_200_OK
|
||||
|
||||
|
|
@ -353,6 +371,7 @@ def test_integration_universal_endpoint_works_without_cache(
|
|||
"alert_receive_channel_pk": alert_receive_channel.pk,
|
||||
"integration_unique_data": None,
|
||||
"raw_request_data": data,
|
||||
"received_at": now.isoformat(),
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -387,13 +406,16 @@ def test_integration_grafana_endpoint_without_cache_has_alerts(
|
|||
},
|
||||
]
|
||||
}
|
||||
response = client.post(url, data, format="json")
|
||||
now = timezone.now()
|
||||
with patch("django.utils.timezone.now") as mock_now:
|
||||
mock_now.return_value = now
|
||||
response = client.post(url, data, format="json")
|
||||
|
||||
assert response.status_code == status.HTTP_200_OK
|
||||
|
||||
mock_create_alertmanager_alerts.apply_async.assert_has_calls(
|
||||
[
|
||||
call((alert_receive_channel.pk, data["alerts"][0])),
|
||||
call((alert_receive_channel.pk, data["alerts"][1])),
|
||||
call((alert_receive_channel.pk, data["alerts"][0]), kwargs={"received_at": now.isoformat()}),
|
||||
call((alert_receive_channel.pk, data["alerts"][1]), kwargs={"received_at": now.isoformat()}),
|
||||
]
|
||||
)
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import logging
|
|||
from django.conf import settings
|
||||
from django.core.exceptions import PermissionDenied
|
||||
from django.http import HttpResponseBadRequest, JsonResponse
|
||||
from django.utils import timezone
|
||||
from django.utils.decorators import method_decorator
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
from django_sns_view.views import SNSEndpoint
|
||||
|
|
@ -75,6 +76,7 @@ class AmazonSNS(BrowsableInstructionMixin, AlertChannelDefiningMixin, Integratio
|
|||
link_to_upstream = None
|
||||
raw_request_data = {"message": message}
|
||||
|
||||
timestamp = timezone.now().isoformat()
|
||||
create_alert.apply_async(
|
||||
[],
|
||||
{
|
||||
|
|
@ -85,6 +87,7 @@ class AmazonSNS(BrowsableInstructionMixin, AlertChannelDefiningMixin, Integratio
|
|||
"alert_receive_channel_pk": alert_receive_channel.pk,
|
||||
"integration_unique_data": None,
|
||||
"raw_request_data": raw_request_data,
|
||||
"received_at": timestamp,
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -119,16 +122,19 @@ class AlertManagerAPIView(
|
|||
"""
|
||||
process_v1 creates alerts from each alert in incoming AlertManager payload.
|
||||
"""
|
||||
now = timezone.now()
|
||||
for alert in request.data.get("alerts", []):
|
||||
if settings.DEBUG:
|
||||
create_alertmanager_alerts(alert_receive_channel.pk, alert)
|
||||
create_alertmanager_alerts(alert_receive_channel.pk, alert, received_at=now.isoformat())
|
||||
else:
|
||||
self.execute_rate_limit_with_notification_logic()
|
||||
|
||||
if self.request.limited and not is_ratelimit_ignored(alert_receive_channel):
|
||||
return self.get_ratelimit_http_response()
|
||||
|
||||
create_alertmanager_alerts.apply_async((alert_receive_channel.pk, alert))
|
||||
create_alertmanager_alerts.apply_async(
|
||||
(alert_receive_channel.pk, alert), kwargs={"received_at": now.isoformat()}
|
||||
)
|
||||
|
||||
def process_v2(self, request, alert_receive_channel):
|
||||
"""
|
||||
|
|
@ -143,6 +149,7 @@ class AlertManagerAPIView(
|
|||
num_resolved = len(list(filter(lambda a: a.get("status", "") == "resolved", alerts)))
|
||||
data = {**request.data, "numFiring": num_firing, "numResolved": num_resolved}
|
||||
|
||||
timestamp = timezone.now().isoformat()
|
||||
create_alert.apply_async(
|
||||
[],
|
||||
{
|
||||
|
|
@ -153,6 +160,7 @@ class AlertManagerAPIView(
|
|||
"alert_receive_channel_pk": alert_receive_channel.pk,
|
||||
"integration_unique_data": None,
|
||||
"raw_request_data": data,
|
||||
"received_at": timestamp,
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -191,16 +199,19 @@ class GrafanaAPIView(
|
|||
|
||||
# Grafana Alerting 9 has the same payload structure as AlertManager
|
||||
if "alerts" in request.data:
|
||||
now = timezone.now()
|
||||
for alert in request.data.get("alerts", []):
|
||||
if settings.DEBUG:
|
||||
create_alertmanager_alerts(alert_receive_channel.pk, alert)
|
||||
create_alertmanager_alerts(alert_receive_channel.pk, alert, received_at=now.isoformat())
|
||||
else:
|
||||
self.execute_rate_limit_with_notification_logic()
|
||||
|
||||
if self.request.limited and not is_ratelimit_ignored(alert_receive_channel):
|
||||
return self.get_ratelimit_http_response()
|
||||
|
||||
create_alertmanager_alerts.apply_async((alert_receive_channel.pk, alert))
|
||||
create_alertmanager_alerts.apply_async(
|
||||
(alert_receive_channel.pk, alert), kwargs={"received_at": now.isoformat()}
|
||||
)
|
||||
return Response("Ok.")
|
||||
|
||||
"""
|
||||
|
|
@ -254,6 +265,7 @@ class GrafanaAPIView(
|
|||
"""
|
||||
attachment = request.data["attachments"][0]
|
||||
|
||||
timestamp = timezone.now().isoformat()
|
||||
create_alert.apply_async(
|
||||
[],
|
||||
{
|
||||
|
|
@ -273,6 +285,7 @@ class GrafanaAPIView(
|
|||
}
|
||||
),
|
||||
"raw_request_data": request.data,
|
||||
"received_at": timestamp,
|
||||
},
|
||||
)
|
||||
else:
|
||||
|
|
@ -286,6 +299,7 @@ class GrafanaAPIView(
|
|||
"alert_receive_channel_pk": alert_receive_channel.pk,
|
||||
"integration_unique_data": json.dumps({"evalMatches": request.data.get("evalMatches", [])}),
|
||||
"raw_request_data": request.data,
|
||||
"received_at": timestamp,
|
||||
},
|
||||
)
|
||||
return Response("Ok.")
|
||||
|
|
@ -305,6 +319,7 @@ class UniversalAPIView(BrowsableInstructionMixin, AlertChannelDefiningMixin, Int
|
|||
f"This url is for integration with {alert_receive_channel.config.title}."
|
||||
f"Key is for {alert_receive_channel.get_integration_display()}"
|
||||
)
|
||||
timestamp = timezone.now().isoformat()
|
||||
create_alert.apply_async(
|
||||
[],
|
||||
{
|
||||
|
|
@ -315,6 +330,7 @@ class UniversalAPIView(BrowsableInstructionMixin, AlertChannelDefiningMixin, Int
|
|||
"alert_receive_channel_pk": alert_receive_channel.pk,
|
||||
"integration_unique_data": None,
|
||||
"raw_request_data": request.data,
|
||||
"received_at": timestamp,
|
||||
},
|
||||
)
|
||||
return Response("Ok.")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue