Track alert received timestamp on alert group creation (#3513)

Keep a record of the timestamp at which the alert group creation task is
triggered, making it possible to track the time delta between the alert
received datetime and the alert group creation timestamp.

Related to https://github.com/grafana/oncall-private/issues/2347
This commit is contained in:
Matias Bordese 2023-12-06 09:20:03 -03:00 committed by GitHub
parent 147503eea8
commit e053eb084d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 129 additions and 24 deletions

View file

@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased
### Added
- Track alert received timestamp on alert group creation ([#3513](https://github.com/grafana/oncall/pull/3513))
## v1.3.72 (2023-12-05)
### Fixed

View file

@ -0,0 +1,18 @@
# Generated by Django 4.2.7 on 2023-12-05 18:06
from django.db import migrations, models
class Migration(migrations.Migration):
    # Adds the nullable AlertGroup.received_at column used to record when
    # the alert group creation task was triggered for the group.

    dependencies = [
        ('alerts', '0041_alertreceivechannel_unique_direct_paging_integration_per_team'),
    ]

    operations = [
        migrations.AddField(
            model_name='alertgroup',
            name='received_at',
            # Nullable with default=None so existing rows — and alert groups
            # created without a received timestamp — simply keep NULL.
            field=models.DateTimeField(blank=True, default=None, null=True),
        ),
    ]

View file

@ -90,6 +90,7 @@ class Alert(models.Model):
is_demo=False,
channel_filter=None,
force_route_id=None,
received_at=None,
):
"""
Creates an alert and a group if needed.
@ -105,6 +106,7 @@ class Alert(models.Model):
channel=alert_receive_channel,
channel_filter=channel_filter,
group_data=group_data,
received_at=received_at,
)
if group_created:

View file

@ -101,7 +101,7 @@ class AlertGroupQuerySet(models.QuerySet):
inside_organization_number = AlertGroupCounter.objects.get_value(organization=organization) + 1
return super().create(**kwargs, inside_organization_number=inside_organization_number)
def get_or_create_grouping(self, channel, channel_filter, group_data):
def get_or_create_grouping(self, channel, channel_filter, group_data, received_at=None):
"""
This method is similar to default Django QuerySet.get_or_create(), please see the original get_or_create method.
The difference is that this method is trying to get an object using multiple queries with different filters.
@ -131,7 +131,10 @@ class AlertGroupQuerySet(models.QuerySet):
# Create a new group if we couldn't group it to any existing ones
try:
alert_group = self.create(
**search_params, is_open_for_grouping=True, web_title_cache=group_data.web_title_cache
**search_params,
is_open_for_grouping=True,
web_title_cache=group_data.web_title_cache,
received_at=received_at,
)
alert_group_created_signal.send(sender=self.__class__, alert_group=alert_group)
return (alert_group, True)
@ -334,6 +337,8 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
response_time = models.DurationField(null=True, default=None)
received_at = models.DateTimeField(blank=True, null=True, default=None)
@property
def is_silenced_forever(self):
return self.silenced and self.silenced_until is None

View file

@ -650,6 +650,7 @@ class AlertReceiveChannel(IntegrationOptionsMixin, MaintainableObject):
for alert in alerts:
create_alertmanager_alerts.delay(alert_receive_channel_pk=self.pk, alert=alert, is_demo=True)
else:
timestamp = timezone.now().isoformat()
create_alert.delay(
title="Demo alert",
message="Demo alert",
@ -659,6 +660,7 @@ class AlertReceiveChannel(IntegrationOptionsMixin, MaintainableObject):
integration_unique_data=None,
raw_request_data=payload,
is_demo=True,
received_at=timestamp,
)
@property

View file

@ -1,6 +1,7 @@
from unittest.mock import PropertyMock, patch
import pytest
from django.utils import timezone
from apps.alerts.models import Alert, EscalationPolicy
from apps.alerts.tasks import distribute_alert, escalate_alert_group
@ -56,6 +57,28 @@ def test_alert_create_custom_channel_filter(make_organization, make_alert_receiv
assert alert.group.channel_filter == other_channel_filter
@pytest.mark.django_db
def test_alert_create_track_received_at_timestamp(make_organization, make_alert_receive_channel):
    """Alert.create should persist the given received_at value on the new alert group.

    The timestamp is passed as an ISO-8601 string (the form a Celery task
    receives it in); after saving, the DB-stored value must equal the
    original aware datetime.
    """
    organization = make_organization()
    alert_receive_channel = make_alert_receive_channel(organization)

    now = timezone.now()
    alert = Alert.create(
        title="the title",
        message="the message",
        alert_receive_channel=alert_receive_channel,
        raw_request_data={},
        integration_unique_data={},
        image_url=None,
        link_to_upstream_details=None,
        # ISO string round-trips through the ORM back into a datetime
        received_at=now.isoformat(),
    )

    alert_group = alert.group
    # Re-read from the DB to assert on the persisted (parsed) value,
    # not the in-memory string that was assigned.
    alert_group.refresh_from_db()
    assert alert_group.received_at == now
@pytest.mark.django_db
def test_distribute_alert_escalate_alert_group(
make_organization,

View file

@ -6,6 +6,7 @@ from anymail.inbound import AnymailInboundMessage
from anymail.signals import AnymailInboundEvent
from anymail.webhooks import amazon_ses, mailgun, mailjet, mandrill, postal, postmark, sendgrid, sparkpost
from django.http import HttpResponse, HttpResponseNotAllowed
from django.utils import timezone
from rest_framework import status
from rest_framework.request import Request
from rest_framework.response import Response
@ -61,6 +62,7 @@ class InboundEmailWebhookView(AlertChannelDefiningMixin, APIView):
return super().dispatch(request, alert_channel_key=integration_token)
def post(self, request):
timestamp = timezone.now().isoformat()
for message in self.get_messages_from_esp_request(request):
payload = self.get_alert_payload_from_email_message(message)
create_alert.delay(
@ -71,6 +73,7 @@ class InboundEmailWebhookView(AlertChannelDefiningMixin, APIView):
link_to_upstream_details=None,
integration_unique_data=None,
raw_request_data=payload,
received_at=timestamp,
)
return Response("OK", status=status.HTTP_200_OK)

View file

@ -56,6 +56,7 @@ def check_heartbeats() -> str:
.select_related("alert_receive_channel")
)
# Schedule alert creation for each expired heartbeat after transaction commit
timestamp = timezone.now().isoformat()
for heartbeat in expired_heartbeats:
transaction.on_commit(
partial(
@ -68,6 +69,7 @@ def check_heartbeats() -> str:
"alert_receive_channel_pk": heartbeat.alert_receive_channel.pk,
"integration_unique_data": {},
"raw_request_data": heartbeat.alert_receive_channel.heartbeat_expired_payload,
"received_at": timestamp,
},
)
)
@ -82,6 +84,7 @@ def check_heartbeats() -> str:
last_heartbeat_time__gte=F("period_start"), previous_alerted_state_was_life=False
)
# Schedule auto-resolve alert creation for each expired heartbeat after transaction commit
timestamp = timezone.now().isoformat()
for heartbeat in restored_heartbeats:
transaction.on_commit(
partial(
@ -94,6 +97,7 @@ def check_heartbeats() -> str:
"alert_receive_channel_pk": heartbeat.alert_receive_channel.pk,
"integration_unique_data": {},
"raw_request_data": heartbeat.alert_receive_channel.heartbeat_restored_payload,
"received_at": timestamp,
},
)
)

View file

@ -23,7 +23,7 @@ logger.setLevel(logging.DEBUG)
retry_backoff=True,
max_retries=1 if settings.DEBUG else None,
)
def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, force_route_id=None):
def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, force_route_id=None, received_at=None):
from apps.alerts.models import Alert, AlertReceiveChannel
alert_receive_channel = AlertReceiveChannel.objects_with_deleted.get(pk=alert_receive_channel_pk)
@ -46,6 +46,7 @@ def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, f
enable_autoresolve=False,
is_demo=is_demo,
force_route_id=force_route_id,
received_at=received_at,
)
except ConcurrentUpdateError:
# This error is raised when there are concurrent updates on AlertGroupCounter due to optimistic lock on it.
@ -83,6 +84,7 @@ def create_alert(
raw_request_data,
is_demo=False,
force_route_id=None,
received_at=None,
):
from apps.alerts.models import Alert, AlertReceiveChannel
@ -105,6 +107,7 @@ def create_alert(
raw_request_data=raw_request_data,
force_route_id=force_route_id,
is_demo=is_demo,
received_at=received_at,
)
logger.info(
f"Created alert alert_id={alert.pk} alert_group_id={alert.group.pk} channel_id={alert_receive_channel.pk}"
@ -123,6 +126,9 @@ def create_alert(
integration_unique_data,
raw_request_data,
),
kwargs={
"received_at": received_at,
},
countdown=countdown,
)
logger.warning(f"Retrying the task gracefully in {countdown} seconds due to ConcurrentUpdateError")

View file

@ -4,6 +4,7 @@ import pytest
from django.core.files.uploadedfile import SimpleUploadedFile
from django.db import OperationalError
from django.urls import reverse
from django.utils import timezone
from pytest_django.plugin import _DatabaseBlocker
from rest_framework import status
from rest_framework.test import APIClient
@ -98,7 +99,10 @@ def test_integration_universal_endpoint(
)
data = {"foo": "bar"}
response = client.post(url, data, format="json")
now = timezone.now()
with patch("django.utils.timezone.now") as mock_now:
mock_now.return_value = now
response = client.post(url, data, format="json")
assert response.status_code == status.HTTP_200_OK
mock_create_alert.apply_async.assert_called_once_with(
@ -111,6 +115,7 @@ def test_integration_universal_endpoint(
"alert_receive_channel_pk": alert_receive_channel.pk,
"integration_unique_data": None,
"raw_request_data": data,
"received_at": now.isoformat(),
},
)
@ -165,13 +170,16 @@ def test_integration_grafana_endpoint_has_alerts(
},
]
}
response = client.post(url, data, format="json")
now = timezone.now()
with patch("django.utils.timezone.now") as mock_now:
mock_now.return_value = now
response = client.post(url, data, format="json")
assert response.status_code == status.HTTP_200_OK
mock_create_alertmanager_alerts.apply_async.assert_has_calls(
[
call((alert_receive_channel.pk, data["alerts"][0])),
call((alert_receive_channel.pk, data["alerts"][1])),
call((alert_receive_channel.pk, data["alerts"][0]), kwargs={"received_at": now.isoformat()}),
call((alert_receive_channel.pk, data["alerts"][1]), kwargs={"received_at": now.isoformat()}),
]
)
@ -239,10 +247,13 @@ def test_integration_universal_endpoint_works_without_db(
# populate cache
AlertChannelDefiningMixin().update_alert_receive_channel_cache()
# disable DB access
with DatabaseBlocker().block():
data = {"foo": "bar"}
response = client.post(url, data, format="json")
now = timezone.now()
with patch("django.utils.timezone.now") as mock_now:
mock_now.return_value = now
# disable DB access
with DatabaseBlocker().block():
data = {"foo": "bar"}
response = client.post(url, data, format="json")
assert response.status_code == status.HTTP_200_OK
@ -256,6 +267,7 @@ def test_integration_universal_endpoint_works_without_db(
"alert_receive_channel_pk": alert_receive_channel.pk,
"integration_unique_data": None,
"raw_request_data": data,
"received_at": now.isoformat(),
},
)
@ -292,16 +304,19 @@ def test_integration_grafana_endpoint_without_db_has_alerts(
# populate cache
AlertChannelDefiningMixin().update_alert_receive_channel_cache()
# disable DB access
with DatabaseBlocker().block():
response = client.post(url, data, format="json")
now = timezone.now()
with patch("django.utils.timezone.now") as mock_now:
mock_now.return_value = now
# disable DB access
with DatabaseBlocker().block():
response = client.post(url, data, format="json")
assert response.status_code == status.HTTP_200_OK
mock_create_alertmanager_alerts.apply_async.assert_has_calls(
[
call((alert_receive_channel.pk, data["alerts"][0])),
call((alert_receive_channel.pk, data["alerts"][1])),
call((alert_receive_channel.pk, data["alerts"][0]), kwargs={"received_at": now.isoformat()}),
call((alert_receive_channel.pk, data["alerts"][1]), kwargs={"received_at": now.isoformat()}),
]
)
@ -339,7 +354,10 @@ def test_integration_universal_endpoint_works_without_cache(
kwargs={"integration_type": integration_type, "alert_channel_key": alert_receive_channel.token},
)
data = {"foo": "bar"}
response = client.post(url, data, format="json")
now = timezone.now()
with patch("django.utils.timezone.now") as mock_now:
mock_now.return_value = now
response = client.post(url, data, format="json")
assert response.status_code == status.HTTP_200_OK
@ -353,6 +371,7 @@ def test_integration_universal_endpoint_works_without_cache(
"alert_receive_channel_pk": alert_receive_channel.pk,
"integration_unique_data": None,
"raw_request_data": data,
"received_at": now.isoformat(),
},
)
@ -387,13 +406,16 @@ def test_integration_grafana_endpoint_without_cache_has_alerts(
},
]
}
response = client.post(url, data, format="json")
now = timezone.now()
with patch("django.utils.timezone.now") as mock_now:
mock_now.return_value = now
response = client.post(url, data, format="json")
assert response.status_code == status.HTTP_200_OK
mock_create_alertmanager_alerts.apply_async.assert_has_calls(
[
call((alert_receive_channel.pk, data["alerts"][0])),
call((alert_receive_channel.pk, data["alerts"][1])),
call((alert_receive_channel.pk, data["alerts"][0]), kwargs={"received_at": now.isoformat()}),
call((alert_receive_channel.pk, data["alerts"][1]), kwargs={"received_at": now.isoformat()}),
]
)

View file

@ -4,6 +4,7 @@ import logging
from django.conf import settings
from django.core.exceptions import PermissionDenied
from django.http import HttpResponseBadRequest, JsonResponse
from django.utils import timezone
from django.utils.decorators import method_decorator
from django.views.decorators.csrf import csrf_exempt
from django_sns_view.views import SNSEndpoint
@ -75,6 +76,7 @@ class AmazonSNS(BrowsableInstructionMixin, AlertChannelDefiningMixin, Integratio
link_to_upstream = None
raw_request_data = {"message": message}
timestamp = timezone.now().isoformat()
create_alert.apply_async(
[],
{
@ -85,6 +87,7 @@ class AmazonSNS(BrowsableInstructionMixin, AlertChannelDefiningMixin, Integratio
"alert_receive_channel_pk": alert_receive_channel.pk,
"integration_unique_data": None,
"raw_request_data": raw_request_data,
"received_at": timestamp,
},
)
@ -119,16 +122,19 @@ class AlertManagerAPIView(
"""
process_v1 creates alerts from each alert in incoming AlertManager payload.
"""
now = timezone.now()
for alert in request.data.get("alerts", []):
if settings.DEBUG:
create_alertmanager_alerts(alert_receive_channel.pk, alert)
create_alertmanager_alerts(alert_receive_channel.pk, alert, received_at=now.isoformat())
else:
self.execute_rate_limit_with_notification_logic()
if self.request.limited and not is_ratelimit_ignored(alert_receive_channel):
return self.get_ratelimit_http_response()
create_alertmanager_alerts.apply_async((alert_receive_channel.pk, alert))
create_alertmanager_alerts.apply_async(
(alert_receive_channel.pk, alert), kwargs={"received_at": now.isoformat()}
)
def process_v2(self, request, alert_receive_channel):
"""
@ -143,6 +149,7 @@ class AlertManagerAPIView(
num_resolved = len(list(filter(lambda a: a.get("status", "") == "resolved", alerts)))
data = {**request.data, "numFiring": num_firing, "numResolved": num_resolved}
timestamp = timezone.now().isoformat()
create_alert.apply_async(
[],
{
@ -153,6 +160,7 @@ class AlertManagerAPIView(
"alert_receive_channel_pk": alert_receive_channel.pk,
"integration_unique_data": None,
"raw_request_data": data,
"received_at": timestamp,
},
)
@ -191,16 +199,19 @@ class GrafanaAPIView(
# Grafana Alerting 9 has the same payload structure as AlertManager
if "alerts" in request.data:
now = timezone.now()
for alert in request.data.get("alerts", []):
if settings.DEBUG:
create_alertmanager_alerts(alert_receive_channel.pk, alert)
create_alertmanager_alerts(alert_receive_channel.pk, alert, received_at=now.isoformat())
else:
self.execute_rate_limit_with_notification_logic()
if self.request.limited and not is_ratelimit_ignored(alert_receive_channel):
return self.get_ratelimit_http_response()
create_alertmanager_alerts.apply_async((alert_receive_channel.pk, alert))
create_alertmanager_alerts.apply_async(
(alert_receive_channel.pk, alert), kwargs={"received_at": now.isoformat()}
)
return Response("Ok.")
"""
@ -254,6 +265,7 @@ class GrafanaAPIView(
"""
attachment = request.data["attachments"][0]
timestamp = timezone.now().isoformat()
create_alert.apply_async(
[],
{
@ -273,6 +285,7 @@ class GrafanaAPIView(
}
),
"raw_request_data": request.data,
"received_at": timestamp,
},
)
else:
@ -286,6 +299,7 @@ class GrafanaAPIView(
"alert_receive_channel_pk": alert_receive_channel.pk,
"integration_unique_data": json.dumps({"evalMatches": request.data.get("evalMatches", [])}),
"raw_request_data": request.data,
"received_at": timestamp,
},
)
return Response("Ok.")
@ -305,6 +319,7 @@ class UniversalAPIView(BrowsableInstructionMixin, AlertChannelDefiningMixin, Int
f"This url is for integration with {alert_receive_channel.config.title}."
f"Key is for {alert_receive_channel.get_integration_display()}"
)
timestamp = timezone.now().isoformat()
create_alert.apply_async(
[],
{
@ -315,6 +330,7 @@ class UniversalAPIView(BrowsableInstructionMixin, AlertChannelDefiningMixin, Int
"alert_receive_channel_pk": alert_receive_channel.pk,
"integration_unique_data": None,
"raw_request_data": request.data,
"received_at": timestamp,
},
)
return Response("Ok.")