Use periodic task for heartbeats (#2723)

# What this PR does

## Which issue(s) this PR fixes

## Checklist

- [ ] Unit, integration, and e2e (if applicable) tests updated
- [ ] Documentation added (or `pr:no public docs` PR label added if not
required)
- [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not
required)

---------

Co-authored-by: Joey Orlando <joey.orlando@grafana.com>
Co-authored-by: Michael Derynck <michael.derynck@grafana.com>
This commit is contained in:
Ildar Iskhakov 2023-08-10 10:25:00 +08:00 committed by GitHub
parent 638c9a3142
commit fd19dd422a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 239 additions and 230 deletions

View file

@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add stack slug to organization options for direct paging Slash command by @vadimkerr ([#2743](https://github.com/grafana/oncall/pull/2743)) - Add stack slug to organization options for direct paging Slash command by @vadimkerr ([#2743](https://github.com/grafana/oncall/pull/2743))
- Avoid creating (or notifying about) potential event splits resulting from untaken swap requests ([#2748](https://github.com/grafana/oncall/pull/2748)) - Avoid creating (or notifying about) potential event splits resulting from untaken swap requests ([#2748](https://github.com/grafana/oncall/pull/2748))
- Refactor heartbeats into a periodic task ([2723](https://github.com/grafana/oncall/pull/2723))
### Fixed ### Fixed

View file

@ -4,10 +4,9 @@ from urllib.parse import urljoin
from django.conf import settings from django.conf import settings
from django.core.validators import MinLengthValidator from django.core.validators import MinLengthValidator
from django.db import models, transaction from django.db import models
from django.utils import timezone from django.utils import timezone
from apps.integrations.tasks import create_alert
from common.public_primary_keys import generate_public_primary_key, increase_public_primary_key_length from common.public_primary_keys import generate_public_primary_key, increase_public_primary_key_length
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -43,10 +42,26 @@ class IntegrationHeartBeat(models.Model):
created_at = models.DateTimeField(auto_now_add=True) created_at = models.DateTimeField(auto_now_add=True)
timeout_seconds = models.IntegerField(default=0) timeout_seconds = models.IntegerField(default=0)
last_heartbeat_time = models.DateTimeField(default=None, null=True) last_heartbeat_time = models.DateTimeField(default=None, null=True)
"""
Stores the latest received heartbeat signal time
"""
last_checkup_task_time = models.DateTimeField(default=None, null=True) last_checkup_task_time = models.DateTimeField(default=None, null=True)
"""
Deprecated. This field is not used. TODO: remove it
"""
actual_check_up_task_id = models.CharField(max_length=100) actual_check_up_task_id = models.CharField(max_length=100)
"""
Deprecated. Stored the latest scheduled `integration_heartbeat_checkup` task id. TODO: remove it
"""
previous_alerted_state_was_life = models.BooleanField(default=True) previous_alerted_state_was_life = models.BooleanField(default=True)
"""
Last status of the heartbeat. Determines if integration was alive on latest checkup
"""
public_primary_key = models.CharField( public_primary_key = models.CharField(
max_length=20, max_length=20,
@ -83,73 +98,6 @@ class IntegrationHeartBeat(models.Model):
def link(self) -> str: def link(self) -> str:
return urljoin(self.alert_receive_channel.integration_url, "heartbeat/") return urljoin(self.alert_receive_channel.integration_url, "heartbeat/")
@classmethod
def perform_heartbeat_check(cls, heartbeat_id: int, task_request_id: str) -> None:
with transaction.atomic():
heartbeats = cls.objects.filter(pk=heartbeat_id).select_for_update()
if len(heartbeats) == 0:
logger.info(f"Heartbeat {heartbeat_id} not found {task_request_id}")
return
heartbeat = heartbeats[0]
if task_request_id == heartbeat.actual_check_up_task_id:
heartbeat.check_heartbeat_state_and_save()
else:
logger.info(f"Heartbeat {heartbeat_id} is not actual {task_request_id}")
def check_heartbeat_state_and_save(self) -> bool:
"""
Use this method if you want just check heartbeat status.
"""
state_changed = self.check_heartbeat_state()
if state_changed:
self.save(update_fields=["previous_alerted_state_was_life"])
return state_changed
def check_heartbeat_state(self) -> bool:
"""
Actually checking heartbeat.
Use this method if you want to do changes of heartbeat instance while checking its status.
( See IntegrationHeartBeatAPIView.post() for example )
"""
state_changed = False
if self.is_expired:
if self.previous_alerted_state_was_life:
self.on_heartbeat_expired()
self.previous_alerted_state_was_life = False
state_changed = True
else:
if not self.previous_alerted_state_was_life:
self.on_heartbeat_restored()
self.previous_alerted_state_was_life = True
state_changed = True
return state_changed
def on_heartbeat_restored(self) -> None:
create_alert.apply_async(
kwargs={
"title": self.alert_receive_channel.heartbeat_restored_title,
"message": self.alert_receive_channel.heartbeat_restored_message,
"image_url": None,
"link_to_upstream_details": None,
"alert_receive_channel_pk": self.alert_receive_channel.pk,
"integration_unique_data": {},
"raw_request_data": self.alert_receive_channel.heartbeat_restored_payload,
},
)
def on_heartbeat_expired(self) -> None:
create_alert.apply_async(
kwargs={
"title": self.alert_receive_channel.heartbeat_expired_title,
"message": self.alert_receive_channel.heartbeat_expired_message,
"image_url": None,
"link_to_upstream_details": None,
"alert_receive_channel_pk": self.alert_receive_channel.pk,
"integration_unique_data": {},
"raw_request_data": self.alert_receive_channel.heartbeat_expired_payload,
},
)
# Insight logs # Insight logs
@property @property
def insight_logs_type_verbal(self) -> str: def insight_logs_type_verbal(self) -> str:

View file

@ -1,57 +1,105 @@
from time import perf_counter import datetime
from celery.utils.log import get_task_logger from celery.utils.log import get_task_logger
from django.conf import settings
from django.db import transaction from django.db import transaction
from django.db.models import DateTimeField, DurationField, ExpressionWrapper, F
from django.db.models.functions import Cast
from django.utils import timezone from django.utils import timezone
from apps.heartbeat.models import IntegrationHeartBeat
from apps.integrations.tasks import create_alert
from common.custom_celery_tasks import shared_dedicated_queue_retry_task from common.custom_celery_tasks import shared_dedicated_queue_retry_task
from settings.base import DatabaseTypes
logger = get_task_logger(__name__) logger = get_task_logger(__name__)
@shared_dedicated_queue_retry_task() @shared_dedicated_queue_retry_task()
def integration_heartbeat_checkup(heartbeat_id: int) -> None: def check_heartbeats() -> str:
from apps.heartbeat.models import IntegrationHeartBeat """
Periodic task to check heartbeats status change and create alerts (or auto-resolve alerts) if needed
"""
# Heartbeat is considered enabled if it
# * has timeout_seconds set to non-zero (non-default) value,
# * received at least one checkup (last_heartbeat_time set to non-null value)\
IntegrationHeartBeat.perform_heartbeat_check(heartbeat_id, integration_heartbeat_checkup.request.id) def _get_timeout_expression() -> ExpressionWrapper:
if settings.DATABASES["default"]["ENGINE"] == f"django.db.backends.{DatabaseTypes.POSTGRESQL}":
# DurationField: When used on PostgreSQL, the data type used is an interval
# https://docs.djangoproject.com/en/3.2/ref/models/fields/#durationfield
return ExpressionWrapper(datetime.timedelta(seconds=1) * F("timeout_seconds"), output_field=DurationField())
else:
# DurationField: ...Otherwise a bigint of microseconds is used...
# microseconds = seconds * 10**6
# https://docs.djangoproject.com/en/3.2/ref/models/fields/#durationfield
return ExpressionWrapper(F("timeout_seconds") * 10**6, output_field=DurationField())
enabled_heartbeats = (
IntegrationHeartBeat.objects.filter(last_heartbeat_time__isnull=False)
.exclude(timeout_seconds=0)
.annotate(period_start=(Cast(timezone.now() - _get_timeout_expression(), DateTimeField())))
)
with transaction.atomic():
# Heartbeat is considered expired if it
# * is enabled,
# * is not already expired,
# * last check in was before the timeout period start
expired_heartbeats = enabled_heartbeats.select_for_update().filter(
last_heartbeat_time__lte=F("period_start"), previous_alerted_state_was_life=True
)
# Schedule alert creation for each expired heartbeat after transaction commit
for heartbeat in expired_heartbeats:
transaction.on_commit(
lambda: create_alert.apply_async(
kwargs={
"title": heartbeat.alert_receive_channel.heartbeat_expired_title,
"message": heartbeat.alert_receive_channel.heartbeat_expired_message,
"image_url": None,
"link_to_upstream_details": None,
"alert_receive_channel_pk": heartbeat.alert_receive_channel.pk,
"integration_unique_data": {},
"raw_request_data": heartbeat.alert_receive_channel.heartbeat_expired_payload,
},
)
)
# Update previous_alerted_state_was_life to False
expired_count = expired_heartbeats.update(previous_alerted_state_was_life=False)
with transaction.atomic():
# Heartbeat is considered restored if it
# * is enabled,
# * last check in was after the timeout period start,
# * was is alerted state (previous_alerted_state_was_life is False), i.e. was expired
restored_heartbeats = enabled_heartbeats.select_for_update().filter(
last_heartbeat_time__gte=F("period_start"), previous_alerted_state_was_life=False
)
# Schedule auto-resolve alert creation for each expired heartbeat after transaction commit
for heartbeat in restored_heartbeats:
transaction.on_commit(
lambda: create_alert.apply_async(
kwargs={
"title": heartbeat.alert_receive_channel.heartbeat_restored_title,
"message": heartbeat.alert_receive_channel.heartbeat_restored_message,
"image_url": None,
"link_to_upstream_details": None,
"alert_receive_channel_pk": heartbeat.alert_receive_channel.pk,
"integration_unique_data": {},
"raw_request_data": heartbeat.alert_receive_channel.heartbeat_restored_payload,
},
)
)
restored_count = restored_heartbeats.update(previous_alerted_state_was_life=True)
return f"Found {expired_count} expired and {restored_count} restored heartbeats"
@shared_dedicated_queue_retry_task()
def integration_heartbeat_checkup(heartbeat_id: int) -> None:
"""Deprecated. TODO: Remove this task after this task cleared from queue"""
pass
@shared_dedicated_queue_retry_task() @shared_dedicated_queue_retry_task()
def process_heartbeat_task(alert_receive_channel_pk): def process_heartbeat_task(alert_receive_channel_pk):
start = perf_counter() IntegrationHeartBeat.objects.filter(
from apps.heartbeat.models import IntegrationHeartBeat alert_receive_channel__pk=alert_receive_channel_pk,
).update(last_heartbeat_time=timezone.now())
with transaction.atomic():
heartbeats = IntegrationHeartBeat.objects.filter(
alert_receive_channel__pk=alert_receive_channel_pk,
).select_for_update()
if len(heartbeats) == 0:
logger.info(f"Integration Heartbeat for alert_receive_channel {alert_receive_channel_pk} was not found.")
return
else:
heartbeat = heartbeats[0]
heartbeat_selected = perf_counter()
logger.info(
f"IntegrationHeartBeat selected for alert_receive_channel {alert_receive_channel_pk} in {heartbeat_selected - start}"
)
task = integration_heartbeat_checkup.apply_async(
(heartbeat.pk,),
countdown=heartbeat.timeout_seconds + 1,
)
is_touched = heartbeat.last_heartbeat_time is not None
heartbeat.actual_check_up_task_id = task.id
heartbeat.last_heartbeat_time = timezone.now()
update_fields = ["actual_check_up_task_id", "last_heartbeat_time"]
task_started = perf_counter()
logger.info(
f"heartbeat_checkup task started for alert_receive_channel {alert_receive_channel_pk} in {task_started - start}"
)
if is_touched:
state_changed = heartbeat.check_heartbeat_state()
state_checked = perf_counter()
logger.info(
f"state checked for alert_receive_channel {alert_receive_channel_pk} in {state_checked - start}"
)
if state_changed:
update_fields.append("previous_alerted_state_was_life")
heartbeat.save(update_fields=update_fields)

View file

@ -4,83 +4,77 @@ import pytest
from django.utils import timezone from django.utils import timezone
from apps.alerts.models import AlertReceiveChannel from apps.alerts.models import AlertReceiveChannel
from apps.heartbeat.tasks import check_heartbeats
from apps.integrations.tasks import create_alert
@pytest.mark.django_db @pytest.mark.django_db
@patch("apps.heartbeat.models.IntegrationHeartBeat.on_heartbeat_expired", return_value=None)
@pytest.mark.parametrize("integration", [AlertReceiveChannel.INTEGRATION_FORMATTED_WEBHOOK]) @pytest.mark.parametrize("integration", [AlertReceiveChannel.INTEGRATION_FORMATTED_WEBHOOK])
def test_integration_heartbeat_expired( def test_check_heartbeats(
mocked_handler, make_organization_and_user, make_alert_receive_channel, make_integration_heartbeat, integration make_organization_and_user,
make_alert_receive_channel,
make_integration_heartbeat,
integration,
django_capture_on_commit_callbacks,
): ):
# No heartbeats, nothing happens
with patch.object(create_alert, "apply_async") as mock_create_alert_apply_async:
with django_capture_on_commit_callbacks(execute=True):
result = check_heartbeats()
assert result == "Found 0 expired and 0 restored heartbeats"
assert mock_create_alert_apply_async.call_count == 0
# Prepare heartbeat
team, _ = make_organization_and_user() team, _ = make_organization_and_user()
# Some short timeout and last_heartbeat_time to make sure that heartbeat is expired timeout = 60
timeout = 1
last_heartbeat_time = timezone.now() - timezone.timedelta(seconds=timeout * 10)
alert_receive_channel = make_alert_receive_channel(team, integration=integration)
integration_heartbeat = make_integration_heartbeat(
alert_receive_channel, timeout, last_heartbeat_time=last_heartbeat_time
)
integration_heartbeat.check_heartbeat_state_and_save()
assert mocked_handler.called
@pytest.mark.django_db
@patch("apps.heartbeat.models.IntegrationHeartBeat.on_heartbeat_expired", return_value=None)
@pytest.mark.parametrize("integration", [AlertReceiveChannel.INTEGRATION_FORMATTED_WEBHOOK])
def test_integration_heartbeat_already_expired(
mocked_handler, make_organization_and_user, make_alert_receive_channel, make_integration_heartbeat, integration
):
team, _ = make_organization_and_user()
# Some short timeout and last_heartbeat_time to make sure that heartbeat is expired
timeout = 1
last_heartbeat_time = timezone.now() - timezone.timedelta(seconds=timeout * 10)
alert_receive_channel = make_alert_receive_channel(team, integration=integration)
integration_heartbeat = make_integration_heartbeat(
alert_receive_channel,
timeout,
last_heartbeat_time=last_heartbeat_time,
previous_alerted_state_was_life=False,
)
integration_heartbeat.check_heartbeat_state_and_save()
assert mocked_handler.called is False
@pytest.mark.django_db
@patch("apps.heartbeat.models.IntegrationHeartBeat.on_heartbeat_restored", return_value=None)
@pytest.mark.parametrize("integration", [AlertReceiveChannel.INTEGRATION_FORMATTED_WEBHOOK])
def test_integration_heartbeat_restored(
mocked_handler, make_organization_and_user, make_alert_receive_channel, make_integration_heartbeat, integration
):
team, _ = make_organization_and_user()
# Some long timeout and last_heartbeat_time to make sure that heartbeat is not expired
timeout = 1000
last_heartbeat_time = timezone.now() last_heartbeat_time = timezone.now()
alert_receive_channel = make_alert_receive_channel(team, integration=integration) alert_receive_channel = make_alert_receive_channel(team, integration=integration)
integration_heartbeat = make_integration_heartbeat( integration_heartbeat = make_integration_heartbeat(
alert_receive_channel, alert_receive_channel, timeout, last_heartbeat_time=last_heartbeat_time, previous_alerted_state_was_life=True
timeout,
last_heartbeat_time=last_heartbeat_time,
previous_alerted_state_was_life=False,
) )
integration_heartbeat.check_heartbeat_state_and_save()
assert mocked_handler.called
# Heartbeat is alive, nothing happens
with patch.object(create_alert, "apply_async") as mock_create_alert_apply_async:
with django_capture_on_commit_callbacks(execute=True):
result = check_heartbeats()
assert result == "Found 0 expired and 0 restored heartbeats"
assert mock_create_alert_apply_async.call_count == 0
@pytest.mark.django_db # Hearbeat expires, send an alert
@patch("apps.heartbeat.models.IntegrationHeartBeat.on_heartbeat_restored", return_value=None) integration_heartbeat.refresh_from_db()
@pytest.mark.parametrize("integration", [AlertReceiveChannel.INTEGRATION_FORMATTED_WEBHOOK]) integration_heartbeat.last_heartbeat_time = timezone.now() - timezone.timedelta(seconds=timeout * 10)
def test_integration_heartbeat_restored_and_alert_was_not_sent( integration_heartbeat.save()
mocked_handler, make_organization_and_user, make_alert_receive_channel, make_integration_heartbeat, integration with patch.object(create_alert, "apply_async") as mock_create_alert_apply_async:
): with django_capture_on_commit_callbacks(execute=True):
team, _ = make_organization_and_user() result = check_heartbeats()
# Some long timeout and last_heartbeat_time to make sure that heartbeat is not expired assert result == "Found 1 expired and 0 restored heartbeats"
timeout = 1000 assert mock_create_alert_apply_async.call_count == 1
last_heartbeat_time = timezone.now()
alert_receive_channel = make_alert_receive_channel(team, integration=integration) # Heartbeat is still expired, nothing happens
integration_heartbeat = make_integration_heartbeat( integration_heartbeat.refresh_from_db()
alert_receive_channel, with patch.object(create_alert, "apply_async") as mock_create_alert_apply_async:
timeout, with django_capture_on_commit_callbacks(execute=True):
last_heartbeat_time=last_heartbeat_time, result = check_heartbeats()
) assert result == "Found 0 expired and 0 restored heartbeats"
integration_heartbeat.check_heartbeat_state_and_save() assert mock_create_alert_apply_async.call_count == 0
assert mocked_handler.called is False
# Hearbeat restored, send an auto-resolve alert
integration_heartbeat.refresh_from_db()
integration_heartbeat.last_heartbeat_time = timezone.now()
integration_heartbeat.save()
with patch.object(create_alert, "apply_async") as mock_create_alert_apply_async:
with django_capture_on_commit_callbacks(execute=True):
result = check_heartbeats()
assert result == "Found 0 expired and 1 restored heartbeats"
assert mock_create_alert_apply_async.call_count == 1
# Heartbeat is alive, nothing happens
integration_heartbeat.refresh_from_db()
integration_heartbeat.last_heartbeat_time = timezone.now()
integration_heartbeat.save()
integration_heartbeat.refresh_from_db()
with patch.object(create_alert, "apply_async") as mock_create_alert_apply_async:
with django_capture_on_commit_callbacks(execute=True):
result = check_heartbeats()
assert result == "Found 0 expired and 0 restored heartbeats"
assert mock_create_alert_apply_async.call_count == 0

View file

@ -12,12 +12,12 @@ heartbeat_expired_message = heartbeat_text.heartbeat_expired_message
heartbeat_expired_payload = { heartbeat_expired_payload = {
"alert_uid": "0eaf37c8-e1eb-4714-b79e-7c648b6a96fa", "alert_uid": "0eaf37c8-e1eb-4714-b79e-7c648b6a96fa",
"title": heartbeat_expired_title, "title": heartbeat_expired_title,
"image_url": None,
"state": "alerting", "state": "alerting",
"link_to_upstream_details": None,
"message": heartbeat_expired_message, "message": heartbeat_expired_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": False, "is_oncall_heartbeat_restored": False,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": False, # Keep for backwards compatibility
} }
heartbeat_restored_title = heartbeat_text.heartbeat_restored_title heartbeat_restored_title = heartbeat_text.heartbeat_restored_title
@ -26,10 +26,10 @@ heartbeat_restored_message = heartbeat_text.heartbeat_restored_message
heartbeat_restored_payload = { heartbeat_restored_payload = {
"alert_uid": "0eaf37c8-e1eb-4714-b79e-7c648b6a96fa", "alert_uid": "0eaf37c8-e1eb-4714-b79e-7c648b6a96fa",
"title": heartbeat_restored_title, "title": heartbeat_restored_title,
"image_url": None,
"state": "ok", "state": "ok",
"link_to_upstream_details": None,
"message": heartbeat_restored_message, "message": heartbeat_restored_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": True, "is_oncall_heartbeat_restored": True,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": True, # Keep for backwards compatibility
} }

View file

@ -17,8 +17,10 @@ heartbeat_expired_payload = {
"state": "alerting", "state": "alerting",
"link_to_upstream_details": None, "link_to_upstream_details": None,
"message": heartbeat_expired_message, "message": heartbeat_expired_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": False, "is_oncall_heartbeat_restored": False,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": False, # Keep for backwards compatibility
} }
heartbeat_restored_title = heartbeat_text.heartbeat_restored_title heartbeat_restored_title = heartbeat_text.heartbeat_restored_title
@ -31,6 +33,8 @@ heartbeat_restored_payload = {
"state": "ok", "state": "ok",
"link_to_upstream_details": None, "link_to_upstream_details": None,
"message": heartbeat_restored_message, "message": heartbeat_restored_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": True, "is_oncall_heartbeat_restored": True,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": True, # Keep for backwards compatibility
} }

View file

@ -14,8 +14,10 @@ heartbeat_expired_payload = {
"state": "alerting", "state": "alerting",
"title": heartbeat_expired_title, "title": heartbeat_expired_title,
"message": heartbeat_expired_message, "message": heartbeat_expired_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": False, "is_oncall_heartbeat_restored": False,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": False, # Keep for backwards compatibility
} }
heartbeat_restored_title = f"[OK] {heartbeat_text.heartbeat_restored_title}" heartbeat_restored_title = f"[OK] {heartbeat_text.heartbeat_restored_title}"
@ -25,6 +27,8 @@ heartbeat_restored_payload = {
"state": "ok", "state": "ok",
"title": heartbeat_restored_title, "title": heartbeat_restored_title,
"message": heartbeat_restored_message, "message": heartbeat_restored_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": True, "is_oncall_heartbeat_restored": True,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": True, # Keep for backwards compatibility
} }

View file

@ -17,8 +17,10 @@ heartbeat_expired_payload = {
"state": "alerting", "state": "alerting",
"link_to_upstream_details": None, "link_to_upstream_details": None,
"message": heartbeat_expired_message, "message": heartbeat_expired_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": False, "is_oncall_heartbeat_restored": False,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": False, # Keep for backwards compatibility
} }
heartbeat_restored_title = heartbeat_text.heartbeat_restored_title heartbeat_restored_title = heartbeat_text.heartbeat_restored_title
@ -31,6 +33,8 @@ heartbeat_restored_payload = {
"state": "ok", "state": "ok",
"link_to_upstream_details": None, "link_to_upstream_details": None,
"message": heartbeat_restored_message, "message": heartbeat_restored_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": True, "is_oncall_heartbeat_restored": True,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": True, # Keep for backwards compatibility
} }

View file

@ -13,12 +13,12 @@ heartbeat_expired_message = heartbeat_text.heartbeat_expired_message
heartbeat_expired_payload = { heartbeat_expired_payload = {
"alert_uid": "7973c835-ff3f-46e4-9444-06df127b6f8e", "alert_uid": "7973c835-ff3f-46e4-9444-06df127b6f8e",
"title": heartbeat_expired_title, "title": heartbeat_expired_title,
"image_url": None,
"state": "alerting", "state": "alerting",
"link_to_upstream_details": None,
"message": heartbeat_expired_message, "message": heartbeat_expired_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": False, "is_oncall_heartbeat_restored": False,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": False, # Keep for backwards compatibility
} }
heartbeat_restored_title = heartbeat_text.heartbeat_restored_title heartbeat_restored_title = heartbeat_text.heartbeat_restored_title
@ -31,6 +31,8 @@ heartbeat_restored_payload = {
"state": "ok", "state": "ok",
"link_to_upstream_details": None, "link_to_upstream_details": None,
"message": heartbeat_restored_message, "message": heartbeat_restored_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": True, "is_oncall_heartbeat_restored": True,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": True, # Keep for backwards compatibility
} }

View file

@ -16,8 +16,10 @@ heartbeat_expired_payload = {
"state": "alerting", "state": "alerting",
"link_to_upstream_details": None, "link_to_upstream_details": None,
"message": heartbeat_expired_message, "message": heartbeat_expired_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": False, "is_oncall_heartbeat_restored": False,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": False, # Keep for backwards compatibility
} }
heartbeat_restored_title = heartbeat_text.heartbeat_restored_title heartbeat_restored_title = heartbeat_text.heartbeat_restored_title
@ -30,6 +32,8 @@ heartbeat_restored_payload = {
"state": "ok", "state": "ok",
"link_to_upstream_details": None, "link_to_upstream_details": None,
"message": heartbeat_restored_message, "message": heartbeat_restored_message,
"is_amixr_heartbeat": True, "is_oncall_heartbeat": True,
"is_amixr_heartbeat_restored": True, "is_oncall_heartbeat_restored": True,
"is_amixr_heartbeat": True, # Keep for backwards compatibility
"is_amixr_heartbeat_restored": True, # Keep for backwards compatibility
} }

View file

@ -96,5 +96,3 @@ def test_ratelimit_integration_heartbeats(
response = c.get(url) response = c.get(url)
assert response.status_code == 429 assert response.status_code == 429
assert mocked_task.call_count == 1

View file

@ -3,6 +3,7 @@ import logging
from django.conf import settings from django.conf import settings
from django.core.exceptions import PermissionDenied from django.core.exceptions import PermissionDenied
from django.db import OperationalError
from django.http import HttpResponseBadRequest, JsonResponse from django.http import HttpResponseBadRequest, JsonResponse
from django.utils.decorators import method_decorator from django.utils.decorators import method_decorator
from django.views.decorators.csrf import csrf_exempt from django.views.decorators.csrf import csrf_exempt
@ -324,6 +325,10 @@ class IntegrationHeartBeatAPIView(AlertChannelDefiningMixin, IntegrationHeartBea
return Response(status=200) return Response(status=200)
def _process_heartbeat_signal(self, request, alert_receive_channel): def _process_heartbeat_signal(self, request, alert_receive_channel):
process_heartbeat_task.apply_async( try:
(alert_receive_channel.pk,), process_heartbeat_task(alert_receive_channel.pk)
) # If database is not ready, fallback to celery task
except OperationalError:
process_heartbeat_task.apply_async(
(alert_receive_channel.pk,),
)

View file

@ -46,14 +46,7 @@ source_link = None
grouping_id = '{{ payload.get("alert_uid", "")}}' grouping_id = '{{ payload.get("alert_uid", "")}}'
resolve_condition = """\ resolve_condition = """{{ payload.get("state", "").upper() == "OK" }}"""
{%- if "is_amixr_heartbeat_restored" in payload -%}
{# We don't know the payload format from your integration. #}
{# The heartbeat alerts will go here so we check for our own key #}
{{ payload["is_amixr_heartbeat_restored"] }}
{%- else -%}
{{ payload.get("state", "").upper() == "OK" }}
{%- endif %}"""
acknowledge_condition = None acknowledge_condition = None

View file

@ -45,16 +45,15 @@ telegram_image_url = slack_image_url
source_link = "{{ payload.url }}" source_link = "{{ payload.url }}"
grouping_id = "{{ payload }}" grouping_id = """\
{% if "is_oncall_heartbeat" in payload %}
{# Case for heartbeat alerts generated by Grafana OnCall #}
{{- payload.alert_uid }}
{% else %}
{{- payload }}
{% endif %}"""
resolve_condition = """\ resolve_condition = """{{ payload.get("state", "").upper() == "OK" }}"""
{%- if "is_amixr_heartbeat_restored" in payload -%}
{# We don't know the payload format from your integration. #}
{# The heartbeat alerts will go here so we check for our own key #}
{{ payload["is_amixr_heartbeat_restored"] }}
{%- else -%}
{{ payload.get("state", "").upper() == "OK" }}
{%- endif %}"""
acknowledge_condition = None acknowledge_condition = None
example_payload = {"message": "This alert was sent by user for demonstration purposes"} example_payload = {"message": "This alert was sent by user for demonstration purposes"}

View file

@ -488,6 +488,11 @@ CELERY_BEAT_SCHEDULE = {
"schedule": 60 * 30, "schedule": 60 * 30,
"args": (), "args": (),
}, },
"check_heartbeats": {
"task": "apps.heartbeat.tasks.check_heartbeats",
"schedule": crontab(minute="*/2"), # every 2 minutes
"args": (),
},
} }
if ESCALATION_AUDITOR_ENABLED: if ESCALATION_AUDITOR_ENABLED: