diff --git a/engine/apps/alerts/grafana_alerting_sync_manager/grafana_alerting_sync.py b/engine/apps/alerts/grafana_alerting_sync_manager/grafana_alerting_sync.py index 7bfcbdef..a9ca08fb 100644 --- a/engine/apps/alerts/grafana_alerting_sync_manager/grafana_alerting_sync.py +++ b/engine/apps/alerts/grafana_alerting_sync_manager/grafana_alerting_sync.py @@ -5,7 +5,7 @@ from typing import Optional from django.apps import apps from rest_framework import status -from apps.alerts.tasks import create_contact_points_for_datasource +from apps.alerts.tasks import schedule_create_contact_points_for_datasource from apps.grafana_plugin.helpers import GrafanaAPIClient logger = logging.getLogger(__name__) @@ -77,16 +77,15 @@ class GrafanaAlertingSyncManager: # sync other datasource for datasource in datasources: if datasource["type"] == GrafanaAlertingSyncManager.ALERTING_DATASOURCE: - if self.create_contact_point(datasource) is None: + contact_point = self.create_contact_point(datasource) + if contact_point is None: # Failed to create contact point duo to getting wrong alerting config. It is expected behaviour. # Add datasource to list and retry to create contact point for it async datasources_to_create.append(datasource) if datasources_to_create: # create other contact points async - create_contact_points_for_datasource.apply_async( - (self.alert_receive_channel.pk, datasources_to_create), - ) + schedule_create_contact_points_for_datasource(self.alert_receive_channel.pk, datasources_to_create) else: self.alert_receive_channel.is_finished_alerting_setup = True self.alert_receive_channel.save(update_fields=["is_finished_alerting_setup"]) diff --git a/engine/apps/alerts/tasks/__init__.py b/engine/apps/alerts/tasks/__init__.py index 8e0e994f..3ff8501e 100644 --- a/engine/apps/alerts/tasks/__init__.py +++ b/engine/apps/alerts/tasks/__init__.py @@ -4,6 +4,7 @@ from .calculcate_escalation_finish_time import calculate_escalation_finish_time from .call_ack_url import call_ack_url # noqa: F401 from .check_escalation_finished import check_escalation_finished_task # noqa: F401 from .create_contact_points_for_datasource import create_contact_points_for_datasource # noqa: F401 +from .create_contact_points_for_datasource import schedule_create_contact_points_for_datasource # noqa: F401 from .custom_button_result import custom_button_result # noqa: F401 from .delete_alert_group import delete_alert_group # noqa: F401 from .distribute_alert import distribute_alert # noqa: F401 diff --git a/engine/apps/alerts/tasks/create_contact_points_for_datasource.py b/engine/apps/alerts/tasks/create_contact_points_for_datasource.py index f3dc3f4b..a447a39c 100644 --- a/engine/apps/alerts/tasks/create_contact_points_for_datasource.py +++ b/engine/apps/alerts/tasks/create_contact_points_for_datasource.py @@ -1,9 +1,32 @@ +import logging + +from celery.utils.log import get_task_logger from django.apps import apps +from django.core.cache import cache from rest_framework import status from apps.grafana_plugin.helpers import GrafanaAPIClient from common.custom_celery_tasks import shared_dedicated_queue_retry_task +logger = get_task_logger(__name__) +logger.setLevel(logging.DEBUG) + + +def get_cache_key_create_contact_points_for_datasource(alert_receive_channel_id): + CACHE_KEY_PREFIX = "create_contact_points_for_datasource" + return f"{CACHE_KEY_PREFIX}_{alert_receive_channel_id}" + + +@shared_dedicated_queue_retry_task +def schedule_create_contact_points_for_datasource(alert_receive_channel_id, datasource_list): + CACHE_LIFETIME = 600 + START_TASK_DELAY = 3 + task = create_contact_points_for_datasource.apply_async( + args=[alert_receive_channel_id, datasource_list], countdown=START_TASK_DELAY + ) + cache_key = get_cache_key_create_contact_points_for_datasource(alert_receive_channel_id) + cache.set(cache_key, task.id, timeout=CACHE_LIFETIME) + @shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=10) def create_contact_points_for_datasource(alert_receive_channel_id, datasource_list): @@ -11,6 +34,11 @@ def create_contact_points_for_datasource(alert_receive_channel_id, datasource_li Try to create contact points for other datasource. Restart task for datasource, for which contact point was not created. """ + cache_key = get_cache_key_create_contact_points_for_datasource(alert_receive_channel_id) + cached_task_id = cache.get(cache_key) + current_task_id = create_contact_points_for_datasource.request.id + if cached_task_id is not None and current_task_id != cached_task_id: + return AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel") @@ -21,7 +49,7 @@ def create_contact_points_for_datasource(alert_receive_channel_id, datasource_li api_token=alert_receive_channel.organization.api_token, ) # list of datasource for which contact point creation was failed - datasource_to_create = [] + datasources_to_create = [] for datasource in datasource_list: contact_point = None config, response_info = client.get_alerting_config(datasource["id"]) @@ -29,16 +57,22 @@ def create_contact_points_for_datasource(alert_receive_channel_id, datasource_li if response_info.get("status_code") == status.HTTP_404_NOT_FOUND: client.get_alertmanager_status_with_config(datasource["id"]) contact_point = alert_receive_channel.grafana_alerting_sync_manager.create_contact_point(datasource) + elif response_info.get("status_code") == status.HTTP_400_BAD_REQUEST: + logger.warning( + f"Failed to create contact point for integration {alert_receive_channel_id}, " + f"datasource info: {datasource}; response: {response_info}" + ) + continue else: contact_point = alert_receive_channel.grafana_alerting_sync_manager.create_contact_point(datasource) if contact_point is None: # Failed to create contact point duo to getting wrong alerting config. # Add datasource to list and retry to create contact point for it again - datasource_to_create.append(datasource) + datasources_to_create.append(datasource) # if some contact points were not created, restart task for them - if datasource_to_create: - create_contact_points_for_datasource.apply_async((alert_receive_channel_id, datasource_to_create), countdown=5) + if datasources_to_create: + schedule_create_contact_points_for_datasource(alert_receive_channel_id, datasources_to_create) else: alert_receive_channel.is_finished_alerting_setup = True alert_receive_channel.save(update_fields=["is_finished_alerting_setup"])