Fix too many tasks being created for create contact points (#12)

Fix too many tasks being created for create_contact_points_for_datasource task

Co-authored-by: Julia <ferril.darkdiver@gmail.com>
This commit is contained in:
Michael Derynck 2022-06-06 12:04:31 -06:00 committed by GitHub
parent dbaffd86f5
commit 583835f8d4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 43 additions and 9 deletions

View file

@ -5,7 +5,7 @@ from typing import Optional
from django.apps import apps
from rest_framework import status
from apps.alerts.tasks import create_contact_points_for_datasource
from apps.alerts.tasks import schedule_create_contact_points_for_datasource
from apps.grafana_plugin.helpers import GrafanaAPIClient
logger = logging.getLogger(__name__)
@ -77,16 +77,15 @@ class GrafanaAlertingSyncManager:
# sync other datasource
for datasource in datasources:
if datasource["type"] == GrafanaAlertingSyncManager.ALERTING_DATASOURCE:
if self.create_contact_point(datasource) is None:
contact_point = self.create_contact_point(datasource)
if contact_point is None:
# Failed to create contact point duo to getting wrong alerting config. It is expected behaviour.
# Add datasource to list and retry to create contact point for it async
datasources_to_create.append(datasource)
if datasources_to_create:
# create other contact points async
create_contact_points_for_datasource.apply_async(
(self.alert_receive_channel.pk, datasources_to_create),
)
schedule_create_contact_points_for_datasource(self.alert_receive_channel.pk, datasources_to_create)
else:
self.alert_receive_channel.is_finished_alerting_setup = True
self.alert_receive_channel.save(update_fields=["is_finished_alerting_setup"])

View file

@ -4,6 +4,7 @@ from .calculcate_escalation_finish_time import calculate_escalation_finish_time
from .call_ack_url import call_ack_url # noqa: F401
from .check_escalation_finished import check_escalation_finished_task # noqa: F401
from .create_contact_points_for_datasource import create_contact_points_for_datasource # noqa: F401
from .create_contact_points_for_datasource import schedule_create_contact_points_for_datasource # noqa: F401
from .custom_button_result import custom_button_result # noqa: F401
from .delete_alert_group import delete_alert_group # noqa: F401
from .distribute_alert import distribute_alert # noqa: F401

View file

@ -1,9 +1,32 @@
import logging
from celery.utils.log import get_task_logger
from django.apps import apps
from django.core.cache import cache
from rest_framework import status
from apps.grafana_plugin.helpers import GrafanaAPIClient
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
logger = get_task_logger(__name__)
logger.setLevel(logging.DEBUG)
def get_cache_key_create_contact_points_for_datasource(alert_receive_channel_id):
CACHE_KEY_PREFIX = "create_contact_points_for_datasource"
return f"{CACHE_KEY_PREFIX}_{alert_receive_channel_id}"
@shared_dedicated_queue_retry_task
def schedule_create_contact_points_for_datasource(alert_receive_channel_id, datasource_list):
CACHE_LIFETIME = 600
START_TASK_DELAY = 3
task = create_contact_points_for_datasource.apply_async(
args=[alert_receive_channel_id, datasource_list], countdown=START_TASK_DELAY
)
cache_key = get_cache_key_create_contact_points_for_datasource(alert_receive_channel_id)
cache.set(cache_key, task.id, timeout=CACHE_LIFETIME)
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=10)
def create_contact_points_for_datasource(alert_receive_channel_id, datasource_list):
@ -11,6 +34,11 @@ def create_contact_points_for_datasource(alert_receive_channel_id, datasource_li
Try to create contact points for other datasource.
Restart task for datasource, for which contact point was not created.
"""
cache_key = get_cache_key_create_contact_points_for_datasource(alert_receive_channel_id)
cached_task_id = cache.get(cache_key)
current_task_id = create_contact_points_for_datasource.request.id
if cached_task_id is not None and current_task_id != cached_task_id:
return
AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel")
@ -21,7 +49,7 @@ def create_contact_points_for_datasource(alert_receive_channel_id, datasource_li
api_token=alert_receive_channel.organization.api_token,
)
# list of datasource for which contact point creation was failed
datasource_to_create = []
datasources_to_create = []
for datasource in datasource_list:
contact_point = None
config, response_info = client.get_alerting_config(datasource["id"])
@ -29,16 +57,22 @@ def create_contact_points_for_datasource(alert_receive_channel_id, datasource_li
if response_info.get("status_code") == status.HTTP_404_NOT_FOUND:
client.get_alertmanager_status_with_config(datasource["id"])
contact_point = alert_receive_channel.grafana_alerting_sync_manager.create_contact_point(datasource)
elif response_info.get("status_code") == status.HTTP_400_BAD_REQUEST:
logger.warning(
f"Failed to create contact point for integration {alert_receive_channel_id}, "
f"datasource info: {datasource}; response: {response_info}"
)
continue
else:
contact_point = alert_receive_channel.grafana_alerting_sync_manager.create_contact_point(datasource)
if contact_point is None:
# Failed to create contact point duo to getting wrong alerting config.
# Add datasource to list and retry to create contact point for it again
datasource_to_create.append(datasource)
datasources_to_create.append(datasource)
# if some contact points were not created, restart task for them
if datasource_to_create:
create_contact_points_for_datasource.apply_async((alert_receive_channel_id, datasource_to_create), countdown=5)
if datasources_to_create:
schedule_create_contact_points_for_datasource(alert_receive_channel_id, datasources_to_create)
else:
alert_receive_channel.is_finished_alerting_setup = True
alert_receive_channel.save(update_fields=["is_finished_alerting_setup"])