From 014a9c2ec227064fb38df5efe813bf57808ae630 Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Wed, 10 May 2023 08:36:23 -0400 Subject: [PATCH] allow the POST incoming alert endpoints to queue create_alert tasks independent of the database status (#1896) # What this PR does https://www.loom.com/share/18cc445117de4895a10892d56c7d3699 In preparation for upgrading our cloud databases, this PR makes some minor changes which, after testing locally, allowed the `POST /integrations/v1/<integration_type>/<integration_token>/` endpoints to successfully receive incoming alerts and queue the celery tasks. I've tested all of the defined `POST /integrations/v1/<integration_type>/<integration_token>/` endpoints by sending `POST` requests to an integration's URL while the MySQL database was down, bringing the database back up, and ensuring the alerts were created. ## Some other findings - the integration heartbeat endpoints will not work as we interact with the database to persist the incoming heartbeat instance - if the integration was created in the last 180 seconds, incoming alerts will fail due to the way we cache the integration IDs ([code](https://github.com/grafana/oncall/blob/dev/engine/apps/integrations/mixins/alert_channel_defining_mixin.py#L47-L50)) - The `create_alert` celery task is set to `max_retries=None` and `retry_backoff=True`. This means that the queued tasks will continue retrying forever with an exponential backoff, until the alerts can be created in the database (i.e., when the database is back online). 
## Checklist - [ ] Unit, integration, and e2e (if applicable) tests updated (N/A) - [ ] Documentation added (or `pr:no public docs` PR label added if not required) (N/A) - [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) (N/A) --- engine/apps/grafana_plugin/apps.py | 22 +++++++++++-------- .../mixins/alert_channel_defining_mixin.py | 15 +++++++------ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/engine/apps/grafana_plugin/apps.py b/engine/apps/grafana_plugin/apps.py index 3d76dfc0..995e6fd4 100644 --- a/engine/apps/grafana_plugin/apps.py +++ b/engine/apps/grafana_plugin/apps.py @@ -3,6 +3,7 @@ import sys from django.apps import AppConfig, apps from django.conf import settings +from django.db import OperationalError logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -25,13 +26,16 @@ class GrafanaPluginConfig(AppConfig): # TODO: this logic should probably be moved out to a common utility is_not_migration_script = any(startup_command in sys.argv for startup_command in STARTUP_COMMANDS) if is_not_migration_script and settings.IS_OPEN_SOURCE: - Organization = apps.get_model("user_management", "Organization") - has_existing_org = Organization.objects.first() is not None + try: + Organization = apps.get_model("user_management", "Organization") + has_existing_org = Organization.objects.first() is not None - # only enforce the following for new setups - if no organization exists in the database - # and the GRAFANA_API_URL env var is not specified, exit the application - if has_existing_org is False and settings.SELF_HOSTED_SETTINGS["GRAFANA_API_URL"] is None: - logger.error( - f"For OSS installations, GRAFANA_API_URL is a required environment variable. Please set it and restart the application." 
- ) - sys.exit() + # only enforce the following for new setups - if no organization exists in the database + # and the GRAFANA_API_URL env var is not specified, exit the application + if has_existing_org is False and settings.SELF_HOSTED_SETTINGS["GRAFANA_API_URL"] is None: + logger.error( + f"For OSS installations, GRAFANA_API_URL is a required environment variable. Please set it and restart the application." + ) + sys.exit() + except OperationalError: + pass diff --git a/engine/apps/integrations/mixins/alert_channel_defining_mixin.py b/engine/apps/integrations/mixins/alert_channel_defining_mixin.py index cebcee5e..fd6ac239 100644 --- a/engine/apps/integrations/mixins/alert_channel_defining_mixin.py +++ b/engine/apps/integrations/mixins/alert_channel_defining_mixin.py @@ -48,6 +48,7 @@ class AlertChannelDefiningMixin(object): if cache.get(self.CACHE_DB_FALLBACK_OBSOLETE_KEY) is None: cache.set(self.CACHE_DB_FALLBACK_OBSOLETE_KEY, True, self.CACHE_DB_FALLBACK_REFRESH_INTERVAL) self.update_alert_receive_channel_cache() + except AlertReceiveChannel.DoesNotExist: raise PermissionDenied("Integration key was not found. Permission denied.") except OperationalError: @@ -65,13 +66,13 @@ class AlertChannelDefiningMixin(object): else: logger.info("Cache is empty!") raise - - if alert_receive_channel.organization.is_moved: - raise OrganizationMovedException(alert_receive_channel.organization) - if alert_receive_channel.organization.deleted_at: - # It's better to raise OrganizarionDeletedException, but in legacy code PermissionDenied is returned when integration key not found. - # So, keep it consistent. - raise PermissionDenied("Integration key was not found. 
Permission denied.") + else: + if alert_receive_channel.organization.is_moved: + raise OrganizationMovedException(alert_receive_channel.organization) + if alert_receive_channel.organization.deleted_at: + # It's better to raise OrganizarionDeletedException, but in legacy code PermissionDenied is returned when integration key not found. + # So, keep it consistent. + raise PermissionDenied("Integration key was not found. Permission denied.") del kwargs["alert_channel_key"] kwargs["alert_receive_channel"] = alert_receive_channel