From 6e61643750ec5dc3e4b187bfb80458c791d85e9b Mon Sep 17 00:00:00 2001 From: Ildar Iskhakov Date: Mon, 24 Apr 2023 13:38:21 +0800 Subject: [PATCH] Limit number of alertmanager alerts in alert group to autoresolve (#1779) # What this PR does This PR sets a limit so that workers won't attempt to autoresolve alertmanager alert groups that are too big. ## Which issue(s) this PR fixes ## Checklist - [ ] Unit, integration, and e2e (if applicable) tests updated - [ ] Documentation added (or `pr:no public docs` PR label added if not required) - [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) --- CHANGELOG.md | 1 + .../configure-alertmanager/index.md | 4 ++-- engine/apps/alerts/models/alert_group.py | 3 ++- engine/apps/alerts/models/alert_manager_models.py | 6 +++++- .../tasks/resolve_alert_group_by_source_if_needed.py | 5 +++++ engine/apps/integrations/tasks.py | 8 +++++--- 6 files changed, 20 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8f0b7d3..90a86ddd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Update shift API to use a default interval value (`1`) when a `frequency` is set and no `interval` is given +- Limit the number of alertmanager alerts in an alert group considered for autoresolve to 500 ([1779](https://github.com/grafana/oncall/pull/1779)) ## v1.2.14 (2023-04-19) diff --git a/docs/sources/integrations/available-integrations/configure-alertmanager/index.md b/docs/sources/integrations/available-integrations/configure-alertmanager/index.md index a0e59e06..b43a5aa4 100644 --- a/docs/sources/integrations/available-integrations/configure-alertmanager/index.md +++ b/docs/sources/integrations/available-integrations/configure-alertmanager/index.md @@ -68,5 +68,5 @@ Alertmanager offers three alert grouping options: distraction.
Grafana OnCall grouping will help manage this in the following ways: - Grafana OnCall groups alerts based on the first label of each alert. - - Grafana OnCall marks an incident as resolved only when the amount of grouped alerts with state `resolved` equals - the amount of alerts with state `firing`. + - Grafana OnCall marks an alert group as resolved only when there are 500 or fewer grouped + alerts, and every `firing` alert with the same labels has a corresponding `resolved` alert diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index 4d71eb0a..bbf74613 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -145,7 +145,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. "GroupData", ["is_resolve_signal", "group_distinction", "web_title_cache", "is_acknowledge_signal"] ) - SOURCE, USER, NOT_YET, LAST_STEP, ARCHIVED, WIPED, DISABLE_MAINTENANCE = range(7) + SOURCE, USER, NOT_YET, LAST_STEP, ARCHIVED, WIPED, DISABLE_MAINTENANCE, NOT_YET_STOP_AUTORESOLVE = range(8) SOURCE_CHOICES = ( (SOURCE, "source"), (USER, "user"), @@ -154,6 +154,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
(ARCHIVED, "archived"), (WIPED, "wiped"), (DISABLE_MAINTENANCE, "stop maintenance"), + (NOT_YET_STOP_AUTORESOLVE, "not yet, autoresolve disabled"), ) ACKNOWLEDGE = "acknowledge" diff --git a/engine/apps/alerts/models/alert_manager_models.py b/engine/apps/alerts/models/alert_manager_models.py index 479e87cc..57995933 100644 --- a/engine/apps/alerts/models/alert_manager_models.py +++ b/engine/apps/alerts/models/alert_manager_models.py @@ -7,12 +7,16 @@ from apps.alerts.models import Alert, AlertGroup class AlertGroupForAlertManager(AlertGroup): + MAX_ALERTS_IN_GROUP_FOR_AUTO_RESOLVE = 500 + def is_alert_a_resolve_signal(self, alert): non_resolved_hashes = set() hash = alert.get_integration_optimization_hash() if alert.calculated_is_resolve_signal: # Calculate leftover hashes - for alert in AlertForAlertManager.objects.filter(group=self).exclude(pk=alert.pk).all(): + for alert in AlertForAlertManager.objects.filter(group=self).exclude(pk=alert.pk)[ + : AlertGroupForAlertManager.MAX_ALERTS_IN_GROUP_FOR_AUTO_RESOLVE + ]: if alert.calculated_is_resolve_signal: try: non_resolved_hashes.remove(alert.get_integration_optimization_hash()) diff --git a/engine/apps/alerts/tasks/resolve_alert_group_by_source_if_needed.py b/engine/apps/alerts/tasks/resolve_alert_group_by_source_if_needed.py index 43552cd7..2a6de51a 100644 --- a/engine/apps/alerts/tasks/resolve_alert_group_by_source_if_needed.py +++ b/engine/apps/alerts/tasks/resolve_alert_group_by_source_if_needed.py @@ -22,6 +22,11 @@ def resolve_alert_group_by_source_if_needed(alert_group_pk): alert_group.active_resolve_calculation_id ) else: + if alert_group.resolved_by == alert_group.NOT_YET_STOP_AUTORESOLVE: + return "alert_group is too big to auto-resolve" + if alert_group.alerts.count() > AlertGroupForAlertManager.MAX_ALERTS_IN_GROUP_FOR_AUTO_RESOLVE: + alert_group.resolved_by = alert_group.NOT_YET_STOP_AUTORESOLVE + alert_group.save(update_fields=["resolved_by"]) last_alert = 
AlertForAlertManager.objects.get(pk=alert_group.alerts.last().pk) if alert_group.is_alert_a_resolve_signal(last_alert): alert_group.resolve_by_source() diff --git a/engine/apps/integrations/tasks.py b/engine/apps/integrations/tasks.py index c864fe15..05f59e02 100644 --- a/engine/apps/integrations/tasks.py +++ b/engine/apps/integrations/tasks.py @@ -58,9 +58,11 @@ def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, f return if alert_receive_channel.allow_source_based_resolving: - task = resolve_alert_group_by_source_if_needed.apply_async((alert.group.pk,), countdown=5) - alert.group.active_resolve_calculation_id = task.id - alert.group.save(update_fields=["active_resolve_calculation_id"]) + alert_group = alert.group + if alert_group.resolved_by != alert_group.NOT_YET_STOP_AUTORESOLVE: + task = resolve_alert_group_by_source_if_needed.apply_async((alert.group.pk,), countdown=5) + alert.group.active_resolve_calculation_id = task.id + alert.group.save(update_fields=["active_resolve_calculation_id"]) logger.info(f"Created alert {alert.pk} for alert group {alert.group.pk}")