Limit number of alertmanager alerts in alert group to autoresolve (#1779)
# What this PR does This PR set the limit so that workers won't attempt to autoresolve too big alertmanager alert groups. ## Which issue(s) this PR fixes ## Checklist - [ ] Unit, integration, and e2e (if applicable) tests updated - [ ] Documentation added (or `pr:no public docs` PR label added if not required) - [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required)
This commit is contained in:
parent
e404d2f4b6
commit
6e61643750
6 changed files with 20 additions and 7 deletions
|
|
@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
- Update shift API to use a default interval value (`1`) when a `frequency` is set and no `interval` is given
|
- Update shift API to use a default interval value (`1`) when a `frequency` is set and no `interval` is given
|
||||||
|
- Limit the number of alertmanager alerts in an alert group considered for autoresolve to 500 ([1779](https://github.com/grafana/oncall/pull/1779))
|
||||||
|
|
||||||
## v1.2.14 (2023-04-19)
|
## v1.2.14 (2023-04-19)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -68,5 +68,5 @@ Alertmanager offers three alert grouping options:
|
||||||
distraction. Grafana OnCall grouping will help manage this in the following ways:
|
distraction. Grafana OnCall grouping will help manage this in the following ways:
|
||||||
|
|
||||||
- Grafana OnCall groups alerts based on the first label of each alert.
|
- Grafana OnCall groups alerts based on the first label of each alert.
|
||||||
- Grafana OnCall marks an incident as resolved only when the amount of grouped alerts with state `resolved` equals
|
- Grafana OnCall marks an alert group as resolved only when there are fewer than 500 grouped
|
||||||
the amount of alerts with state `firing`.
|
alerts, and every `firing` alert with the same labels has a corresponding `resolved` alert
|
||||||
|
|
|
||||||
|
|
@ -145,7 +145,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
|
||||||
"GroupData", ["is_resolve_signal", "group_distinction", "web_title_cache", "is_acknowledge_signal"]
|
"GroupData", ["is_resolve_signal", "group_distinction", "web_title_cache", "is_acknowledge_signal"]
|
||||||
)
|
)
|
||||||
|
|
||||||
SOURCE, USER, NOT_YET, LAST_STEP, ARCHIVED, WIPED, DISABLE_MAINTENANCE = range(7)
|
SOURCE, USER, NOT_YET, LAST_STEP, ARCHIVED, WIPED, DISABLE_MAINTENANCE, NOT_YET_STOP_AUTORESOLVE = range(8)
|
||||||
SOURCE_CHOICES = (
|
SOURCE_CHOICES = (
|
||||||
(SOURCE, "source"),
|
(SOURCE, "source"),
|
||||||
(USER, "user"),
|
(USER, "user"),
|
||||||
|
|
@ -154,6 +154,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
|
||||||
(ARCHIVED, "archived"),
|
(ARCHIVED, "archived"),
|
||||||
(WIPED, "wiped"),
|
(WIPED, "wiped"),
|
||||||
(DISABLE_MAINTENANCE, "stop maintenance"),
|
(DISABLE_MAINTENANCE, "stop maintenance"),
|
||||||
|
(NOT_YET_STOP_AUTORESOLVE, "not yet, autoresolve disabled"),
|
||||||
)
|
)
|
||||||
|
|
||||||
ACKNOWLEDGE = "acknowledge"
|
ACKNOWLEDGE = "acknowledge"
|
||||||
|
|
|
||||||
|
|
@ -7,12 +7,16 @@ from apps.alerts.models import Alert, AlertGroup
|
||||||
|
|
||||||
|
|
||||||
class AlertGroupForAlertManager(AlertGroup):
|
class AlertGroupForAlertManager(AlertGroup):
|
||||||
|
MAX_ALERTS_IN_GROUP_FOR_AUTO_RESOLVE = 500
|
||||||
|
|
||||||
def is_alert_a_resolve_signal(self, alert):
|
def is_alert_a_resolve_signal(self, alert):
|
||||||
non_resolved_hashes = set()
|
non_resolved_hashes = set()
|
||||||
hash = alert.get_integration_optimization_hash()
|
hash = alert.get_integration_optimization_hash()
|
||||||
if alert.calculated_is_resolve_signal:
|
if alert.calculated_is_resolve_signal:
|
||||||
# Calculate leftover hashes
|
# Calculate leftover hashes
|
||||||
for alert in AlertForAlertManager.objects.filter(group=self).exclude(pk=alert.pk).all():
|
for alert in AlertForAlertManager.objects.filter(group=self).exclude(pk=alert.pk)[
|
||||||
|
: AlertGroupForAlertManager.MAX_ALERTS_IN_GROUP_FOR_AUTO_RESOLVE
|
||||||
|
]:
|
||||||
if alert.calculated_is_resolve_signal:
|
if alert.calculated_is_resolve_signal:
|
||||||
try:
|
try:
|
||||||
non_resolved_hashes.remove(alert.get_integration_optimization_hash())
|
non_resolved_hashes.remove(alert.get_integration_optimization_hash())
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,11 @@ def resolve_alert_group_by_source_if_needed(alert_group_pk):
|
||||||
alert_group.active_resolve_calculation_id
|
alert_group.active_resolve_calculation_id
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
if alert_group.resolved_by == alert_group.NOT_YET_STOP_AUTORESOLVE:
|
||||||
|
return "alert_group is too big to auto-resolve"
|
||||||
|
if alert_group.alerts.count() > AlertGroupForAlertManager.MAX_ALERTS_IN_GROUP_FOR_AUTO_RESOLVE:
|
||||||
|
alert_group.resolved_by = alert_group.NOT_YET_STOP_AUTORESOLVE
|
||||||
|
alert_group.save(update_fields=["resolved_by"])
|
||||||
last_alert = AlertForAlertManager.objects.get(pk=alert_group.alerts.last().pk)
|
last_alert = AlertForAlertManager.objects.get(pk=alert_group.alerts.last().pk)
|
||||||
if alert_group.is_alert_a_resolve_signal(last_alert):
|
if alert_group.is_alert_a_resolve_signal(last_alert):
|
||||||
alert_group.resolve_by_source()
|
alert_group.resolve_by_source()
|
||||||
|
|
|
||||||
|
|
@ -58,9 +58,11 @@ def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, f
|
||||||
return
|
return
|
||||||
|
|
||||||
if alert_receive_channel.allow_source_based_resolving:
|
if alert_receive_channel.allow_source_based_resolving:
|
||||||
task = resolve_alert_group_by_source_if_needed.apply_async((alert.group.pk,), countdown=5)
|
alert_group = alert.group
|
||||||
alert.group.active_resolve_calculation_id = task.id
|
if alert_group.resolved_by != alert_group.NOT_YET_STOP_AUTORESOLVE:
|
||||||
alert.group.save(update_fields=["active_resolve_calculation_id"])
|
task = resolve_alert_group_by_source_if_needed.apply_async((alert.group.pk,), countdown=5)
|
||||||
|
alert.group.active_resolve_calculation_id = task.id
|
||||||
|
alert.group.save(update_fields=["active_resolve_calculation_id"])
|
||||||
|
|
||||||
logger.info(f"Created alert {alert.pk} for alert group {alert.group.pk}")
|
logger.info(f"Created alert {alert.pk} for alert group {alert.group.pk}")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue