OnCall prometheus metrics exporter (#1605)

# What this PR does
Add a Prometheus metrics exporter to OnCall, exposing per-integration alert group totals (by state) and alert group response time metrics.
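
The exporter follows the `prometheus_client` custom-collector pattern referenced in the code below (an `ApplicationMetricsCollector` registered on `application_metrics_registry`). A minimal standalone sketch of that pattern, with made-up metric values and assuming only that `prometheus_client` is installed (the HTTP endpoint wiring is not part of this excerpt):

```python
from prometheus_client import CollectorRegistry, generate_latest
from prometheus_client.metrics_core import GaugeMetricFamily


class SketchCollector:
    """Illustrative stand-in for ApplicationMetricsCollector; values are made up."""

    def collect(self):
        gauge = GaugeMetricFamily(
            "oncall_alert_groups_total",
            "All alert groups",
            labels=["integration", "team", "org_id", "slug", "id", "state"],
        )
        # One hypothetical integration with two firing alert groups.
        gauge.add_metric(["Test integration", "No team", "1", "stack", "1", "firing"], 2)
        yield gauge


registry = CollectorRegistry()
registry.register(SketchCollector())
print(generate_latest(registry).decode())  # Prometheus text exposition format
```

The real collector additionally emits an `oncall_alert_groups_response_time_seconds` histogram and reads its values from the Django cache, as shown in the diff below.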

## Which issue(s) this PR fixes

## Checklist

- [x] Tests updated
- [ ] Documentation added
- [ ] `CHANGELOG.md` updated

---------

Co-authored-by: Joey Orlando <joey.orlando@grafana.com>
Co-authored-by: Matias Bordese <mbordese@gmail.com>
Author: Yulya Artyukhina, 2023-05-25 20:26:13 +02:00 (committed by GitHub)
Parent: 64521e721d
Commit: 15ef692009
25 changed files with 1739 additions and 75 deletions

View file

@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased
### Added
- Prometheus exporter backend for alert groups related metrics
### Fixed
- Fix error when updating closed modal window in Slack by @vadimkerr ([#2019](https://github.com/grafana/oncall/pull/2019))

View file

@ -1,3 +1,6 @@
from enum import Enum
class ActionSource:
(
SLACK,
@ -10,3 +13,11 @@ class ActionSource:
TASK_DELAY_SECONDS = 1
NEXT_ESCALATION_DELAY = 5
# Verbal representation of AlertGroup states
class AlertGroupState(str, Enum):
FIRING = "firing"
ACKNOWLEDGED = "acknowledged"
RESOLVED = "resolved"
SILENCED = "silenced"

View file

@ -0,0 +1,23 @@
# Generated by Django 3.2.19 on 2023-05-23 13:55
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('alerts', '0015_auto_20230508_1641'),
]
operations = [
migrations.AddField(
model_name='alertgroup',
name='response_time',
field=models.DurationField(default=None, null=True),
),
migrations.AddField(
model_name='alertgroup',
name='restarted_at',
field=models.DateTimeField(blank=True, default=None, null=True),
),
]

View file

@ -11,16 +11,20 @@ from django.conf import settings
from django.core.validators import MinLengthValidator
from django.db import IntegrityError, models, transaction
from django.db.models import JSONField, Q, QuerySet
from django.db.models.signals import post_save
from django.dispatch import receiver
from django.utils import timezone
from django.utils.functional import cached_property
from django_deprecate_fields import deprecate_field
from apps.alerts.constants import AlertGroupState
from apps.alerts.escalation_snapshot import EscalationSnapshotMixin
from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
from apps.alerts.incident_appearance.renderers.slack_renderer import AlertGroupSlackRenderer
from apps.alerts.incident_log_builder import IncidentLogBuilder
from apps.alerts.signals import alert_group_action_triggered_signal, alert_group_created_signal
from apps.alerts.tasks import acknowledge_reminder_task, call_ack_url, send_alert_group_signal, unsilence_task
from apps.metrics_exporter.metrics_cache_manager import MetricsCacheManager
from apps.slack.slack_formatter import SlackFormatter
from apps.user_management.models import User
from common.public_primary_keys import generate_public_primary_key, increase_public_primary_key_length
@ -263,6 +267,10 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
silenced_until = models.DateTimeField(blank=True, null=True)
unsilence_task_uuid = models.CharField(max_length=100, null=True, default=None)
restarted_at = models.DateTimeField(blank=True, null=True, default=None)
response_time = models.DurationField(null=True, default=None)
@property
def is_silenced_forever(self):
return self.silenced and self.silenced_until is None
@ -502,9 +510,35 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def happened_while_maintenance(self):
return self.root_alert_group is not None and self.root_alert_group.maintenance_uuid is not None
def _get_response_time(self):
"""Return response_time based on current alert group status."""
response_time = None
timestamps = (self.acknowledged_at, self.resolved_at, self.silenced_at, self.wiped_at)
min_timestamp = min((ts for ts in timestamps if ts), default=None)
if min_timestamp:
response_time = min_timestamp - self.started_at
return response_time
def _update_metrics(self, organization_id, previous_state, state):
"""Update metrics cache for response time and state as needed."""
updated_response_time = self.response_time
if previous_state != AlertGroupState.FIRING or self.restarted_at:
# only consider response time from the first action
updated_response_time = None
MetricsCacheManager.metrics_update_cache_for_alert_group(
self.channel_id,
organization_id=organization_id,
old_state=previous_state,
new_state=state,
response_time=updated_response_time,
started_at=self.started_at,
)
def acknowledge_by_user(self, user: User, action_source: Optional[str] = None) -> None:
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
logger.debug(f"Started acknowledge_by_user for alert_group {self.pk}")
# if incident was silenced or resolved, unsilence/unresolve it without starting escalation
if self.silenced:
self.un_silence()
@ -519,6 +553,9 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
self.log_records.create(type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, reason="Acknowledge button")
self.acknowledge(acknowledged_by_user=user, acknowledged_by=AlertGroup.USER)
# Update alert group state and response time metrics cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
self.stop_escalation()
if self.is_root_alert_group:
self.start_ack_reminder(user)
@ -546,6 +583,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def acknowledge_by_source(self):
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
# if incident was silenced, unsilence it without starting escalation
if self.silenced:
@ -556,6 +594,10 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
reason="Acknowledge by source",
)
self.acknowledge(acknowledged_by=AlertGroup.SOURCE)
# Update alert group state and response time metrics cache
self._update_metrics(
organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
)
self.stop_escalation()
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_ACK)
@ -576,9 +618,12 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def un_acknowledge_by_user(self, user: User, action_source: Optional[str] = None) -> None:
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
logger.debug(f"Started un_acknowledge_by_user for alert_group {self.pk}")
self.unacknowledge()
# Update alert group state metric cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
if self.is_root_alert_group:
self.start_escalation_if_needed()
@ -601,6 +646,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def resolve_by_user(self, user: User, action_source: Optional[str] = None) -> None:
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
# if incident was silenced, unsilence it without starting escalation
if self.silenced:
@ -612,6 +658,8 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
reason="Resolve button",
)
self.resolve(resolved_by=AlertGroup.USER, resolved_by_user=user)
# Update alert group state and response time metrics cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
self.stop_escalation()
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED, author=user)
@ -631,6 +679,8 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def resolve_by_source(self):
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
# if incident was silenced, unsilence it without starting escalation
if self.silenced:
self.un_silence()
@ -640,6 +690,10 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
reason="Resolve by source",
)
self.resolve(resolved_by=AlertGroup.SOURCE)
# Update alert group state and response time metrics cache
self._update_metrics(
organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
)
self.stop_escalation()
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
@ -657,6 +711,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
for dependent_alert_group in self.dependent_alert_groups.all():
dependent_alert_group.resolve_by_source()
# deprecated
def resolve_by_archivation(self):
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
# if incident was silenced, unsilence it without starting escalation
@ -690,8 +745,13 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def resolve_by_last_step(self):
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
self.resolve(resolved_by=AlertGroup.LAST_STEP)
# Update alert group state and response time metrics cache
self._update_metrics(
organization_id=self.channel.organization_id, previous_state=initial_state, state=self.state
)
self.stop_escalation()
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED)
@ -735,7 +795,11 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
if self.wiped_at is None:
initial_state = self.state
self.unresolve()
# Update alert group state metric cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
log_record = self.log_records.create(type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user)
if self.is_root_alert_group:
@ -906,6 +970,8 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def silence_by_user(self, user: User, silence_delay: Optional[int], action_source: Optional[str] = None) -> None:
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
if self.resolved:
self.unresolve()
self.log_records.create(type=AlertGroupLogRecord.TYPE_UN_RESOLVED, author=user, reason="Silence button")
@ -935,6 +1001,8 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
silenced_until = None
self.silence(silenced_at=now, silenced_until=silenced_until, silenced_by_user=user)
# Update alert group state and response time metrics cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_SILENCE,
@ -959,8 +1027,12 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def un_silence_by_user(self, user: User, action_source: Optional[str] = None) -> None:
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
self.un_silence()
# Update alert group state metric cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
if self.is_root_alert_group:
self.start_escalation_if_needed()
@ -988,6 +1060,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def wipe_by_user(self, user: User) -> None:
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
if not self.wiped_at:
self.resolve(resolved_by=AlertGroup.WIPED)
@ -996,10 +1069,19 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
self.web_title_cache = None
self.wiped_at = timezone.now()
self.wiped_by = user
update_fields = ["distinction", "web_title_cache", "wiped_at", "wiped_by"]
if self.response_time is None:
self.response_time = self._get_response_time()
update_fields += ["response_time"]
for alert in self.alerts.all():
alert.wipe(wiped_by=self.wiped_by, wiped_at=self.wiped_at)
self.save(update_fields=["distinction", "web_title_cache", "wiped_at", "wiped_by"])
self.save(update_fields=update_fields)
# Update alert group state and response time metrics cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=self.state)
log_record = self.log_records.create(
type=AlertGroupLogRecord.TYPE_WIPED,
@ -1023,6 +1105,8 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
def delete_by_user(self, user: User):
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
initial_state = self.state
self.stop_escalation()
# prevent creating multiple logs
# filter instead of get_or_create because there can be multiple logs of this type due to deletion errors
@ -1052,6 +1136,8 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
dependent_alerts = list(self.dependent_alert_groups.all())
self.hard_delete()
# Update alert group state metric cache
self._update_metrics(organization_id=user.organization_id, previous_state=initial_state, state=None)
for dependent_alert_group in dependent_alerts: # unattach dependent incidents
dependent_alert_group.un_attach_by_delete()
@ -1076,31 +1162,52 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
# it is needed to unresolve those alert_groups which were resolved to build a proper log.
alert_groups_to_unresolve_before_acknowledge = alert_groups_to_acknowledge.filter(resolved=True)
alert_groups_to_unresolve_before_acknowledge = alert_groups_to_acknowledge.filter(resolved=models.Value("1"))
# it is needed to unsilence those alert_groups which were silenced to build proper log.
alert_groups_to_unsilence_before_acknowledge = alert_groups_to_acknowledge.filter(silenced=True)
alert_groups_to_unsilence_before_acknowledge = alert_groups_to_acknowledge.filter(silenced=models.Value("1"))
# convert current qs to list to prevent changes by update
alert_groups_to_acknowledge_list = list(alert_groups_to_acknowledge)
alert_groups_to_unresolve_before_acknowledge_list = list(alert_groups_to_unresolve_before_acknowledge)
alert_groups_to_unsilence_before_acknowledge_list = list(alert_groups_to_unsilence_before_acknowledge)
alert_groups_to_acknowledge.update(
acknowledged=True,
resolved=False,
resolved_at=None,
resolved_by=AlertGroup.NOT_YET,
resolved_by_user=None,
silenced_until=None,
silenced_by_user=None,
silenced_at=None,
silenced=False,
acknowledged_at=timezone.now(),
acknowledged_by_user=user,
acknowledged_by=AlertGroup.USER,
is_escalation_finished=True,
)
previous_states = []
for alert_group in alert_groups_to_acknowledge_list:
previous_states.append(alert_group.state)
alert_group.acknowledged = True
alert_group.resolved = False
alert_group.resolved_at = None
alert_group.resolved_by = AlertGroup.NOT_YET
alert_group.resolved_by_user = None
alert_group.silenced_until = None
alert_group.silenced_by_user = None
alert_group.silenced_at = None
alert_group.silenced = False
alert_group.acknowledged_at = timezone.now()
alert_group.acknowledged_by_user = user
alert_group.acknowledged_by = AlertGroup.USER
alert_group.is_escalation_finished = True
if alert_group.response_time is None:
alert_group.response_time = alert_group._get_response_time()
fields_to_update = [
"acknowledged",
"resolved",
"resolved_at",
"resolved_by",
"resolved_by_user",
"silenced_until",
"silenced_by_user",
"silenced_at",
"silenced",
"acknowledged_at",
"acknowledged_by_user",
"acknowledged_by",
"is_escalation_finished",
"response_time",
]
AlertGroup.all_objects.bulk_update(alert_groups_to_acknowledge_list, fields=fields_to_update, batch_size=100)
for alert_group in alert_groups_to_unresolve_before_acknowledge_list:
alert_group.log_records.create(
@ -1114,7 +1221,13 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
type=AlertGroupLogRecord.TYPE_UN_SILENCE, author=user, reason="Bulk action acknowledge"
)
for alert_group in alert_groups_to_acknowledge_list:
for alert_group, previous_state in zip(alert_groups_to_acknowledge_list, previous_states):
# update metrics cache
alert_group._update_metrics(
organization_id=user.organization_id,
previous_state=previous_state,
state=AlertGroupState.ACKNOWLEDGED,
)
if alert_group.is_root_alert_group:
alert_group.start_ack_reminder(user)
@ -1147,31 +1260,55 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
# it is needed to unsilence those alert_groups which were silenced to build proper log.
alert_groups_to_unsilence_before_resolve = alert_groups_to_resolve.filter(silenced=True)
alert_groups_to_unsilence_before_resolve = alert_groups_to_resolve.filter(silenced=models.Value("1"))
# convert current qs to list to prevent changes by update
alert_groups_to_resolve_list = list(alert_groups_to_resolve)
alert_groups_to_unsilence_before_resolve_list = list(alert_groups_to_unsilence_before_resolve)
alert_groups_to_resolve.update(
resolved=True,
resolved_at=timezone.now(),
is_open_for_grouping=None,
resolved_by_user=user,
resolved_by=AlertGroup.USER,
is_escalation_finished=True,
silenced_until=None,
silenced_by_user=None,
silenced_at=None,
silenced=False,
)
previous_states = []
for alert_group in alert_groups_to_resolve_list:
previous_states.append(alert_group.state)
alert_group.resolved = True
alert_group.resolved_at = timezone.now()
alert_group.is_open_for_grouping = None
alert_group.resolved_by_user = user
alert_group.resolved_by = AlertGroup.USER
alert_group.is_escalation_finished = True
alert_group.silenced_until = None
alert_group.silenced_by_user = None
alert_group.silenced_at = None
alert_group.silenced = False
if alert_group.response_time is None:
alert_group.response_time = alert_group._get_response_time()
fields_to_update = [
"resolved",
"resolved_at",
"resolved_by",
"resolved_by_user",
"is_open_for_grouping",
"silenced_until",
"silenced_by_user",
"silenced_at",
"silenced",
"is_escalation_finished",
"response_time",
]
AlertGroup.all_objects.bulk_update(alert_groups_to_resolve_list, fields=fields_to_update, batch_size=100)
for alert_group in alert_groups_to_unsilence_before_resolve_list:
alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_SILENCE, author=user, reason="Bulk action resolve"
)
for alert_group in alert_groups_to_resolve_list:
for alert_group, previous_state in zip(alert_groups_to_resolve_list, previous_states):
# update metrics cache
alert_group._update_metrics(
organization_id=user.organization_id,
previous_state=previous_state,
state=AlertGroupState.RESOLVED,
)
log_record = alert_group.log_records.create(type=AlertGroupLogRecord.TYPE_RESOLVED, author=user)
send_alert_group_signal.apply_async((log_record.pk,))
@ -1223,10 +1360,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
silenced_by_user=None,
silenced_at=None,
silenced=False,
restarted_at=timezone.now(),
)
# unacknowledge alert groups
for alert_group in alert_groups_to_restart_unack_list:
# update metrics cache (note alert_group.state is the original alert group's state)
alert_group._update_metrics(
organization_id=user.organization_id,
previous_state=alert_group.state,
state=AlertGroupState.FIRING,
)
log_record = alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_ACK,
author=user,
@ -1259,10 +1403,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
silenced_by_user=None,
silenced_at=None,
silenced=False,
restarted_at=timezone.now(),
)
# unresolve alert groups
for alert_group in alert_groups_to_restart_unresolve_list:
# update metrics cache (note alert_group.state is the original alert group's state)
alert_group._update_metrics(
organization_id=user.organization_id,
previous_state=alert_group.state,
state=AlertGroupState.FIRING,
)
log_record = alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
author=user,
@ -1295,10 +1446,17 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
silenced_by_user=None,
silenced_at=None,
silenced=False,
restarted_at=timezone.now(),
)
# unsilence alert groups
for alert_group in alert_groups_to_restart_unsilence_list:
# update metrics cache (note alert_group.state is the original alert group's state)
alert_group._update_metrics(
organization_id=user.organization_id,
previous_state=alert_group.state,
state=AlertGroupState.FIRING,
)
log_record = alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_SILENCE, author=user, reason="Bulk action restart"
)
@ -1363,38 +1521,45 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
alert_groups_to_unacknowledge_before_silence_list = list(alert_groups_to_unacknowledge_before_silence)
alert_groups_to_unresolve_before_silence_list = list(alert_groups_to_unresolve_before_silence)
if silence_for_period:
alert_groups_to_silence.update(
acknowledged=False,
acknowledged_at=None,
acknowledged_by_user=None,
acknowledged_by=AlertGroup.NOT_YET,
resolved=False,
resolved_at=None,
resolved_by_user=None,
resolved_by=AlertGroup.NOT_YET,
silenced=True,
silenced_at=now,
silenced_until=silenced_until,
silenced_by_user=user,
)
else:
alert_groups_to_silence.update(
acknowledged=False,
acknowledged_at=None,
acknowledged_by_user=None,
acknowledged_by=AlertGroup.NOT_YET,
resolved=False,
resolved_at=None,
resolved_by_user=None,
resolved_by=AlertGroup.NOT_YET,
silenced=True,
silenced_at=now,
silenced_until=silenced_until,
silenced_by_user=user,
is_escalation_finished=True,
)
previous_states = []
for alert_group in alert_groups_to_silence_list:
previous_states.append(alert_group.state)
alert_group.acknowledged = False
alert_group.acknowledged_at = None
alert_group.acknowledged_by_user = None
alert_group.acknowledged_by = AlertGroup.NOT_YET
alert_group.resolved = False
alert_group.resolved_at = None
alert_group.resolved_by_user = None
alert_group.resolved_by = AlertGroup.NOT_YET
alert_group.silenced = True
alert_group.silenced_at = now
alert_group.silenced_until = silenced_until
alert_group.silenced_by_user = user
if not silence_for_period:
alert_group.is_escalation_finished = True
if alert_group.response_time is None:
alert_group.response_time = alert_group._get_response_time()
fields_to_update = [
"acknowledged",
"acknowledged_at",
"acknowledged_by_user",
"acknowledged_by",
"resolved",
"resolved_at",
"resolved_by_user",
"resolved_by",
"silenced",
"silenced_at",
"silenced_until",
"silenced_by_user",
"is_escalation_finished",
"response_time",
]
AlertGroup.all_objects.bulk_update(alert_groups_to_silence_list, fields=fields_to_update, batch_size=100)
# create log records
for alert_group in alert_groups_to_unresolve_before_silence_list:
alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_UN_RESOLVED,
@ -1416,7 +1581,13 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
reason="Bulk action silence",
)
for alert_group in alert_groups_to_silence_list:
for alert_group, previous_state in zip(alert_groups_to_silence_list, previous_states):
# update metrics cache
alert_group._update_metrics(
organization_id=user.organization_id,
previous_state=previous_state,
state=AlertGroupState.SILENCED,
)
log_record = alert_group.log_records.create(
type=AlertGroupLogRecord.TYPE_SILENCE,
author=user,
@ -1498,7 +1669,12 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
for k, v in kwargs.items():
setattr(self, k, v)
self.save(update_fields=["acknowledged", "acknowledged_at", *kwargs.keys()])
update_fields = ["acknowledged", "acknowledged_at", *kwargs.keys()]
if self.response_time is None:
self.response_time = self._get_response_time()
update_fields += ["response_time"]
self.save(update_fields=update_fields)
def unacknowledge(self):
self.un_silence()
@ -1518,7 +1694,12 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
for k, v in kwargs.items():
setattr(self, k, v)
self.save(update_fields=["resolved", "resolved_at", "is_open_for_grouping", *kwargs.keys()])
update_fields = ["resolved", "resolved_at", "is_open_for_grouping", *kwargs.keys()]
if self.response_time is None:
self.response_time = self._get_response_time()
update_fields += ["response_time"]
self.save(update_fields=update_fields)
def unresolve(self):
self.unacknowledge()
@ -1538,7 +1719,12 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
for k, v in kwargs.items():
setattr(self, k, v)
self.save(update_fields=["silenced", *kwargs.keys()])
update_fields = ["silenced", *kwargs.keys()]
if self.response_time is None:
self.response_time = self._get_response_time()
update_fields += ["response_time"]
self.save(update_fields=update_fields)
def un_silence(self):
self.silenced_until = None
@ -1546,8 +1732,16 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
self.silenced_at = None
self.silenced = False
self.unsilence_task_uuid = None
self.restarted_at = timezone.now()
self.save(
update_fields=["silenced_until", "silenced", "silenced_by_user", "silenced_at", "unsilence_task_uuid"]
update_fields=[
"silenced_until",
"silenced",
"silenced_by_user",
"silenced_at",
"unsilence_task_uuid",
"restarted_at",
]
)
def archive(self):
@ -1621,13 +1815,13 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
@property
def state(self):
if self.resolved:
return "resolved"
return AlertGroupState.RESOLVED
elif self.acknowledged:
return "acknowledged"
return AlertGroupState.ACKNOWLEDGED
elif self.silenced:
return "silenced"
return AlertGroupState.SILENCED
else:
return "new"
return AlertGroupState.FIRING
@property
def notify_in_slack_enabled(self):
@ -1675,3 +1869,15 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
)
return stop_escalation_log
@receiver(post_save, sender=AlertGroup)
def listen_for_alertgroup_model_save(sender, instance, created, *args, **kwargs):
if created and not instance.is_maintenance_incident:
# Update alert group state and response time metrics cache
instance._update_metrics(
organization_id=instance.channel.organization_id, previous_state=None, state=AlertGroupState.FIRING
)
post_save.connect(listen_for_alertgroup_model_save, AlertGroup)
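
For illustration (not part of the diff): `_get_response_time` above derives the response time as the earliest of `acknowledged_at`, `resolved_at`, `silenced_at` and `wiped_at` minus `started_at`, and `_update_metrics` only forwards it when the previous state was firing and the group has not been restarted, so only the first action counts. A self-contained sketch of the calculation with made-up timestamps:

```python
from datetime import datetime, timedelta
from typing import Optional


def response_time(started_at: datetime, *action_timestamps: Optional[datetime]) -> Optional[timedelta]:
    """Mimics AlertGroup._get_response_time: time until the earliest recorded action, if any."""
    first_action = min((ts for ts in action_timestamps if ts), default=None)
    return first_action - started_at if first_action else None


started = datetime(2023, 5, 25, 12, 0)
acknowledged = datetime(2023, 5, 25, 12, 3)
print(response_time(started, acknowledged, None, None, None))  # 0:03:00
```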

View file

@ -23,6 +23,11 @@ from apps.base.messaging import get_messaging_backend_from_id
from apps.base.utils import live_settings
from apps.integrations.metadata import heartbeat
from apps.integrations.tasks import create_alert, create_alertmanager_alerts
from apps.metrics_exporter.helpers import (
metrics_add_integration_to_cache,
metrics_remove_deleted_integration_from_cache,
metrics_update_integration_cache,
)
from apps.slack.constants import SLACK_RATE_LIMIT_DELAY, SLACK_RATE_LIMIT_TIMEOUT
from apps.slack.tasks import post_slack_rate_limit_message
from apps.slack.utils import post_message_to_channel
@ -256,7 +261,15 @@ class AlertReceiveChannel(IntegrationOptionsMixin, MaintainableObject):
def grafana_alerting_sync_manager(self):
return GrafanaAlertingSyncManager(self)
@property
@cached_property
def team_name(self):
return self.team.name if self.team else "No team"
@cached_property
def team_id_or_no_team(self):
return self.team_id if self.team else "no_team"
@cached_property
def emojized_verbal_name(self):
return emoji.emojize(self.verbal_name, use_aliases=True)
@ -615,6 +628,13 @@ def listen_for_alertreceivechannel_model_save(sender, instance, created, *args,
heartbeat = IntegrationHeartBeat.objects.create(alert_receive_channel=instance, timeout_seconds=TEN_MINUTES)
write_resource_insight_log(instance=heartbeat, author=instance.author, event=EntityEvent.CREATED)
metrics_add_integration_to_cache(instance)
elif instance.deleted_at:
metrics_remove_deleted_integration_from_cache(instance)
else:
metrics_update_integration_cache(instance)
if instance.integration == AlertReceiveChannel.INTEGRATION_GRAFANA_ALERTING:
if created:
instance.grafana_alerting_sync_manager.create_contact_points()

View file

@ -30,8 +30,16 @@ def unsilence_task(alert_group_pk):
)
return
if alert_group.status == AlertGroup.SILENCED and alert_group.is_root_alert_group:
initial_state = alert_group.state
task_logger.info(f"unsilence alert_group {alert_group_pk} and start escalation if needed")
alert_group.un_silence()
# update metrics
alert_group._update_metrics(
organization_id=alert_group.channel.organization_id,
previous_state=initial_state,
state=alert_group.state,
)
alert_group.start_escalation_if_needed()
un_silence_log_record = AlertGroupLogRecord(
type=AlertGroupLogRecord.TYPE_UN_SILENCE,

View file

@ -69,3 +69,4 @@ def test_unsilence_alert_group(
assert alert_group.silenced_at is None
assert alert_group.silenced_until is None
assert alert_group.silenced_by_user is None
assert alert_group.restarted_at is not None

View file

View file

@ -0,0 +1,51 @@
import typing
from django.utils import timezone
class AlertGroupsTotalMetricsDict(typing.TypedDict):
integration_name: str
team_name: str
team_id: int
org_id: int
slug: str
id: int
firing: int
acknowledged: int
silenced: int
resolved: int
class AlertGroupsResponseTimeMetricsDict(typing.TypedDict):
integration_name: str
team_name: str
team_id: int
org_id: int
slug: str
id: int
response_time: list
class RecalculateMetricsTimer(typing.TypedDict):
recalculate_timeout: int
forced_started: bool
class RecalculateOrgMetricsDict(typing.TypedDict):
organization_id: int
force: bool
ALERT_GROUPS_TOTAL = "oncall_alert_groups_total"
ALERT_GROUPS_RESPONSE_TIME = "oncall_alert_groups_response_time_seconds"
METRICS_RESPONSE_TIME_CALCULATION_PERIOD = timezone.timedelta(days=7)
METRICS_CACHE_LIFETIME = 93600  # 26 hours. Should be higher than METRICS_RECALCULATION_CACHE_TIMEOUT
METRICS_CACHE_TIMER = "metrics_cache_timer"
METRICS_RECALCULATION_CACHE_TIMEOUT = 86400 # 24 hours. Should be less than METRICS_CACHE_LIFETIME
METRICS_RECALCULATION_CACHE_TIMEOUT_DISPERSE = (0, 3600) # 1 hour
METRICS_ORGANIZATIONS_IDS = "metrics_organizations_ids"
METRICS_ORGANIZATIONS_IDS_CACHE_TIMEOUT = 3600 # 1 hour

View file

@ -0,0 +1,227 @@
import random
import typing
from django.apps import apps
from django.core.cache import cache
from django.utils import timezone
from apps.alerts.constants import AlertGroupState
from apps.metrics_exporter.constants import (
ALERT_GROUPS_RESPONSE_TIME,
ALERT_GROUPS_TOTAL,
METRICS_CACHE_LIFETIME,
METRICS_CACHE_TIMER,
METRICS_ORGANIZATIONS_IDS,
METRICS_ORGANIZATIONS_IDS_CACHE_TIMEOUT,
METRICS_RECALCULATION_CACHE_TIMEOUT,
METRICS_RECALCULATION_CACHE_TIMEOUT_DISPERSE,
METRICS_RESPONSE_TIME_CALCULATION_PERIOD,
AlertGroupsResponseTimeMetricsDict,
AlertGroupsTotalMetricsDict,
)
def get_organization_ids():
"""Try getting organizations ids from cache, otherwise get from db and save values in cache"""
Organization = apps.get_model("user_management", "Organization")
organizations_ids = cache.get(METRICS_ORGANIZATIONS_IDS, [])
if not organizations_ids:
organizations_ids = Organization.objects.all().values_list("id", flat=True)
organizations_ids = list(organizations_ids)
cache.set(METRICS_ORGANIZATIONS_IDS, organizations_ids, METRICS_ORGANIZATIONS_IDS_CACHE_TIMEOUT)
return organizations_ids
def get_response_time_period():
"""Returns period for response time calculation"""
return timezone.now() - METRICS_RESPONSE_TIME_CALCULATION_PERIOD
def get_metrics_recalculation_timeout():
"""
Returns timeout when metrics should be recalculated.
Add some dispersion to avoid starting recalculation tasks for all organizations at the same time.
"""
return METRICS_RECALCULATION_CACHE_TIMEOUT + random.randint(*METRICS_RECALCULATION_CACHE_TIMEOUT_DISPERSE)
def get_metrics_cache_timeout(organization_id):
metrics_cache_timer_key = get_metrics_cache_timer_key(organization_id)
metrics_cache_timer = cache.get(metrics_cache_timer_key)
if metrics_cache_timer:
TWO_HOURS = 7200
metrics_cache_timeout = int(metrics_cache_timer.get("recalculate_timeout")) + TWO_HOURS
else:
metrics_cache_timeout = METRICS_CACHE_LIFETIME
return metrics_cache_timeout
def get_metrics_cache_timer_key(organization_id):
return f"{METRICS_CACHE_TIMER}_{organization_id}"
def get_metrics_cache_timer_for_organization(organization_id):
key = get_metrics_cache_timer_key(organization_id)
return cache.get(key)
def get_metric_alert_groups_total_key(organization_id):
return f"{ALERT_GROUPS_TOTAL}_{organization_id}"
def get_metric_alert_groups_response_time_key(organization_id):
return f"{ALERT_GROUPS_RESPONSE_TIME}_{organization_id}"
def metrics_update_integration_cache(integration):
"""Update integration data in metrics cache"""
metrics_cache_timeout = get_metrics_cache_timeout(integration.organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(integration.organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(integration.organization_id)
for metric_key in [metric_alert_groups_total_key, metric_alert_groups_response_time_key]:
metric_cache = cache.get(metric_key, {})
integration_metric_cache = metric_cache.get(integration.id)
if integration_metric_cache:
cache_updated = False
if integration_metric_cache["team_id"] != integration.team_id_or_no_team:
integration_metric_cache["team_id"] = integration.team_id_or_no_team
integration_metric_cache["team_name"] = integration.team_name
cache_updated = True
if integration_metric_cache["integration_name"] != integration.emojized_verbal_name:
integration_metric_cache["integration_name"] = integration.emojized_verbal_name
cache_updated = True
if cache_updated:
cache.set(metric_key, metric_cache, timeout=metrics_cache_timeout)
def metrics_remove_deleted_integration_from_cache(integration):
"""Remove data related to deleted integration from metrics cache"""
metrics_cache_timeout = get_metrics_cache_timeout(integration.organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(integration.organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(integration.organization_id)
for metric_key in [metric_alert_groups_total_key, metric_alert_groups_response_time_key]:
metric_cache = cache.get(metric_key)
if metric_cache:
metric_cache.pop(integration.id, None)
cache.set(metric_key, metric_cache, timeout=metrics_cache_timeout)
def metrics_add_integration_to_cache(integration):
"""Add new integration data to metrics cache"""
metrics_cache_timeout = get_metrics_cache_timeout(integration.organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(integration.organization_id)
instance_slug = integration.organization.stack_slug
instance_id = integration.organization.stack_id
grafana_org_id = integration.organization.org_id
metric_alert_groups_total: typing.Dict[int, AlertGroupsTotalMetricsDict] = cache.get(
metric_alert_groups_total_key, {}
)
metric_alert_groups_total.setdefault(
integration.id,
{
"integration_name": integration.emojized_verbal_name,
"team_name": integration.team_name,
"team_id": integration.team_id_or_no_team,
"org_id": grafana_org_id,
"slug": instance_slug,
"id": instance_id,
AlertGroupState.FIRING.value: 0,
AlertGroupState.ACKNOWLEDGED.value: 0,
AlertGroupState.RESOLVED.value: 0,
AlertGroupState.SILENCED.value: 0,
},
)
cache.set(metric_alert_groups_total_key, metric_alert_groups_total, timeout=metrics_cache_timeout)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(integration.organization_id)
metric_alert_groups_response_time: typing.Dict[int, AlertGroupsResponseTimeMetricsDict] = cache.get(
metric_alert_groups_response_time_key, {}
)
metric_alert_groups_response_time.setdefault(
integration.id,
{
"integration_name": integration.emojized_verbal_name,
"team_name": integration.team_name,
"team_id": integration.team_id_or_no_team,
"org_id": grafana_org_id,
"slug": instance_slug,
"id": instance_id,
"response_time": [],
},
)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)
def metrics_bulk_update_team_label_cache(teams_updated_data, organization_id):
"""Update team related data in metrics cache for each team in `teams_updated_data`"""
if not teams_updated_data:
return
metrics_cache_timeout = get_metrics_cache_timeout(organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id)
metric_alert_groups_total = cache.get(metric_alert_groups_total_key, {})
metric_alert_groups_response_time = cache.get(metric_alert_groups_response_time_key, {})
for team_id, team_data in teams_updated_data.items():
for integration_id in metric_alert_groups_total:
if metric_alert_groups_total[integration_id]["team_id"] == team_id:
integration_response_time_metrics = metric_alert_groups_response_time.get(integration_id)
if team_data["deleted"]:
metric_alert_groups_total[integration_id]["team_id"] = "no_team"
metric_alert_groups_total[integration_id]["team_name"] = "No team"
if integration_response_time_metrics:
integration_response_time_metrics["team_id"] = "no_team"
integration_response_time_metrics["team_name"] = "No team"
else:
metric_alert_groups_total[integration_id]["team_name"] = team_data["team_name"]
if integration_response_time_metrics:
integration_response_time_metrics["team_name"] = team_data["team_name"]
cache.set(metric_alert_groups_total_key, metric_alert_groups_total, timeout=metrics_cache_timeout)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)
def metrics_update_alert_groups_state_cache(states_diff, organization_id):
"""Update alert groups state metric cache for each integration in states_diff dict."""
if not states_diff:
return
metrics_cache_timeout = get_metrics_cache_timeout(organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id)
metric_alert_groups_total = cache.get(metric_alert_groups_total_key, {})
if not metric_alert_groups_total:
return
for integration_id, integration_states_diff in states_diff.items():
integration_alert_groups = metric_alert_groups_total.get(int(integration_id))
if not integration_alert_groups:
continue
for previous_state, counter in integration_states_diff["previous_states"].items():
if integration_alert_groups[previous_state] - counter > 0:
integration_alert_groups[previous_state] -= counter
else:
integration_alert_groups[previous_state] = 0
for new_state, counter in integration_states_diff["new_states"].items():
integration_alert_groups[new_state] += counter
cache.set(metric_alert_groups_total_key, metric_alert_groups_total, timeout=metrics_cache_timeout)
def metrics_update_alert_groups_response_time_cache(integrations_response_time, organization_id):
"""Update alert groups response time metric cache for each integration in `integrations_response_time` dict."""
if not integrations_response_time:
return
metrics_cache_timeout = get_metrics_cache_timeout(organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id)
metric_alert_groups_response_time = cache.get(metric_alert_groups_response_time_key, {})
if not metric_alert_groups_response_time:
return
for integration_id, integration_response_time in integrations_response_time.items():
integration_response_time_metrics = metric_alert_groups_response_time.get(int(integration_id))
if not integration_response_time_metrics:
continue
integration_response_time_metrics["response_time"].extend(integration_response_time)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)
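
For illustration (not part of the diff): `metrics_update_alert_groups_state_cache` applies a per-integration diff of state counters, decrementing the previous states (never below zero) and incrementing the new ones. A standalone sketch of that bookkeeping on a plain dict:

```python
def apply_states_diff(counters: dict, previous_states: dict, new_states: dict) -> dict:
    """Mimics the counter update in metrics_update_alert_groups_state_cache."""
    for state, count in previous_states.items():
        counters[state] = max(counters[state] - count, 0)  # never drop below zero
    for state, count in new_states.items():
        counters[state] += count
    return counters


# One alert group moving from "firing" to "acknowledged":
counters = {"firing": 2, "acknowledged": 0, "silenced": 0, "resolved": 1}
print(apply_states_diff(counters, {"firing": 1}, {"acknowledged": 1}))
# {'firing': 1, 'acknowledged': 1, 'silenced': 0, 'resolved': 1}
```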

View file

@ -0,0 +1,86 @@
from apps.alerts.constants import AlertGroupState
from apps.metrics_exporter.helpers import (
get_response_time_period,
metrics_update_alert_groups_response_time_cache,
metrics_update_alert_groups_state_cache,
)
class MetricsCacheManager:
@staticmethod
def get_default_teams_diff_dict():
default_dict = {
"team_name": None,
"deleted": False,
}
return default_dict
@staticmethod
def update_team_diff(teams_diff, team_id, new_name=None, deleted=False):
teams_diff.setdefault(team_id, MetricsCacheManager.get_default_teams_diff_dict())
teams_diff[team_id]["team_name"] = new_name
teams_diff[team_id]["deleted"] = deleted
return teams_diff
@staticmethod
def get_default_states_diff_dict():
default_dict = {
"previous_states": {state.value: 0 for state in AlertGroupState},
"new_states": {state.value: 0 for state in AlertGroupState},
}
return default_dict
@staticmethod
def update_integration_states_diff(metrics_dict, integration_id, previous_state=None, new_state=None):
metrics_dict.setdefault(integration_id, MetricsCacheManager.get_default_states_diff_dict())
if previous_state:
state_value = previous_state.value
metrics_dict[integration_id]["previous_states"][state_value] += 1
if new_state:
state_value = new_state.value
metrics_dict[integration_id]["new_states"][state_value] += 1
return metrics_dict
@staticmethod
def update_integration_response_time_diff(metrics_dict, integration_id, response_time_seconds):
metrics_dict.setdefault(integration_id, [])
metrics_dict[integration_id].append(response_time_seconds)
return metrics_dict
@staticmethod
def metrics_update_state_cache_for_alert_group(integration_id, organization_id, old_state=None, new_state=None):
"""
Update state metric cache for one alert group.
Run the task to update async if organization_id is None due to an additional request to db
"""
metrics_state_diff = MetricsCacheManager.update_integration_states_diff(
{}, integration_id, previous_state=old_state, new_state=new_state
)
metrics_update_alert_groups_state_cache(metrics_state_diff, organization_id)
@staticmethod
def metrics_update_response_time_cache_for_alert_group(integration_id, organization_id, response_time_seconds):
"""
Update response time metric cache for one alert group.
Run the task to update async if organization_id is None due to an additional request to db
"""
metrics_response_time = MetricsCacheManager.update_integration_response_time_diff(
{}, integration_id, response_time_seconds
)
metrics_update_alert_groups_response_time_cache(metrics_response_time, organization_id)
@staticmethod
def metrics_update_cache_for_alert_group(
integration_id, organization_id, old_state=None, new_state=None, response_time=None, started_at=None
):
"""Call methods to update state and response time metrics cache for one alert group."""
if response_time and old_state == AlertGroupState.FIRING and started_at > get_response_time_period():
response_time_seconds = int(response_time.total_seconds())
MetricsCacheManager.metrics_update_response_time_cache_for_alert_group(
integration_id, organization_id, response_time_seconds
)
if old_state or new_state:
MetricsCacheManager.metrics_update_state_cache_for_alert_group(
integration_id, organization_id, old_state, new_state
)

View file

@ -0,0 +1,122 @@
import typing
from django.core.cache import cache
from prometheus_client import CollectorRegistry
from prometheus_client.metrics_core import GaugeMetricFamily, HistogramMetricFamily
from apps.alerts.constants import AlertGroupState
from apps.metrics_exporter.constants import (
ALERT_GROUPS_RESPONSE_TIME,
ALERT_GROUPS_TOTAL,
AlertGroupsResponseTimeMetricsDict,
AlertGroupsTotalMetricsDict,
RecalculateOrgMetricsDict,
)
from apps.metrics_exporter.helpers import (
get_metric_alert_groups_response_time_key,
get_metric_alert_groups_total_key,
get_metrics_cache_timer_for_organization,
get_organization_ids,
)
from apps.metrics_exporter.tasks import start_calculate_and_cache_metrics
application_metrics_registry = CollectorRegistry()
# https://github.com/prometheus/client_python#custom-collectors
class ApplicationMetricsCollector:
def __init__(self):
self._buckets = (60, 300, 600, 3600, "+Inf")
self._labels = [
"integration",
"team",
"org_id",
"slug",
"id",
]
self._labels_with_state = self._labels + ["state"]
def collect(self):
alert_groups_total = GaugeMetricFamily(ALERT_GROUPS_TOTAL, "All alert groups", labels=self._labels_with_state)
alert_groups_response_time_seconds = HistogramMetricFamily(
ALERT_GROUPS_RESPONSE_TIME, "Users response time to alert groups in 7 days (seconds)", labels=self._labels
)
metrics_to_recalculate = []
org_ids = get_organization_ids()
for organization_id in org_ids:
start_calculation_task = False
# get alert_groups_total metric
alert_groups_total_key = get_metric_alert_groups_total_key(organization_id)
ag_states: typing.Dict[int, AlertGroupsTotalMetricsDict] = cache.get(alert_groups_total_key)
if ag_states is None:
start_calculation_task = True
else:
for integration, integration_data in ag_states.items():
# Labels values should have the same order as _labels
labels_values = [
integration_data["integration_name"], # integration
integration_data["team_name"], # team
integration_data["org_id"], # org_id
integration_data["slug"], # grafana instance slug
integration_data["id"], # grafana instance id
]
labels_values = list(map(str, labels_values))
for state in AlertGroupState:
alert_groups_total.add_metric(labels_values + [state.value], integration_data[state.value])
# get alert_groups_response_time metric
alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id)
ag_response_time: typing.Dict[int, AlertGroupsResponseTimeMetricsDict] = cache.get(
alert_groups_response_time_key
)
if ag_response_time is None:
start_calculation_task = True
else:
for integration, integration_data in ag_response_time.items():
# Labels values should have the same order as _labels
labels_values = [
integration_data["integration_name"], # integration
integration_data["team_name"], # team
integration_data["org_id"], # org_id
integration_data["slug"], # grafana instance slug
integration_data["id"], # grafana instance id
]
labels_values = list(map(str, labels_values))
response_time_values = integration_data["response_time"]
if not response_time_values:
continue
buckets, sum_value = self.get_buckets_with_sum(response_time_values)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(labels_values, buckets=buckets, sum_value=sum_value)
if start_calculation_task or not get_metrics_cache_timer_for_organization(organization_id):
org_to_recalculate: RecalculateOrgMetricsDict = {
"organization_id": organization_id,
"force": start_calculation_task,
}
metrics_to_recalculate.append(org_to_recalculate)
if metrics_to_recalculate:
start_calculate_and_cache_metrics.apply_async((metrics_to_recalculate,))
yield alert_groups_total
yield alert_groups_response_time_seconds
def get_buckets_with_sum(self, values):
"""Put values in correct buckets and count values sum"""
buckets_values = {str(key): 0 for key in self._buckets}
sum_value = 0
for value in values:
for bucket in self._buckets:
if value <= float(bucket):
buckets_values[str(bucket)] += 1.0
sum_value += value
return buckets_values, sum_value
application_metrics_registry.register(ApplicationMetricsCollector())
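
For illustration (not part of the diff): `get_buckets_with_sum` builds cumulative histogram buckets, so each value increments every bucket whose upper bound it does not exceed, which is the shape `HistogramMetricFamily.add_metric` expects. A self-contained re-implementation with the collector's buckets and two sample response times:

```python
BUCKETS = (60, 300, 600, 3600, "+Inf")


def buckets_with_sum(values):
    """Mimics ApplicationMetricsCollector.get_buckets_with_sum (cumulative bucket counts)."""
    counts = {str(bucket): 0.0 for bucket in BUCKETS}
    total = 0
    for value in values:
        for bucket in BUCKETS:
            if value <= float(bucket):  # float("+Inf") is infinity, so the last bucket catches everything
                counts[str(bucket)] += 1.0
        total += value
    return counts, total


print(buckets_with_sum([30, 400]))
# ({'60': 1.0, '300': 1.0, '600': 2.0, '3600': 2.0, '+Inf': 2.0}, 430)
```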

View file

@ -0,0 +1,151 @@
import typing
from django.apps import apps
from django.conf import settings
from django.core.cache import cache
from django.db.models import Q
from apps.alerts.constants import AlertGroupState
from apps.metrics_exporter.constants import (
METRICS_ORGANIZATIONS_IDS,
METRICS_ORGANIZATIONS_IDS_CACHE_TIMEOUT,
AlertGroupsResponseTimeMetricsDict,
AlertGroupsTotalMetricsDict,
RecalculateMetricsTimer,
RecalculateOrgMetricsDict,
)
from apps.metrics_exporter.helpers import (
get_metric_alert_groups_response_time_key,
get_metric_alert_groups_total_key,
get_metrics_cache_timer_key,
get_metrics_recalculation_timeout,
get_response_time_period,
)
from apps.user_management.models import Organization
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
from common.database import get_random_readonly_database_key_if_present_otherwise_default
@shared_dedicated_queue_retry_task(
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
def save_organizations_ids_in_cache():
organizations_ids = Organization.objects.all().values_list("id", flat=True)
organizations_ids = list(organizations_ids)
cache.set(METRICS_ORGANIZATIONS_IDS, organizations_ids, METRICS_ORGANIZATIONS_IDS_CACHE_TIMEOUT)
@shared_dedicated_queue_retry_task(
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
def start_calculate_and_cache_metrics(metrics_to_recalculate: list[RecalculateOrgMetricsDict]):
"""Start calculation metrics for each object in metrics_to_recalculate"""
for counter, recalculation_data in enumerate(metrics_to_recalculate):
# start immediately if recalculation starting has been forced
countdown = 0 if recalculation_data.get("force") else counter
calculate_and_cache_metrics.apply_async(kwargs=recalculation_data, countdown=countdown)
@shared_dedicated_queue_retry_task(
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
)
def calculate_and_cache_metrics(organization_id, force=False):
"""
Calculate metrics for organization.
Before calculating, check whether a calculation has already been started, to avoid launching it too frequently or in parallel
"""
AlertGroup = apps.get_model("alerts", "AlertGroup")
AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel")
ONE_HOUR = 3600
TWO_HOURS = 7200
recalculate_timeout = get_metrics_recalculation_timeout()
# check if recalculation has been already started
metrics_cache_timer_key = get_metrics_cache_timer_key(organization_id)
metrics_cache_timer = cache.get(metrics_cache_timer_key)
if metrics_cache_timer:
if not force or metrics_cache_timer.get("forced_started", False):
return
else:
metrics_cache_timer["forced_started"] = True
else:
metrics_cache_timer: RecalculateMetricsTimer = {
"recalculate_timeout": recalculate_timeout,
"forced_started": force,
}
metrics_cache_timer["recalculate_timeout"] = recalculate_timeout
cache.set(metrics_cache_timer_key, metrics_cache_timer, timeout=recalculate_timeout)
integrations = (
AlertReceiveChannel.objects.using(get_random_readonly_database_key_if_present_otherwise_default())
.filter(
~Q(integration=AlertReceiveChannel.INTEGRATION_MAINTENANCE)
& Q(organization__deleted_at__isnull=True)
& Q(organization_id=organization_id)
)
.select_related("organization", "team")
.prefetch_related("alert_groups")
)
response_time_period = get_response_time_period()
metric_alert_group_total: typing.Dict[int, AlertGroupsTotalMetricsDict] = {}
metric_alert_group_response_time: typing.Dict[int, AlertGroupsResponseTimeMetricsDict] = {}
states = {
AlertGroupState.FIRING.value: AlertGroup.get_new_state_filter(),
AlertGroupState.SILENCED.value: AlertGroup.get_silenced_state_filter(),
AlertGroupState.ACKNOWLEDGED.value: AlertGroup.get_acknowledged_state_filter(),
AlertGroupState.RESOLVED.value: AlertGroup.get_resolved_state_filter(),
}
for integration in integrations:
instance_slug = integration.organization.stack_slug
instance_id = integration.organization.stack_id
# calculate states
for state, alert_group_filter in states.items():
metric_alert_group_total.setdefault(
integration.id,
{
"integration_name": integration.emojized_verbal_name,
"team_name": integration.team_name,
"team_id": integration.team_id_or_no_team,
"org_id": integration.organization.org_id,
"slug": instance_slug,
"id": instance_id,
},
)[state] = integration.alert_groups.filter(alert_group_filter).count()
# calculate response time
all_response_time = []
alert_groups = integration.alert_groups.filter(started_at__gte=response_time_period)
for alert_group in alert_groups:
if alert_group.response_time:
all_response_time.append(int(alert_group.response_time.total_seconds()))
elif alert_group.state != AlertGroupState.FIRING:
# get calculated value from current alert group information
response_time = alert_group._get_response_time()
if response_time:
all_response_time.append(int(response_time.total_seconds()))
metric_alert_group_response_time[integration.id] = {
"integration_name": integration.emojized_verbal_name,
"team_name": integration.team_name,
"team_id": integration.team_id_or_no_team,
"org_id": integration.organization.org_id,
"slug": instance_slug,
"id": instance_id,
"response_time": all_response_time,
}
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id)
metrics_cache_timeout = recalculate_timeout + TWO_HOURS
cache.set(metric_alert_groups_total_key, metric_alert_group_total, timeout=metrics_cache_timeout)
cache.set(metric_alert_groups_response_time_key, metric_alert_group_response_time, timeout=metrics_cache_timeout)
if metrics_cache_timer["forced_started"]:
metrics_cache_timer["forced_started"] = False
cache.set(metrics_cache_timer_key, metrics_cache_timer, timeout=recalculate_timeout - ONE_HOUR)

View file

@ -0,0 +1,100 @@
import pytest
from django.core.cache import cache
from apps.metrics_exporter.constants import ALERT_GROUPS_RESPONSE_TIME, ALERT_GROUPS_TOTAL
from apps.metrics_exporter.helpers import get_metric_alert_groups_response_time_key, get_metric_alert_groups_total_key
METRICS_TEST_INTEGRATION_NAME = "Test integration"
METRICS_TEST_ORG_ID = 123 # random number
METRICS_TEST_INSTANCE_SLUG = "test_instance"
METRICS_TEST_INSTANCE_ID = 292 # random number
@pytest.fixture()
def mock_cache_get_metrics_for_collector(monkeypatch):
def _mock_cache_get(key, *args, **kwargs):
if key.startswith(ALERT_GROUPS_TOTAL):
key = ALERT_GROUPS_TOTAL
elif key.startswith(ALERT_GROUPS_RESPONSE_TIME):
key = ALERT_GROUPS_RESPONSE_TIME
test_metrics = {
ALERT_GROUPS_TOTAL: {
1: {
"integration_name": "Test metrics integration",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"firing": 2,
"acknowledged": 3,
"silenced": 4,
"resolved": 5,
}
},
ALERT_GROUPS_RESPONSE_TIME: {
1: {
"integration_name": "Test metrics integration",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"response_time": [2, 10, 200, 650],
}
},
}
return test_metrics.get(key)
monkeypatch.setattr(cache, "get", _mock_cache_get)
@pytest.fixture()
def mock_get_metrics_cache(monkeypatch):
def _mock_cache_get(key, *args, **kwargs):
return {}
monkeypatch.setattr(cache, "get", _mock_cache_get)
@pytest.fixture
def make_metrics_cache_params(monkeypatch):
def _make_cache_params(integration_id, organization_id, team_name=None, team_id=None):
team_name = team_name or "No team"
team_id = team_id or "no_team"
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id)
def cache_get(key, *args, **kwargs):
metrics_data = {
metric_alert_groups_response_time_key: {
integration_id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": team_name,
"team_id": team_id,
"org_id": METRICS_TEST_ORG_ID,
"slug": METRICS_TEST_INSTANCE_SLUG,
"id": METRICS_TEST_INSTANCE_ID,
"response_time": [],
}
},
metric_alert_groups_total_key: {
integration_id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": team_name,
"team_id": team_id,
"org_id": METRICS_TEST_ORG_ID,
"slug": METRICS_TEST_INSTANCE_SLUG,
"id": METRICS_TEST_INSTANCE_ID,
"firing": 0,
"acknowledged": 0,
"silenced": 0,
"resolved": 0,
}
},
}
return metrics_data.get(key, {})
return cache_get
return _make_cache_params

View file

@ -0,0 +1,115 @@
from unittest.mock import patch
import pytest
from apps.metrics_exporter.helpers import (
get_metric_alert_groups_response_time_key,
get_metric_alert_groups_total_key,
get_metrics_cache_timer_key,
)
from apps.metrics_exporter.tasks import calculate_and_cache_metrics
@patch("apps.alerts.models.alert_group.MetricsCacheManager.metrics_update_state_cache_for_alert_group")
@pytest.mark.django_db
def test_calculate_and_cache_metrics_task(
mocked_update_state_cache,
make_organization,
make_user_for_organization,
make_team,
make_alert_receive_channel,
make_alert_group,
make_alert,
):
METRICS_RESPONSE_TIME_LEN = 3 # 1 for each alert group with changed state (acked, resolved, silenced)
organization = make_organization()
team = make_team(organization)
alert_receive_channel_1 = make_alert_receive_channel(organization)
alert_receive_channel_2 = make_alert_receive_channel(organization, team=team)
for alert_receive_channel in [alert_receive_channel_1, alert_receive_channel_2]:
for _ in range(2):
alert_group = make_alert_group(alert_receive_channel)
make_alert(alert_group=alert_group, raw_request_data={})
alert_group_to_ack = make_alert_group(alert_receive_channel)
make_alert(alert_group=alert_group_to_ack, raw_request_data={})
alert_group_to_ack.acknowledge()
alert_group_to_res = make_alert_group(alert_receive_channel)
make_alert(alert_group=alert_group_to_res, raw_request_data={})
alert_group_to_res.resolve()
alert_group_to_sil = make_alert_group(alert_receive_channel)
make_alert(alert_group=alert_group_to_sil, raw_request_data={})
alert_group_to_sil.silence()
metrics_cache_timer_key = get_metrics_cache_timer_key(organization.id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization.id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization.id)
expected_result_metric_alert_groups_total = {
alert_receive_channel_1.id: {
"integration_name": alert_receive_channel_1.verbal_name,
"team_name": "No team",
"team_id": "no_team",
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"firing": 2,
"silenced": 1,
"acknowledged": 1,
"resolved": 1,
},
alert_receive_channel_2.id: {
"integration_name": alert_receive_channel_2.verbal_name,
"team_name": team.name,
"team_id": team.id,
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"firing": 2,
"silenced": 1,
"acknowledged": 1,
"resolved": 1,
},
}
expected_result_metric_alert_groups_response_time = {
alert_receive_channel_1.id: {
"integration_name": alert_receive_channel_1.verbal_name,
"team_name": "No team",
"team_id": "no_team",
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"response_time": [],
},
alert_receive_channel_2.id: {
"integration_name": alert_receive_channel_2.verbal_name,
"team_name": team.name,
"team_id": team.id,
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"response_time": [],
},
}
with patch("apps.metrics_exporter.tasks.cache.set") as mock_cache_set:
calculate_and_cache_metrics(organization.id)
args = mock_cache_set.call_args_list
assert args[0].args[0] == metrics_cache_timer_key
# check alert_groups_total metric cache
metric_alert_groups_total_values = args[1].args
assert metric_alert_groups_total_values[0] == metric_alert_groups_total_key
assert metric_alert_groups_total_values[1] == expected_result_metric_alert_groups_total
# check alert_groups_response_time metric cache
metric_alert_groups_response_time_values = args[2].args
assert metric_alert_groups_response_time_values[0] == metric_alert_groups_response_time_key
for integration_id, values in metric_alert_groups_response_time_values[1].items():
assert len(values["response_time"]) == METRICS_RESPONSE_TIME_LEN
# copy the computed response_time values into the expected result, since they are calculated on the fly
expected_result_metric_alert_groups_response_time[integration_id]["response_time"] = values["response_time"]
assert metric_alert_groups_response_time_values[1] == expected_result_metric_alert_groups_response_time

View file

@ -0,0 +1,33 @@
from unittest.mock import patch
import pytest
from prometheus_client import CollectorRegistry, generate_latest
from apps.alerts.constants import AlertGroupState
from apps.metrics_exporter.constants import ALERT_GROUPS_RESPONSE_TIME, ALERT_GROUPS_TOTAL
from apps.metrics_exporter.metrics_collectors import ApplicationMetricsCollector
@patch("apps.metrics_exporter.metrics_collectors.get_organization_ids", return_value=[1])
@patch("apps.metrics_exporter.metrics_collectors.start_calculate_and_cache_metrics.apply_async")
@pytest.mark.django_db
def test_application_metrics_collector(
mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_metrics_for_collector
):
"""Test that ApplicationMetricsCollector generates expected metrics from cache"""
collector = ApplicationMetricsCollector()
test_metrics_registry = CollectorRegistry()
test_metrics_registry.register(collector)
for metric in test_metrics_registry.collect():
if metric.name == ALERT_GROUPS_TOTAL:
# integration with labels for each alert group state
assert len(metric.samples) == len(AlertGroupState)
elif metric.name == ALERT_GROUPS_RESPONSE_TIME:
# one sample per histogram bucket, plus the _count and _sum samples
assert len(metric.samples) == len(collector._buckets) + 2
result = generate_latest(test_metrics_registry).decode("utf-8")
assert result is not None
assert mocked_org_ids.called
# Since there is no recalculation timer for the test org in cache, start_calculate_and_cache_metrics must be called
assert mocked_start_calculate_and_cache_metrics.called
test_metrics_registry.unregister(collector)
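For orientation, a minimal sketch of the custom-collector pattern exercised above, assuming (as the mocked cache suggests) that the real ApplicationMetricsCollector turns cached per-integration dicts into Prometheus metric families; the metric name, labels, and values below are illustrative, not the exporter's exact ones.

from prometheus_client.core import GaugeMetricFamily


class SketchCollector:
    """Illustrative custom collector; not the implementation shipped in this PR."""

    def collect(self):
        gauge = GaugeMetricFamily(
            "oncall_alert_groups_total",
            "Alert groups per integration and state",
            labels=["integration", "state"],
        )
        # in the real collector these values come from the metrics cache
        cached = {"Test metrics integration": {"firing": 2, "acknowledged": 3, "silenced": 4, "resolved": 5}}
        for integration, states in cached.items():
            for state, value in states.items():
                # one sample per state, which is what the len(AlertGroupState) assertion above counts
                gauge.add_metric([integration, state], value)
        yield gauge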

View file

@ -0,0 +1,452 @@
from unittest.mock import patch
import pytest
from django.core.cache import cache
from django.test import override_settings
from apps.metrics_exporter.helpers import (
get_metric_alert_groups_response_time_key,
get_metric_alert_groups_total_key,
metrics_bulk_update_team_label_cache,
)
from apps.metrics_exporter.metrics_cache_manager import MetricsCacheManager
from apps.metrics_exporter.tests.conftest import (
METRICS_TEST_INSTANCE_ID,
METRICS_TEST_INSTANCE_SLUG,
METRICS_TEST_INTEGRATION_NAME,
METRICS_TEST_ORG_ID,
)
@patch("apps.alerts.models.alert_group_log_record.send_update_log_report_signal.apply_async")
@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send")
@pytest.mark.django_db
@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
def test_update_metric_alert_groups_total_cache_on_action(
mocked_send_log_signal,
mocked_action_signal_send,
make_organization,
make_user_for_organization,
make_alert_receive_channel,
make_alert_group,
make_alert,
make_metrics_cache_params,
monkeypatch,
):
organization = make_organization(
org_id=METRICS_TEST_ORG_ID,
stack_slug=METRICS_TEST_INSTANCE_SLUG,
stack_id=METRICS_TEST_INSTANCE_ID,
)
user = make_user_for_organization(organization)
alert_receive_channel = make_alert_receive_channel(organization, verbal_name=METRICS_TEST_INTEGRATION_NAME)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization.id)
expected_result_metric_alert_groups_total = {
alert_receive_channel.id: {
"integration_name": alert_receive_channel.verbal_name,
"team_name": "No team",
"team_id": "no_team",
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"firing": 0,
"silenced": 0,
"acknowledged": 0,
"resolved": 0,
}
}
expected_result_firing = {
"firing": 1,
"silenced": 0,
"acknowledged": 0,
"resolved": 0,
}
expected_result_acked = {
"firing": 0,
"silenced": 0,
"acknowledged": 1,
"resolved": 0,
}
expected_result_resolved = {
"firing": 0,
"silenced": 0,
"acknowledged": 0,
"resolved": 1,
}
expected_result_silenced = {
"firing": 0,
"silenced": 1,
"acknowledged": 0,
"resolved": 0,
}
metrics_cache = make_metrics_cache_params(alert_receive_channel.id, organization.id)
monkeypatch.setattr(cache, "get", metrics_cache)
def get_called_arg_index_and_compare_results(update_expected_result):
"""find index for the metric argument, that was set in cache"""
for idx, called_arg in enumerate(mock_cache_set_called_args):
if idx >= arg_idx and called_arg.args[0] == metric_alert_groups_total_key:
expected_result_metric_alert_groups_total[alert_receive_channel.id].update(update_expected_result)
assert called_arg.args[1] == expected_result_metric_alert_groups_total
return idx + 1
raise AssertionError
with patch("apps.metrics_exporter.tasks.cache.set") as mock_cache_set:
arg_idx = 0
alert_group = make_alert_group(alert_receive_channel)
make_alert(alert_group=alert_group, raw_request_data={})
# check alert_groups_total metric cache, get called args
mock_cache_set_called_args = mock_cache_set.call_args_list
arg_idx = get_called_arg_index_and_compare_results(expected_result_firing)
alert_group.acknowledge_by_user(user)
arg_idx = get_called_arg_index_and_compare_results(expected_result_acked)
alert_group.un_acknowledge_by_user(user)
arg_idx = get_called_arg_index_and_compare_results(expected_result_firing)
alert_group.resolve_by_user(user)
arg_idx = get_called_arg_index_and_compare_results(expected_result_resolved)
alert_group.un_resolve_by_user(user)
arg_idx = get_called_arg_index_and_compare_results(expected_result_firing)
alert_group.silence_by_user(user, silence_delay=None)
arg_idx = get_called_arg_index_and_compare_results(expected_result_silenced)
alert_group.un_silence_by_user(user)
get_called_arg_index_and_compare_results(expected_result_firing)
@patch("apps.alerts.models.alert_group_log_record.send_update_log_report_signal.apply_async")
@patch("apps.alerts.models.alert_group.alert_group_action_triggered_signal.send")
@pytest.mark.django_db
@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
def test_update_metric_alert_groups_response_time_cache_on_action(
mocked_send_log_signal,
mocked_action_signal_send,
make_organization,
make_user_for_organization,
make_alert_receive_channel,
make_alert_group,
make_alert,
monkeypatch,
make_metrics_cache_params,
):
organization = make_organization(
org_id=METRICS_TEST_ORG_ID,
stack_slug=METRICS_TEST_INSTANCE_SLUG,
stack_id=METRICS_TEST_INSTANCE_ID,
)
user = make_user_for_organization(organization)
alert_receive_channel = make_alert_receive_channel(organization, verbal_name=METRICS_TEST_INTEGRATION_NAME)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization.id)
expected_result_metric_alert_groups_response_time = {
alert_receive_channel.id: {
"integration_name": alert_receive_channel.verbal_name,
"team_name": "No team",
"team_id": "no_team",
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"response_time": [],
}
}
metrics_cache = make_metrics_cache_params(alert_receive_channel.id, organization.id)
monkeypatch.setattr(cache, "get", metrics_cache)
def get_called_arg_index_and_compare_results():
"""find index for related to the metric argument, that was set in cache"""
for idx, called_arg in enumerate(mock_cache_set_called_args):
if idx >= arg_idx and called_arg.args[0] == metric_alert_groups_response_time_key:
response_time_values = called_arg.args[1][alert_receive_channel.id]["response_time"]
expected_result_metric_alert_groups_response_time[alert_receive_channel.id].update(
{"response_time": response_time_values}
)
# the response_time list always has exactly one value here, because the cache is mocked and reset on every call
assert len(response_time_values) == 1
assert called_arg.args[1] == expected_result_metric_alert_groups_response_time
return idx + 1
raise AssertionError
def assert_cache_was_not_changed_by_response_time_metric():
for idx, called_arg in enumerate(mock_cache_set_called_args):
if idx >= arg_idx and called_arg.args[0] == metric_alert_groups_response_time_key:
raise AssertionError
with patch("apps.metrics_exporter.tasks.cache.set") as mock_cache_set:
arg_idx = 0
alert_group_1, alert_group_2, alert_group_3 = [make_alert_group(alert_receive_channel) for _ in range(3)]
for alert_group in [alert_group_1, alert_group_2, alert_group_3]:
make_alert(alert_group=alert_group, raw_request_data={})
# check alert_groups_response_time metric cache, get called args
mock_cache_set_called_args = mock_cache_set.call_args_list
# alert_groups_response_time cache shouldn't be updated on alert group creation
assert_cache_was_not_changed_by_response_time_metric()
alert_group_1.acknowledge_by_user(user)
arg_idx = get_called_arg_index_and_compare_results()
# assert that only the first action counts
alert_group_1.un_acknowledge_by_user(user)
assert_cache_was_not_changed_by_response_time_metric()
alert_group_1.resolve_by_user(user)
assert_cache_was_not_changed_by_response_time_metric()
alert_group_1.un_resolve_by_user(user)
assert_cache_was_not_changed_by_response_time_metric()
alert_group_1.silence_by_user(user, silence_delay=None)
assert_cache_was_not_changed_by_response_time_metric()
alert_group_1.un_silence_by_user(user)
assert_cache_was_not_changed_by_response_time_metric()
# check that response_time cache updates on other actions with other alert groups
alert_group_2.resolve_by_user(user)
arg_idx = get_called_arg_index_and_compare_results()
alert_group_3.silence_by_user(user, silence_delay=None)
get_called_arg_index_and_compare_results()
@pytest.mark.django_db
def test_update_metrics_cache_on_update_integration(
make_organization,
make_user_for_organization,
make_alert_receive_channel_with_post_save_signal,
make_team,
make_metrics_cache_params,
monkeypatch,
mock_get_metrics_cache,
):
organization = make_organization(
org_id=METRICS_TEST_ORG_ID,
stack_slug=METRICS_TEST_INSTANCE_SLUG,
stack_id=METRICS_TEST_INSTANCE_ID,
)
team = make_team(organization)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization.id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization.id)
expected_result_updated_team = {
"team_name": team.name,
"team_id": team.id,
}
expected_result_updated_name = {"integration_name": "Renamed test integration"}
def get_called_arg_index_and_compare_results():
"""find index for related to the metric argument, that was set in cache"""
is_set_metric_alert_groups_total_cache = False
is_set_metric_alert_groups_response_time_cache = False
for idx, called_arg in enumerate(mock_cache_set_called_args):
if idx >= arg_idx and called_arg.args[0] == metric_alert_groups_total_key:
assert called_arg.args[1] == expected_result_metric_alert_groups_total
is_set_metric_alert_groups_total_cache = True
elif idx >= arg_idx and called_arg.args[0] == metric_alert_groups_response_time_key:
assert called_arg.args[1] == expected_result_metric_alert_groups_response_time
is_set_metric_alert_groups_response_time_cache = True
if is_set_metric_alert_groups_total_cache and is_set_metric_alert_groups_response_time_cache:
return idx + 1
raise AssertionError
with patch("apps.metrics_exporter.tasks.cache.set") as mock_cache_set:
arg_idx = 0
# check cache update on integration creation
alert_receive_channel = make_alert_receive_channel_with_post_save_signal(
organization, verbal_name=METRICS_TEST_INTEGRATION_NAME
)
expected_result_metric_alert_groups_total = {
alert_receive_channel.id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": "No team",
"team_id": "no_team",
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"firing": 0,
"silenced": 0,
"acknowledged": 0,
"resolved": 0,
}
}
expected_result_metric_alert_groups_response_time = {
alert_receive_channel.id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": "No team",
"team_id": "no_team",
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"response_time": [],
}
}
mock_cache_set_called_args = mock_cache_set.call_args_list
arg_idx = get_called_arg_index_and_compare_results()
metrics_cache = make_metrics_cache_params(alert_receive_channel.id, organization.id)
monkeypatch.setattr(cache, "get", metrics_cache)
# check cache update when the integration's team changes
alert_receive_channel.team = team
# clear cached_property
del alert_receive_channel.team_name
del alert_receive_channel.team_id_or_no_team
alert_receive_channel.save()
for expected_result in [
expected_result_metric_alert_groups_total,
expected_result_metric_alert_groups_response_time,
]:
expected_result[alert_receive_channel.id].update(expected_result_updated_team)
arg_idx = get_called_arg_index_and_compare_results()
# check cache update when the integration's name changes
alert_receive_channel.refresh_from_db()
alert_receive_channel.verbal_name = expected_result_updated_name["integration_name"]
# clear cached_property
del alert_receive_channel.emojized_verbal_name
alert_receive_channel.save()
for expected_result in [
expected_result_metric_alert_groups_total,
expected_result_metric_alert_groups_response_time,
]:
expected_result[alert_receive_channel.id].update(expected_result_updated_name)
arg_idx = get_called_arg_index_and_compare_results()
# check cache update on integration deletion
alert_receive_channel.refresh_from_db()
alert_receive_channel.delete()
for expected_result in [
expected_result_metric_alert_groups_total,
expected_result_metric_alert_groups_response_time,
]:
expected_result.pop(alert_receive_channel.id)
get_called_arg_index_and_compare_results()
@pytest.mark.django_db
def test_update_metrics_cache_on_update_team(
make_organization,
make_user_for_organization,
make_alert_receive_channel,
make_team,
make_metrics_cache_params,
monkeypatch,
mock_get_metrics_cache,
):
organization = make_organization(
org_id=METRICS_TEST_ORG_ID,
stack_slug=METRICS_TEST_INSTANCE_SLUG,
stack_id=METRICS_TEST_INSTANCE_ID,
)
team = make_team(organization)
alert_receive_channel = make_alert_receive_channel(
organization, verbal_name=METRICS_TEST_INTEGRATION_NAME, team=team
)
metrics_cache = make_metrics_cache_params(alert_receive_channel.id, organization.id, team.name, team.id)
monkeypatch.setattr(cache, "get", metrics_cache)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization.id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization.id)
new_team_name = "Test team renamed"
expected_result_metric_alert_groups_total = {
alert_receive_channel.id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": new_team_name,
"team_id": team.id,
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"firing": 0,
"silenced": 0,
"acknowledged": 0,
"resolved": 0,
}
}
expected_result_metric_alert_groups_response_time = {
alert_receive_channel.id: {
"integration_name": METRICS_TEST_INTEGRATION_NAME,
"team_name": new_team_name,
"team_id": team.id,
"org_id": organization.org_id,
"slug": organization.stack_slug,
"id": organization.stack_id,
"response_time": [],
}
}
expected_result_delete_team = {
"team_name": "No team",
"team_id": "no_team",
}
def get_called_arg_index_and_compare_results():
"""find index for related to the metric argument, that was set in cache"""
is_set_metric_alert_groups_total_cache = False
is_set_metric_alert_groups_response_time_cache = False
for idx, called_arg in enumerate(mock_cache_set_called_args):
if idx >= arg_idx and called_arg.args[0] == metric_alert_groups_total_key:
assert called_arg.args[1] == expected_result_metric_alert_groups_total
is_set_metric_alert_groups_total_cache = True
elif idx >= arg_idx and called_arg.args[0] == metric_alert_groups_response_time_key:
assert called_arg.args[1] == expected_result_metric_alert_groups_response_time
is_set_metric_alert_groups_response_time_cache = True
if is_set_metric_alert_groups_total_cache and is_set_metric_alert_groups_response_time_cache:
return idx + 1
raise AssertionError
team.name = new_team_name
team.save()
with patch("apps.metrics_exporter.tasks.cache.set") as mock_cache_set:
arg_idx = 0
mock_cache_set_called_args = mock_cache_set.call_args_list
metrics_team_to_update = MetricsCacheManager.update_team_diff({}, team.id, new_name=new_team_name)
metrics_bulk_update_team_label_cache(metrics_team_to_update, organization.id)
arg_idx = get_called_arg_index_and_compare_results()
metrics_team_to_update = MetricsCacheManager.update_team_diff({}, team.id, deleted=True)
metrics_bulk_update_team_label_cache(metrics_team_to_update, organization.id)
for expected_result in [
expected_result_metric_alert_groups_total,
expected_result_metric_alert_groups_response_time,
]:
expected_result[alert_receive_channel.id].update(expected_result_delete_team)
get_called_arg_index_and_compare_results()

View file

@ -0,0 +1,7 @@
from django.urls import path
from .views import MetricsExporterView
urlpatterns = [
path("", MetricsExporterView.as_view()),
]
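How this URLConf gets mounted is not shown in this diff; a hedged sketch of the typical wiring in the project root urls.py, where the "metrics/" prefix is an assumption:

# hypothetical root URLConf snippet; the prefix is an assumption, not part of this PR
from django.urls import include, path

urlpatterns = [
    path("metrics/", include("apps.metrics_exporter.urls")),
]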

View file

@ -0,0 +1,11 @@
from django.http import HttpResponse
from prometheus_client import generate_latest
from rest_framework.views import APIView
from .metrics_collectors import application_metrics_registry
class MetricsExporterView(APIView):
def get(self, request):
result = generate_latest(application_metrics_registry).decode("utf-8")
return HttpResponse(result, content_type="text/plain; version=0.0.4; charset=utf-8")
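A hedged smoke check of the endpoint; the mount path and any auth requirements are assumptions, since neither is defined in this diff.

import pytest
from django.test import Client


@pytest.mark.django_db
def test_metrics_endpoint_smoke():
    # "/metrics/" is an assumed mount point; adjust to wherever the exporter URLConf is included
    response = Client().get("/metrics/")
    assert response.status_code == 200
    assert response["Content-Type"].startswith("text/plain")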

View file

@ -2,6 +2,8 @@ from django.conf import settings
from django.core.validators import MinLengthValidator
from django.db import models
from apps.metrics_exporter.helpers import metrics_bulk_update_team_label_cache
from apps.metrics_exporter.metrics_cache_manager import MetricsCacheManager
from common.public_primary_keys import generate_public_primary_key, increase_public_primary_key_length
@ -43,6 +45,13 @@ class TeamManager(models.Manager):
team_ids_to_delete = existing_team_ids - grafana_teams.keys()
organization.teams.filter(team_id__in=team_ids_to_delete).delete()
# collect team diffs to update the metrics cache
metrics_teams_to_update = {}
for team_id in team_ids_to_delete:
metrics_teams_to_update = MetricsCacheManager.update_team_diff(
metrics_teams_to_update, team_id, deleted=True
)
# update existing teams if any fields have changed
teams_to_update = []
for team in organization.teams.filter(team_id__in=existing_team_ids):
@ -52,12 +61,19 @@ class TeamManager(models.Manager):
or team.email != grafana_team["email"]
or team.avatar_url != grafana_team["avatarUrl"]
):
if team.name != grafana_team["name"]:
# collect team diffs to update the metrics cache
metrics_teams_to_update = MetricsCacheManager.update_team_diff(
metrics_teams_to_update, team.id, new_name=grafana_team["name"]
)
team.name = grafana_team["name"]
team.email = grafana_team["email"]
team.avatar_url = grafana_team["avatarUrl"]
teams_to_update.append(team)
organization.teams.bulk_update(teams_to_update, ["name", "email", "avatar_url"], batch_size=5000)
metrics_bulk_update_team_label_cache(metrics_teams_to_update, organization.id)
class Team(models.Model):
public_primary_key = models.CharField(

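To make the sync flow above easier to follow, a hedged sketch of the accumulate-then-flush pattern it uses (the same calls appear in the tests; the team ids and names below are illustrative):

# illustrative only: collect per-team label changes, then push them to the metrics cache in one go
metrics_teams_to_update = {}
metrics_teams_to_update = MetricsCacheManager.update_team_diff(metrics_teams_to_update, 7, new_name="Renamed team")
metrics_teams_to_update = MetricsCacheManager.update_team_diff(metrics_teams_to_update, 9, deleted=True)
metrics_bulk_update_team_label_cache(metrics_teams_to_update, organization.id)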
View file

@ -440,6 +440,17 @@ def make_alert_receive_channel():
return _make_alert_receive_channel
@pytest.fixture
def make_alert_receive_channel_with_post_save_signal():
def _make_alert_receive_channel(organization, **kwargs):
if "integration" not in kwargs:
kwargs["integration"] = AlertReceiveChannel.INTEGRATION_GRAFANA
alert_receive_channel = AlertReceiveChannelFactory(organization=organization, **kwargs)
return alert_receive_channel
return _make_alert_receive_channel
@pytest.fixture
def make_channel_filter():
def _make_channel_filter(alert_receive_channel, filtering_term=None, **kwargs):

View file

@ -55,3 +55,4 @@ django-deprecate-fields==0.1.1
pymdown-extensions==10.0
requests==2.31.0
urllib3==1.26.15
prometheus_client==0.16.0

View file

@ -220,6 +220,7 @@ INSTALLED_APPS = [
"apps.public_api",
"apps.grafana_plugin",
"apps.webhooks",
"apps.metrics_exporter",
"corsheaders",
"debug_toolbar",
"social_django",
@ -482,6 +483,10 @@ CELERY_BEAT_SCHEDULE = {
"conditionally_send_going_oncall_push_notifications_for_all_schedules": {
"task": "apps.mobile_app.tasks.conditionally_send_going_oncall_push_notifications_for_all_schedules",
"schedule": 10 * 60,
},
"save_organizations_ids_in_cache": {
"task": "apps.metrics_exporter.tasks.save_organizations_ids_in_cache",
"schedule": 60 * 30,
"args": (),
},
}

View file

@ -57,6 +57,8 @@ CELERY_TASK_ROUTES = {
"apps.heartbeat.tasks.integration_heartbeat_checkup": {"queue": "default"},
"apps.heartbeat.tasks.process_heartbeat_task": {"queue": "default"},
"apps.heartbeat.tasks.restore_heartbeat_tasks": {"queue": "default"},
"apps.metrics_exporter.tasks.start_calculate_and_cache_metrics": {"queue": "default"},
"apps.metrics_exporter.tasks.save_organizations_ids_in_cache": {"queue": "default"},
"apps.schedules.tasks.refresh_ical_files.refresh_ical_file": {"queue": "default"},
"apps.schedules.tasks.refresh_ical_files.start_refresh_ical_files": {"queue": "default"},
"apps.schedules.tasks.notify_about_gaps_in_schedule.check_empty_shifts_in_schedule": {"queue": "default"},
@ -116,6 +118,7 @@ CELERY_TASK_ROUTES = {
"apps.grafana_plugin.tasks.sync.start_cleanup_deleted_organizations": {"queue": "long"},
"apps.grafana_plugin.tasks.sync.start_sync_organizations": {"queue": "long"},
"apps.grafana_plugin.tasks.sync.sync_organization_async": {"queue": "long"},
"apps.metrics_exporter.tasks.calculate_and_cache_metrics": {"queue": "long"},
# SLACK
"apps.integrations.tasks.notify_about_integration_ratelimit_in_slack": {"queue": "slack"},
"apps.slack.helpers.alert_group_representative.on_alert_group_action_triggered_async": {"queue": "slack"},