oncall-engine/engine/apps/metrics_exporter/helpers.py
Yulya Artyukhina 4a8d0f3ad3
Fix metrics and dashboard (#2895)
# What this PR does

## Which issue(s) this PR fixes
https://github.com/grafana/oncall/issues/2897

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not
required)
2023-08-29 13:52:24 +00:00

297 lines
13 KiB
Python

import datetime
import random
import typing
from django.core.cache import cache
from django.utils import timezone
from apps.alerts.constants import AlertGroupState
from apps.metrics_exporter.constants import (
ALERT_GROUPS_RESPONSE_TIME,
ALERT_GROUPS_TOTAL,
METRICS_CACHE_LIFETIME,
METRICS_CACHE_TIMER,
METRICS_ORGANIZATIONS_IDS,
METRICS_ORGANIZATIONS_IDS_CACHE_TIMEOUT,
METRICS_RECALCULATION_CACHE_TIMEOUT,
METRICS_RECALCULATION_CACHE_TIMEOUT_DISPERSE,
METRICS_RESPONSE_TIME_CALCULATION_PERIOD,
USER_WAS_NOTIFIED_OF_ALERT_GROUPS,
AlertGroupsResponseTimeMetricsDict,
AlertGroupsTotalMetricsDict,
RecalculateMetricsTimer,
UserWasNotifiedOfAlertGroupsMetricsDict,
)
if typing.TYPE_CHECKING:
from apps.alerts.models import AlertReceiveChannel
def get_organization_ids_from_db():
from apps.alerts.models import AlertReceiveChannel
# get only not deleted organizations that have integrations
organizations_ids = (
AlertReceiveChannel.objects.filter(organization__deleted_at__isnull=True)
.values_list("organization_id", flat=True)
.distinct()
)
organizations_ids = list(organizations_ids)
return organizations_ids
def get_organization_ids():
"""Try to get organizations ids from cache, otherwise get from db and save values in cache"""
organizations_ids = cache.get(METRICS_ORGANIZATIONS_IDS, [])
if not organizations_ids:
organizations_ids = get_organization_ids_from_db()
cache.set(organizations_ids, METRICS_ORGANIZATIONS_IDS, METRICS_ORGANIZATIONS_IDS_CACHE_TIMEOUT)
return organizations_ids
def is_allowed_to_start_metrics_calculation(organization_id, force=False) -> bool:
"""Check if metrics_cache_timer doesn't exist or if recalculation was started by force."""
recalculate_timeout = get_metrics_recalculation_timeout()
metrics_cache_timer_key = get_metrics_cache_timer_key(organization_id)
metrics_cache_timer: typing.Optional[RecalculateMetricsTimer]
metrics_cache_timer = cache.get(metrics_cache_timer_key)
if metrics_cache_timer:
if not force or metrics_cache_timer.get("forced_started", False):
return False
else:
metrics_cache_timer["forced_started"] = True
else:
metrics_cache_timer = {
"recalculate_timeout": recalculate_timeout,
"forced_started": force,
}
metrics_cache_timer["recalculate_timeout"] = recalculate_timeout
cache.set(metrics_cache_timer_key, metrics_cache_timer, timeout=recalculate_timeout)
return True
def get_response_time_period() -> datetime.datetime:
"""Returns period for response time calculation"""
return timezone.now() - METRICS_RESPONSE_TIME_CALCULATION_PERIOD
def get_metrics_recalculation_timeout() -> int:
"""
Returns timeout when metrics should be recalculated.
Add some dispersion to avoid starting recalculation tasks for all organizations at the same time.
"""
return METRICS_RECALCULATION_CACHE_TIMEOUT + random.randint(*METRICS_RECALCULATION_CACHE_TIMEOUT_DISPERSE)
def get_metrics_cache_timeout(organization_id):
metrics_cache_timer_key = get_metrics_cache_timer_key(organization_id)
metrics_cache_timer = cache.get(metrics_cache_timer_key)
if metrics_cache_timer:
TWO_HOURS = 7200
metrics_cache_timeout = int(metrics_cache_timer.get("recalculate_timeout")) + TWO_HOURS
else:
metrics_cache_timeout = METRICS_CACHE_LIFETIME
return metrics_cache_timeout
def get_metrics_cache_timer_key(organization_id) -> str:
return f"{METRICS_CACHE_TIMER}_{organization_id}"
def get_metrics_cache_timer_for_organization(organization_id):
key = get_metrics_cache_timer_key(organization_id)
return cache.get(key)
def get_metric_alert_groups_total_key(organization_id) -> str:
return f"{ALERT_GROUPS_TOTAL}_{organization_id}"
def get_metric_alert_groups_response_time_key(organization_id) -> str:
return f"{ALERT_GROUPS_RESPONSE_TIME}_{organization_id}"
def get_metric_user_was_notified_of_alert_groups_key(organization_id) -> str:
return f"{USER_WAS_NOTIFIED_OF_ALERT_GROUPS}_{organization_id}"
def get_metric_calculation_started_key(metric_name) -> str:
return f"calculation_started_for_{metric_name}"
def metrics_update_integration_cache(integration: "AlertReceiveChannel") -> None:
"""Update integration data in metrics cache"""
metrics_cache_timeout = get_metrics_cache_timeout(integration.organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(integration.organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(integration.organization_id)
for metric_key in [metric_alert_groups_total_key, metric_alert_groups_response_time_key]:
metric_cache = cache.get(metric_key, {})
integration_metric_cache = metric_cache.get(integration.id)
if integration_metric_cache:
cache_updated = False
if integration_metric_cache["team_id"] != integration.team_id_or_no_team:
integration_metric_cache["team_id"] = integration.team_id_or_no_team
integration_metric_cache["team_name"] = integration.team_name
cache_updated = True
if integration_metric_cache["integration_name"] != integration.emojized_verbal_name:
integration_metric_cache["integration_name"] = integration.emojized_verbal_name
cache_updated = True
if cache_updated:
cache.set(metric_key, metric_cache, timeout=metrics_cache_timeout)
def metrics_remove_deleted_integration_from_cache(integration: "AlertReceiveChannel"):
"""Remove data related to deleted integration from metrics cache"""
metrics_cache_timeout = get_metrics_cache_timeout(integration.organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(integration.organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(integration.organization_id)
for metric_key in [metric_alert_groups_total_key, metric_alert_groups_response_time_key]:
metric_cache = cache.get(metric_key)
if metric_cache:
metric_cache.pop(integration.id, None)
cache.set(metric_key, metric_cache, timeout=metrics_cache_timeout)
def metrics_add_integration_to_cache(integration: "AlertReceiveChannel"):
"""Add new integration data to metrics cache"""
metrics_cache_timeout = get_metrics_cache_timeout(integration.organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(integration.organization_id)
instance_slug = integration.organization.stack_slug
instance_id = integration.organization.stack_id
grafana_org_id = integration.organization.org_id
metric_alert_groups_total: typing.Dict[int, AlertGroupsTotalMetricsDict] = cache.get(
metric_alert_groups_total_key, {}
)
metric_alert_groups_total.setdefault(
integration.id,
{
"integration_name": integration.emojized_verbal_name,
"team_name": integration.team_name,
"team_id": integration.team_id_or_no_team,
"org_id": grafana_org_id,
"slug": instance_slug,
"id": instance_id,
AlertGroupState.FIRING.value: 0,
AlertGroupState.ACKNOWLEDGED.value: 0,
AlertGroupState.RESOLVED.value: 0,
AlertGroupState.SILENCED.value: 0,
},
)
cache.set(metric_alert_groups_total_key, metric_alert_groups_total, timeout=metrics_cache_timeout)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(integration.organization_id)
metric_alert_groups_response_time: typing.Dict[int, AlertGroupsResponseTimeMetricsDict] = cache.get(
metric_alert_groups_response_time_key, {}
)
metric_alert_groups_response_time.setdefault(
integration.id,
{
"integration_name": integration.emojized_verbal_name,
"team_name": integration.team_name,
"team_id": integration.team_id_or_no_team,
"org_id": grafana_org_id,
"slug": instance_slug,
"id": instance_id,
"response_time": [],
},
)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)
def metrics_bulk_update_team_label_cache(teams_updated_data, organization_id):
"""Update team related data in metrics cache for each team in `teams_updated_data`"""
if not teams_updated_data:
return
metrics_cache_timeout = get_metrics_cache_timeout(organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id)
metric_alert_groups_total = cache.get(metric_alert_groups_total_key, {})
metric_alert_groups_response_time = cache.get(metric_alert_groups_response_time_key, {})
for team_id, team_data in teams_updated_data.items():
for integration_id in metric_alert_groups_total:
if metric_alert_groups_total[integration_id]["team_id"] == team_id:
integration_response_time_metrics = metric_alert_groups_response_time.get(integration_id)
if team_data["deleted"]:
metric_alert_groups_total[integration_id]["team_id"] = "no_team"
metric_alert_groups_total[integration_id]["team_name"] = "No team"
if integration_response_time_metrics:
integration_response_time_metrics["team_id"] = "no_team"
integration_response_time_metrics["team_name"] = "No team"
else:
metric_alert_groups_total[integration_id]["team_name"] = team_data["team_name"]
if integration_response_time_metrics:
integration_response_time_metrics["team_name"] = team_data["team_name"]
cache.set(metric_alert_groups_total_key, metric_alert_groups_total, timeout=metrics_cache_timeout)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)
def metrics_update_alert_groups_state_cache(states_diff, organization_id):
"""Update alert groups state metric cache for each integration in states_diff dict."""
if not states_diff:
return
metrics_cache_timeout = get_metrics_cache_timeout(organization_id)
metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id)
metric_alert_groups_total = cache.get(metric_alert_groups_total_key, {})
if not metric_alert_groups_total:
return
for integration_id, integration_states_diff in states_diff.items():
integration_alert_groups = metric_alert_groups_total.get(int(integration_id))
if not integration_alert_groups:
continue
for previous_state, counter in integration_states_diff["previous_states"].items():
if integration_alert_groups[previous_state] - counter > 0:
integration_alert_groups[previous_state] -= counter
else:
integration_alert_groups[previous_state] = 0
for new_state, counter in integration_states_diff["new_states"].items():
integration_alert_groups[new_state] += counter
cache.set(metric_alert_groups_total_key, metric_alert_groups_total, timeout=metrics_cache_timeout)
def metrics_update_alert_groups_response_time_cache(integrations_response_time, organization_id):
"""Update alert groups response time metric cache for each integration in `integrations_response_time` dict."""
if not integrations_response_time:
return
metrics_cache_timeout = get_metrics_cache_timeout(organization_id)
metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id)
metric_alert_groups_response_time = cache.get(metric_alert_groups_response_time_key, {})
if not metric_alert_groups_response_time:
return
for integration_id, integration_response_time in integrations_response_time.items():
integration_response_time_metrics = metric_alert_groups_response_time.get(int(integration_id))
if not integration_response_time_metrics:
continue
integration_response_time_metrics["response_time"].extend(integration_response_time)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)
def metrics_update_user_cache(user):
"""Update "user_was_notified_of_alert_groups" metric cache."""
metrics_cache_timeout = get_metrics_cache_timeout(user.organization_id)
metric_user_was_notified_key = get_metric_user_was_notified_of_alert_groups_key(user.organization_id)
metric_user_was_notified: typing.Dict[int, UserWasNotifiedOfAlertGroupsMetricsDict] = cache.get(
metric_user_was_notified_key, {}
)
metric_user_was_notified.setdefault(
user.id,
{
"user_username": user.username,
"org_id": user.organization.org_id,
"slug": user.organization.stack_slug,
"id": user.organization.stack_id,
"counter": 0,
},
)["counter"] += 1
cache.set(metric_user_was_notified_key, metric_user_was_notified, timeout=metrics_cache_timeout)