address issue with metrics calculations when redis cluster is used (#3510)
## Which issue(s) this PR fixes Fixes this issue we started seeing popping up because of a change introduced in #3496: ```python3 File "/etc/app/apps/metrics_exporter/views.py", line 22, in get result = generate_latest(application_metrics_registry).decode("utf-8") ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.11/site-packages/prometheus_client/exposition.py", line 198, in generate_latest for metric in registry.collect(): File "/usr/local/lib/python3.11/site-packages/prometheus_client/registry.py", line 97, in collect yield from collector.collect() File "/etc/app/apps/metrics_exporter/metrics_collectors.py", line 56, in collect alert_groups_total, missing_org_ids_1 = self._get_alert_groups_total_metric(org_ids) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/etc/app/apps/metrics_exporter/metrics_collectors.py", line 97, in _get_alert_groups_total_metric org_id_from_key = RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'NoneType' object has no attribute 'groups' ``` ```python3 >>> import re >>> ALERT_GROUPS_TOTAL = "oncall_alert_groups_total" >>> _RE_BASE_PATTERN = r"{{?{}}}?_(\d+)" >>> RE_ALERT_GROUPS_TOTAL = re.compile(_RE_BASE_PATTERN.format(ALERT_GROUPS_TOTAL)) >>> org_key = "{oncall_alert_groups_total}_1" >>> RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0] '1' >>> org_key = "oncall_alert_groups_total_1" >>> RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0] '1' ``` ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [ ] Documentation added (or `pr:no public docs` PR label added if not required) - [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required)
This commit is contained in:
parent
93fee4228d
commit
2bb80b487e
3 changed files with 36 additions and 26 deletions
|
|
@ -28,9 +28,13 @@ from apps.metrics_exporter.tasks import start_calculate_and_cache_metrics, start
|
|||
application_metrics_registry = CollectorRegistry()
|
||||
|
||||
|
||||
RE_ALERT_GROUPS_TOTAL = re.compile(r"{}_(\d+)".format(ALERT_GROUPS_TOTAL))
|
||||
RE_ALERT_GROUPS_RESPONSE_TIME = re.compile(r"{}_(\d+)".format(ALERT_GROUPS_RESPONSE_TIME))
|
||||
RE_USER_WAS_NOTIFIED_OF_ALERT_GROUPS = re.compile(r"{}_(\d+)".format(USER_WAS_NOTIFIED_OF_ALERT_GROUPS))
|
||||
# _RE_BASE_PATTERN allows for optional curly-brackets around the metric name as in some cases this may occur
|
||||
# see common.cache.ensure_cache_key_allocates_to_the_same_hash_slot for more details regarding this
|
||||
_RE_BASE_PATTERN = r"{{?{}}}?_(\d+)"
|
||||
|
||||
RE_ALERT_GROUPS_TOTAL = re.compile(_RE_BASE_PATTERN.format(ALERT_GROUPS_TOTAL))
|
||||
RE_ALERT_GROUPS_RESPONSE_TIME = re.compile(_RE_BASE_PATTERN.format(ALERT_GROUPS_RESPONSE_TIME))
|
||||
RE_USER_WAS_NOTIFIED_OF_ALERT_GROUPS = re.compile(_RE_BASE_PATTERN.format(USER_WAS_NOTIFIED_OF_ALERT_GROUPS))
|
||||
|
||||
|
||||
# https://github.com/prometheus/client_python#custom-collectors
|
||||
|
|
|
|||
|
|
@ -22,11 +22,11 @@ METRICS_TEST_USER_USERNAME = "Alex"
|
|||
@pytest.fixture()
|
||||
def mock_cache_get_metrics_for_collector(monkeypatch):
|
||||
def _mock_cache_get(key, *args, **kwargs):
|
||||
if key.startswith(ALERT_GROUPS_TOTAL):
|
||||
if ALERT_GROUPS_TOTAL in key:
|
||||
key = ALERT_GROUPS_TOTAL
|
||||
elif key.startswith(ALERT_GROUPS_RESPONSE_TIME):
|
||||
elif ALERT_GROUPS_RESPONSE_TIME in key:
|
||||
key = ALERT_GROUPS_RESPONSE_TIME
|
||||
elif key.startswith(USER_WAS_NOTIFIED_OF_ALERT_GROUPS):
|
||||
elif USER_WAS_NOTIFIED_OF_ALERT_GROUPS in key:
|
||||
key = USER_WAS_NOTIFIED_OF_ALERT_GROUPS
|
||||
test_metrics = {
|
||||
ALERT_GROUPS_TOTAL: {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from django.test import override_settings
|
||||
from prometheus_client import CollectorRegistry, generate_latest
|
||||
|
||||
from apps.alerts.constants import AlertGroupState
|
||||
|
|
@ -12,29 +13,34 @@ from apps.metrics_exporter.constants import (
|
|||
from apps.metrics_exporter.metrics_collectors import ApplicationMetricsCollector
|
||||
|
||||
|
||||
# redis cluster usage modifies the cache keys for some operations, so we need to test both cases
|
||||
# see common.cache.ensure_cache_key_allocates_to_the_same_hash_slot for more details
|
||||
@pytest.mark.parametrize("use_redis_cluster", [True, False])
|
||||
@patch("apps.metrics_exporter.metrics_collectors.get_organization_ids", return_value=[1])
|
||||
@patch("apps.metrics_exporter.metrics_collectors.start_calculate_and_cache_metrics.apply_async")
|
||||
@pytest.mark.django_db
|
||||
def test_application_metrics_collector(
|
||||
mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_metrics_for_collector
|
||||
mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_metrics_for_collector, use_redis_cluster
|
||||
):
|
||||
"""Test that ApplicationMetricsCollector generates expected metrics from cache"""
|
||||
collector = ApplicationMetricsCollector()
|
||||
test_metrics_registry = CollectorRegistry()
|
||||
test_metrics_registry.register(collector)
|
||||
for metric in test_metrics_registry.collect():
|
||||
if metric.name == ALERT_GROUPS_TOTAL:
|
||||
# integration with labels for each alert group state
|
||||
assert len(metric.samples) == len(AlertGroupState)
|
||||
elif metric.name == ALERT_GROUPS_RESPONSE_TIME:
|
||||
# integration with labels for each value in collector's bucket + _count and _sum histogram values
|
||||
assert len(metric.samples) == len(collector._buckets) + 2
|
||||
elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS:
|
||||
# metric with labels for each notified user
|
||||
assert len(metric.samples) == 1
|
||||
result = generate_latest(test_metrics_registry).decode("utf-8")
|
||||
assert result is not None
|
||||
assert mocked_org_ids.called
|
||||
# Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called
|
||||
assert mocked_start_calculate_and_cache_metrics.called
|
||||
test_metrics_registry.unregister(collector)
|
||||
|
||||
with override_settings(USE_REDIS_CLUSTER=use_redis_cluster):
|
||||
collector = ApplicationMetricsCollector()
|
||||
test_metrics_registry = CollectorRegistry()
|
||||
test_metrics_registry.register(collector)
|
||||
for metric in test_metrics_registry.collect():
|
||||
if metric.name == ALERT_GROUPS_TOTAL:
|
||||
# integration with labels for each alert group state
|
||||
assert len(metric.samples) == len(AlertGroupState)
|
||||
elif metric.name == ALERT_GROUPS_RESPONSE_TIME:
|
||||
# integration with labels for each value in collector's bucket + _count and _sum histogram values
|
||||
assert len(metric.samples) == len(collector._buckets) + 2
|
||||
elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS:
|
||||
# metric with labels for each notified user
|
||||
assert len(metric.samples) == 1
|
||||
result = generate_latest(test_metrics_registry).decode("utf-8")
|
||||
assert result is not None
|
||||
assert mocked_org_ids.called
|
||||
# Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called
|
||||
assert mocked_start_calculate_and_cache_metrics.called
|
||||
test_metrics_registry.unregister(collector)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue