From 2bb80b487e849782037c0d14867eac23ffb45ce8 Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Tue, 5 Dec 2023 12:12:08 -0500 Subject: [PATCH] address issue with metrics calculations when redis cluster is used (#3510) ## Which issue(s) this PR fixes Fixes this issue we started seeing popping up because of a change introduced in #3496: ```python3 File "/etc/app/apps/metrics_exporter/views.py", line 22, in get result = generate_latest(application_metrics_registry).decode("utf-8") ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.11/site-packages/prometheus_client/exposition.py", line 198, in generate_latest for metric in registry.collect(): File "/usr/local/lib/python3.11/site-packages/prometheus_client/registry.py", line 97, in collect yield from collector.collect() File "/etc/app/apps/metrics_exporter/metrics_collectors.py", line 56, in collect alert_groups_total, missing_org_ids_1 = self._get_alert_groups_total_metric(org_ids) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/etc/app/apps/metrics_exporter/metrics_collectors.py", line 97, in _get_alert_groups_total_metric org_id_from_key = RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'NoneType' object has no attribute 'groups' ``` ```python3 >>> import re >>> ALERT_GROUPS_TOTAL = "oncall_alert_groups_total" >>> _RE_BASE_PATTERN = r"{{?{}}}?_(\d+)" >>> RE_ALERT_GROUPS_TOTAL = re.compile(_RE_BASE_PATTERN.format(ALERT_GROUPS_TOTAL)) >>> org_key = "{oncall_alert_groups_total}_1" >>> RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0] '1' >>> org_key = "oncall_alert_groups_total_1" >>> RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0] '1' ``` ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [ ] Documentation added (or `pr:no public docs` PR label added if not required) - [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) --- .../metrics_exporter/metrics_collectors.py | 10 ++-- .../apps/metrics_exporter/tests/conftest.py | 6 +-- .../tests/test_metrics_collectors.py | 46 +++++++++++-------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/engine/apps/metrics_exporter/metrics_collectors.py b/engine/apps/metrics_exporter/metrics_collectors.py index 27d65b12..45ff38ed 100644 --- a/engine/apps/metrics_exporter/metrics_collectors.py +++ b/engine/apps/metrics_exporter/metrics_collectors.py @@ -28,9 +28,13 @@ from apps.metrics_exporter.tasks import start_calculate_and_cache_metrics, start application_metrics_registry = CollectorRegistry() -RE_ALERT_GROUPS_TOTAL = re.compile(r"{}_(\d+)".format(ALERT_GROUPS_TOTAL)) -RE_ALERT_GROUPS_RESPONSE_TIME = re.compile(r"{}_(\d+)".format(ALERT_GROUPS_RESPONSE_TIME)) -RE_USER_WAS_NOTIFIED_OF_ALERT_GROUPS = re.compile(r"{}_(\d+)".format(USER_WAS_NOTIFIED_OF_ALERT_GROUPS)) +# _RE_BASE_PATTERN allows for optional curly-brackets around the metric name as in some cases this may occur +# see common.cache.ensure_cache_key_allocates_to_the_same_hash_slot for more details regarding this +_RE_BASE_PATTERN = r"{{?{}}}?_(\d+)" + +RE_ALERT_GROUPS_TOTAL = re.compile(_RE_BASE_PATTERN.format(ALERT_GROUPS_TOTAL)) +RE_ALERT_GROUPS_RESPONSE_TIME = re.compile(_RE_BASE_PATTERN.format(ALERT_GROUPS_RESPONSE_TIME)) +RE_USER_WAS_NOTIFIED_OF_ALERT_GROUPS = re.compile(_RE_BASE_PATTERN.format(USER_WAS_NOTIFIED_OF_ALERT_GROUPS)) # https://github.com/prometheus/client_python#custom-collectors diff --git a/engine/apps/metrics_exporter/tests/conftest.py b/engine/apps/metrics_exporter/tests/conftest.py index f82ee211..f2868ee4 100644 --- a/engine/apps/metrics_exporter/tests/conftest.py +++ b/engine/apps/metrics_exporter/tests/conftest.py @@ -22,11 +22,11 @@ METRICS_TEST_USER_USERNAME = "Alex" @pytest.fixture() def mock_cache_get_metrics_for_collector(monkeypatch): def _mock_cache_get(key, *args, **kwargs): - if key.startswith(ALERT_GROUPS_TOTAL): + if ALERT_GROUPS_TOTAL in key: key = ALERT_GROUPS_TOTAL - elif key.startswith(ALERT_GROUPS_RESPONSE_TIME): + elif ALERT_GROUPS_RESPONSE_TIME in key: key = ALERT_GROUPS_RESPONSE_TIME - elif key.startswith(USER_WAS_NOTIFIED_OF_ALERT_GROUPS): + elif USER_WAS_NOTIFIED_OF_ALERT_GROUPS in key: key = USER_WAS_NOTIFIED_OF_ALERT_GROUPS test_metrics = { ALERT_GROUPS_TOTAL: { diff --git a/engine/apps/metrics_exporter/tests/test_metrics_collectors.py b/engine/apps/metrics_exporter/tests/test_metrics_collectors.py index 4dbe7d05..670c590b 100644 --- a/engine/apps/metrics_exporter/tests/test_metrics_collectors.py +++ b/engine/apps/metrics_exporter/tests/test_metrics_collectors.py @@ -1,6 +1,7 @@ from unittest.mock import patch import pytest +from django.test import override_settings from prometheus_client import CollectorRegistry, generate_latest from apps.alerts.constants import AlertGroupState @@ -12,29 +13,34 @@ from apps.metrics_exporter.constants import ( from apps.metrics_exporter.metrics_collectors import ApplicationMetricsCollector +# redis cluster usage modifies the cache keys for some operations, so we need to test both cases +# see common.cache.ensure_cache_key_allocates_to_the_same_hash_slot for more details +@pytest.mark.parametrize("use_redis_cluster", [True, False]) @patch("apps.metrics_exporter.metrics_collectors.get_organization_ids", return_value=[1]) @patch("apps.metrics_exporter.metrics_collectors.start_calculate_and_cache_metrics.apply_async") @pytest.mark.django_db def test_application_metrics_collector( - mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_metrics_for_collector + mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_metrics_for_collector, use_redis_cluster ): """Test that ApplicationMetricsCollector generates expected metrics from cache""" - collector = ApplicationMetricsCollector() - test_metrics_registry = CollectorRegistry() - test_metrics_registry.register(collector) - for metric in test_metrics_registry.collect(): - if metric.name == ALERT_GROUPS_TOTAL: - # integration with labels for each alert group state - assert len(metric.samples) == len(AlertGroupState) - elif metric.name == ALERT_GROUPS_RESPONSE_TIME: - # integration with labels for each value in collector's bucket + _count and _sum histogram values - assert len(metric.samples) == len(collector._buckets) + 2 - elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS: - # metric with labels for each notified user - assert len(metric.samples) == 1 - result = generate_latest(test_metrics_registry).decode("utf-8") - assert result is not None - assert mocked_org_ids.called - # Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called - assert mocked_start_calculate_and_cache_metrics.called - test_metrics_registry.unregister(collector) + + with override_settings(USE_REDIS_CLUSTER=use_redis_cluster): + collector = ApplicationMetricsCollector() + test_metrics_registry = CollectorRegistry() + test_metrics_registry.register(collector) + for metric in test_metrics_registry.collect(): + if metric.name == ALERT_GROUPS_TOTAL: + # integration with labels for each alert group state + assert len(metric.samples) == len(AlertGroupState) + elif metric.name == ALERT_GROUPS_RESPONSE_TIME: + # integration with labels for each value in collector's bucket + _count and _sum histogram values + assert len(metric.samples) == len(collector._buckets) + 2 + elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS: + # metric with labels for each notified user + assert len(metric.samples) == 1 + result = generate_latest(test_metrics_registry).decode("utf-8") + assert result is not None + assert mocked_org_ids.called + # Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called + assert mocked_start_calculate_and_cache_metrics.called + test_metrics_registry.unregister(collector)