address issue with metrics calculations when redis cluster is used (#3510)

## Which issue(s) this PR fixes

Fixes this issue we started seeing popping up because of a change
introduced in #3496:
```python3
File "/etc/app/apps/metrics_exporter/views.py", line 22, in get
    result = generate_latest(application_metrics_registry).decode("utf-8")
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/prometheus_client/exposition.py", line 198, in generate_latest
    for metric in registry.collect():
  File "/usr/local/lib/python3.11/site-packages/prometheus_client/registry.py", line 97, in collect
    yield from collector.collect()
  File "/etc/app/apps/metrics_exporter/metrics_collectors.py", line 56, in collect
    alert_groups_total, missing_org_ids_1 = self._get_alert_groups_total_metric(org_ids)
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/etc/app/apps/metrics_exporter/metrics_collectors.py", line 97, in _get_alert_groups_total_metric
    org_id_from_key = RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0]
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'groups'
```



```python3
>>> import re
>>> ALERT_GROUPS_TOTAL = "oncall_alert_groups_total"
>>> _RE_BASE_PATTERN = r"{{?{}}}?_(\d+)"
>>> RE_ALERT_GROUPS_TOTAL = re.compile(_RE_BASE_PATTERN.format(ALERT_GROUPS_TOTAL))
>>> org_key = "{oncall_alert_groups_total}_1"
>>> RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0]
'1'
>>> org_key = "oncall_alert_groups_total_1"
>>> RE_ALERT_GROUPS_TOTAL.match(org_key).groups()[0]
'1'
```

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [ ] Documentation added (or `pr:no public docs` PR label added if not
required)
- [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not
required)
This commit is contained in:
Joey Orlando 2023-12-05 12:12:08 -05:00 committed by GitHub
parent 93fee4228d
commit 2bb80b487e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 36 additions and 26 deletions

View file

@ -28,9 +28,13 @@ from apps.metrics_exporter.tasks import start_calculate_and_cache_metrics, start
application_metrics_registry = CollectorRegistry()
RE_ALERT_GROUPS_TOTAL = re.compile(r"{}_(\d+)".format(ALERT_GROUPS_TOTAL))
RE_ALERT_GROUPS_RESPONSE_TIME = re.compile(r"{}_(\d+)".format(ALERT_GROUPS_RESPONSE_TIME))
RE_USER_WAS_NOTIFIED_OF_ALERT_GROUPS = re.compile(r"{}_(\d+)".format(USER_WAS_NOTIFIED_OF_ALERT_GROUPS))
# _RE_BASE_PATTERN allows for optional curly-brackets around the metric name as in some cases this may occur
# see common.cache.ensure_cache_key_allocates_to_the_same_hash_slot for more details regarding this
_RE_BASE_PATTERN = r"{{?{}}}?_(\d+)"
RE_ALERT_GROUPS_TOTAL = re.compile(_RE_BASE_PATTERN.format(ALERT_GROUPS_TOTAL))
RE_ALERT_GROUPS_RESPONSE_TIME = re.compile(_RE_BASE_PATTERN.format(ALERT_GROUPS_RESPONSE_TIME))
RE_USER_WAS_NOTIFIED_OF_ALERT_GROUPS = re.compile(_RE_BASE_PATTERN.format(USER_WAS_NOTIFIED_OF_ALERT_GROUPS))
# https://github.com/prometheus/client_python#custom-collectors

View file

@ -22,11 +22,11 @@ METRICS_TEST_USER_USERNAME = "Alex"
@pytest.fixture()
def mock_cache_get_metrics_for_collector(monkeypatch):
def _mock_cache_get(key, *args, **kwargs):
if key.startswith(ALERT_GROUPS_TOTAL):
if ALERT_GROUPS_TOTAL in key:
key = ALERT_GROUPS_TOTAL
elif key.startswith(ALERT_GROUPS_RESPONSE_TIME):
elif ALERT_GROUPS_RESPONSE_TIME in key:
key = ALERT_GROUPS_RESPONSE_TIME
elif key.startswith(USER_WAS_NOTIFIED_OF_ALERT_GROUPS):
elif USER_WAS_NOTIFIED_OF_ALERT_GROUPS in key:
key = USER_WAS_NOTIFIED_OF_ALERT_GROUPS
test_metrics = {
ALERT_GROUPS_TOTAL: {

View file

@ -1,6 +1,7 @@
from unittest.mock import patch
import pytest
from django.test import override_settings
from prometheus_client import CollectorRegistry, generate_latest
from apps.alerts.constants import AlertGroupState
@ -12,29 +13,34 @@ from apps.metrics_exporter.constants import (
from apps.metrics_exporter.metrics_collectors import ApplicationMetricsCollector
# redis cluster usage modifies the cache keys for some operations, so we need to test both cases
# see common.cache.ensure_cache_key_allocates_to_the_same_hash_slot for more details
@pytest.mark.parametrize("use_redis_cluster", [True, False])
@patch("apps.metrics_exporter.metrics_collectors.get_organization_ids", return_value=[1])
@patch("apps.metrics_exporter.metrics_collectors.start_calculate_and_cache_metrics.apply_async")
@pytest.mark.django_db
def test_application_metrics_collector(
mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_metrics_for_collector
mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_metrics_for_collector, use_redis_cluster
):
"""Test that ApplicationMetricsCollector generates expected metrics from cache"""
collector = ApplicationMetricsCollector()
test_metrics_registry = CollectorRegistry()
test_metrics_registry.register(collector)
for metric in test_metrics_registry.collect():
if metric.name == ALERT_GROUPS_TOTAL:
# integration with labels for each alert group state
assert len(metric.samples) == len(AlertGroupState)
elif metric.name == ALERT_GROUPS_RESPONSE_TIME:
# integration with labels for each value in collector's bucket + _count and _sum histogram values
assert len(metric.samples) == len(collector._buckets) + 2
elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS:
# metric with labels for each notified user
assert len(metric.samples) == 1
result = generate_latest(test_metrics_registry).decode("utf-8")
assert result is not None
assert mocked_org_ids.called
# Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called
assert mocked_start_calculate_and_cache_metrics.called
test_metrics_registry.unregister(collector)
with override_settings(USE_REDIS_CLUSTER=use_redis_cluster):
collector = ApplicationMetricsCollector()
test_metrics_registry = CollectorRegistry()
test_metrics_registry.register(collector)
for metric in test_metrics_registry.collect():
if metric.name == ALERT_GROUPS_TOTAL:
# integration with labels for each alert group state
assert len(metric.samples) == len(AlertGroupState)
elif metric.name == ALERT_GROUPS_RESPONSE_TIME:
# integration with labels for each value in collector's bucket + _count and _sum histogram values
assert len(metric.samples) == len(collector._buckets) + 2
elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS:
# metric with labels for each notified user
assert len(metric.samples) == 1
result = generate_latest(test_metrics_registry).decode("utf-8")
assert result is not None
assert mocked_org_ids.called
# Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called
assert mocked_start_calculate_and_cache_metrics.called
test_metrics_registry.unregister(collector)