patch redis cluster multi-key operations (#3496)
# Which issue(s) this PR fixes Related to https://github.com/grafana/oncall-private/issues/2363 Addresses this issue that arises when using `cache.get_many`/`cache.set_many` operations with a Redis Cluster: ```python3 File "/usr/local/lib/python3.11/site-packages/redis/cluster.py", line 1006, in determine_slot raise RedisClusterException( redis.exceptions.RedisClusterException: MGET - all keys must map to the same key slot ``` From the Redis Cluster [docs](https://redis.io/docs/reference/cluster-spec/#hash-tags), this can be addressed with hash tags 👇. Basically this will ensure that keys in multi-key operations will resolve to the same hash slot (read: node): > Hash tags > There is an exception for the computation of the hash slot that is used in order to implement hash tags. Hash tags are a way to ensure that multiple keys are allocated in the same hash slot. This is used in order to implement multi-key operations in Redis Cluster. > > To implement hash tags, the hash slot for a key is computed in a slightly different way in certain conditions. If the key contains a "{...}" pattern only the substring between { and } is hashed in order to obtain the hash slot. However since it is possible that there are multiple occurrences of { or } the algorithm is well specified by the following rules: > > IF the key contains a { character. > AND IF there is a } character to the right of {. > AND IF there are one or more characters between the first occurrence of { and the first occurrence of }. > Then instead of hashing the key, only what is between the first occurrence of { and the following first occurrence of } is hashed. ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [ ] Documentation added (or `pr:no public docs` PR label added if not required) - [ ] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required)
This commit is contained in:
parent
9796489b8e
commit
1df1b1eaa0
7 changed files with 125 additions and 14 deletions
|
|
@ -22,6 +22,7 @@ from apps.metrics_exporter.constants import (
|
|||
RecalculateMetricsTimer,
|
||||
UserWasNotifiedOfAlertGroupsMetricsDict,
|
||||
)
|
||||
from common.cache import ensure_cache_key_allocates_to_the_same_hash_slot
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from apps.alerts.models import AlertReceiveChannel
|
||||
|
|
@ -98,24 +99,27 @@ def get_metrics_cache_timeout(organization_id):
|
|||
|
||||
|
||||
def get_metrics_cache_timer_key(organization_id) -> str:
|
||||
return f"{METRICS_CACHE_TIMER}_{organization_id}"
|
||||
|
||||
|
||||
def get_metrics_cache_timer_for_organization(organization_id):
|
||||
key = get_metrics_cache_timer_key(organization_id)
|
||||
return cache.get(key)
|
||||
return ensure_cache_key_allocates_to_the_same_hash_slot(
|
||||
f"{METRICS_CACHE_TIMER}_{organization_id}", METRICS_CACHE_TIMER
|
||||
)
|
||||
|
||||
|
||||
def get_metric_alert_groups_total_key(organization_id) -> str:
|
||||
return f"{ALERT_GROUPS_TOTAL}_{organization_id}"
|
||||
return ensure_cache_key_allocates_to_the_same_hash_slot(
|
||||
f"{ALERT_GROUPS_TOTAL}_{organization_id}", ALERT_GROUPS_TOTAL
|
||||
)
|
||||
|
||||
|
||||
def get_metric_alert_groups_response_time_key(organization_id) -> str:
|
||||
return f"{ALERT_GROUPS_RESPONSE_TIME}_{organization_id}"
|
||||
return ensure_cache_key_allocates_to_the_same_hash_slot(
|
||||
f"{ALERT_GROUPS_RESPONSE_TIME}_{organization_id}", ALERT_GROUPS_RESPONSE_TIME
|
||||
)
|
||||
|
||||
|
||||
def get_metric_user_was_notified_of_alert_groups_key(organization_id) -> str:
|
||||
return f"{USER_WAS_NOTIFIED_OF_ALERT_GROUPS}_{organization_id}"
|
||||
return ensure_cache_key_allocates_to_the_same_hash_slot(
|
||||
f"{USER_WAS_NOTIFIED_OF_ALERT_GROUPS}_{organization_id}", USER_WAS_NOTIFIED_OF_ALERT_GROUPS
|
||||
)
|
||||
|
||||
|
||||
def get_metric_calculation_started_key(metric_name) -> str:
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ from apps.mobile_app.types import FCMMessageData, MessageType, Platform
|
|||
from apps.mobile_app.utils import MAX_RETRIES, construct_fcm_message, send_push_notification
|
||||
from apps.schedules.models.on_call_schedule import OnCallSchedule, ScheduleEvent
|
||||
from apps.user_management.models import User
|
||||
from common.cache import ensure_cache_key_allocates_to_the_same_hash_slot
|
||||
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
|
||||
from common.l10n import format_localized_datetime, format_localized_time
|
||||
|
||||
|
|
@ -164,7 +165,10 @@ def _should_we_send_push_notification(
|
|||
|
||||
|
||||
def _generate_cache_key(user_pk: str, schedule_event: ScheduleEvent) -> str:
|
||||
return f"going_oncall_push_notification:{user_pk}:{schedule_event['shift']['pk']}"
|
||||
KEY_PREFIX = "going_oncall_push_notification"
|
||||
return ensure_cache_key_allocates_to_the_same_hash_slot(
|
||||
f"{KEY_PREFIX}:{user_pk}:{schedule_event['shift']['pk']}", KEY_PREFIX
|
||||
)
|
||||
|
||||
|
||||
@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES)
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ from apps.schedules.constants import (
|
|||
RE_PRIORITY,
|
||||
)
|
||||
from apps.schedules.ical_events import ical_events
|
||||
from common.cache import ensure_cache_key_allocates_to_the_same_hash_slot
|
||||
from common.timezones import is_valid_timezone
|
||||
from common.utils import timed_lru_cache
|
||||
|
||||
|
|
@ -403,15 +404,24 @@ def get_cached_oncall_users_for_multiple_schedules(schedules: typing.List["OnCal
|
|||
from apps.schedules.models import OnCallSchedule
|
||||
from apps.user_management.models import User
|
||||
|
||||
CACHE_KEY_PREFIX = "schedule_oncall_users_"
|
||||
|
||||
def _generate_cache_key_for_schedule_oncall_users(schedule: "OnCallSchedule") -> str:
|
||||
return f"schedule_{schedule.public_primary_key}_oncall_users"
|
||||
return ensure_cache_key_allocates_to_the_same_hash_slot(
|
||||
f"{CACHE_KEY_PREFIX}{schedule.public_primary_key}", CACHE_KEY_PREFIX
|
||||
)
|
||||
|
||||
def _get_schedule_public_primary_key_from_schedule_oncall_users_cache_key(cache_key: str) -> str:
|
||||
return cache_key.replace("schedule_", "").replace("_oncall_users", "")
|
||||
"""
|
||||
remove any brackets that might be included in the cache key (when redis cluster is active).
|
||||
See `_generate_cache_key_for_schedule_oncall_users` just above
|
||||
"""
|
||||
cache_key = cache_key.replace("{", "").replace("}", "")
|
||||
return cache_key.replace(CACHE_KEY_PREFIX, "")
|
||||
|
||||
CACHE_TTL = 15 * 60 # 15 minutes in seconds
|
||||
|
||||
cache_keys: typing.List[str] = [_generate_cache_key_for_schedule_oncall_users(schedule) for schedule in schedules]
|
||||
cache_keys = [_generate_cache_key_for_schedule_oncall_users(schedule) for schedule in schedules]
|
||||
|
||||
# get_many returns a dictionary with all the keys we asked for that actually exist
|
||||
# in the cache (and haven’t expired)
|
||||
|
|
|
|||
|
|
@ -584,7 +584,7 @@ def test_get_cached_oncall_users_for_multiple_schedules(
|
|||
return users, (schedule1, schedule2, schedule3)
|
||||
|
||||
def _generate_cache_key(schedule):
|
||||
return f"schedule_{schedule.public_primary_key}_oncall_users"
|
||||
return f"schedule_oncall_users_{schedule.public_primary_key}"
|
||||
|
||||
# scenario: nothing is cached, need to recalculate everything and cache it
|
||||
users, schedules = _test_setup()
|
||||
|
|
|
|||
41
engine/common/cache.py
Normal file
41
engine/common/cache.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import typing
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
_RT = typing.TypeVar("_RT", str, typing.List[str], typing.Dict[str, typing.Any])


def ensure_cache_key_allocates_to_the_same_hash_slot(cache_keys: _RT, pattern_to_wrap_in_brackets: str) -> _RT:
    """
    Ensure that when using Redis Cluster, multiple cache keys will be allocated to the same hash slot.

    This ensures that multi-key operations (ex `cache.get_many` and `cache.set_many`) will work without raising this
    exception:

    ```
    File "/usr/local/lib/python3.11/site-packages/redis/cluster.py", line 1006, in determine_slot
        raise RedisClusterException(
    redis.exceptions.RedisClusterException: MGET - all keys must map to the same key slot
    ```

    From the Redis Cluster [docs](https://redis.io/docs/reference/cluster-spec/#hash-tags):

        There is an exception for the computation of the hash slot that is used in order to implement hash tags.
        Hash tags are a way to ensure that multiple keys are allocated in the same hash slot.
        This is used in order to implement multi-key operations in Redis Cluster.

        To implement hash tags, the hash slot for a key is computed in a slightly different way in certain conditions.
        If the key contains a "{...}" pattern only the substring between { and } is hashed in order to obtain the
        hash slot. However since it is possible that there are multiple occurrences of { or } the algorithm is well
        specified by the following rules:

        IF the key contains a { character.
        AND IF there is a } character to the right of {.
        AND IF there are one or more characters between the first occurrence of { and the first occurrence of }.
        Then instead of hashing the key, only what is between the first occurrence of { and the following first
        occurrence of } is hashed.

    Args:
        cache_keys: a single cache key (str), a list of cache keys (useful for `cache.get_many`), or a dict mapping
            cache keys to values (useful for `cache.set_many`).
        pattern_to_wrap_in_brackets: the substring common to all of the keys; every occurrence of it within each key
            is wrapped in curly brackets so that only this substring determines the hash slot.

    Returns:
        The cache key(s) in the same shape they were passed in. When `settings.USE_REDIS_CLUSTER` is falsy the keys
        are returned untouched. Keys that do not contain the pattern, or that already contain the wrapped pattern,
        are left as-is.
    """
    if not settings.USE_REDIS_CLUSTER:
        return cache_keys

    wrapped_pattern = f"{{{pattern_to_wrap_in_brackets}}}"

    def _replace_key(key: str) -> str:
        # Idempotency guard: wrapping an already-wrapped key would produce "{{pattern}}", and Redis Cluster hashes
        # whatever sits between the FIRST "{" and the first "}" — so double-wrapping would map the key to a
        # different (wrong) hash slot.
        if wrapped_pattern in key:
            return key
        return key.replace(pattern_to_wrap_in_brackets, wrapped_pattern)

    if isinstance(cache_keys, str):
        return _replace_key(cache_keys)
    elif isinstance(cache_keys, dict):
        return {_replace_key(key): value for key, value in cache_keys.items()}
    return [_replace_key(key) for key in cache_keys]
|
||||
51
engine/common/tests/test_cache.py
Normal file
51
engine/common/tests/test_cache.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
from django.test import override_settings
|
||||
|
||||
from common.cache import ensure_cache_key_allocates_to_the_same_hash_slot
|
||||
|
||||
PATTERN = "schedule_oncall_users"
NON_EXISTENT_PATTERN = "nmzxcnvmzxcv"
NUM_CACHE_KEYS = 5
SINGLE_CACHE_KEY = f"{PATTERN}_0"
CACHE_KEYS = [f"{PATTERN}_{pk}" for pk in range(NUM_CACHE_KEYS)]
SET_MANY_CACHE_KEYS_DICT = {k: "foo" for k in CACHE_KEYS}


def test_ensure_cache_key_allocates_to_the_same_hash_slot() -> None:
    """Verify hash-slot key wrapping for str/list/dict inputs, with and without Redis Cluster enabled."""

    def _wrap_pattern_in_brackets(key: str) -> str:
        return key.replace(PATTERN, f"{{{PATTERN}}}")

    # Redis Cluster disabled: every input shape must pass through completely untouched
    with override_settings(USE_REDIS_CLUSTER=False):
        for cache_keys in (SINGLE_CACHE_KEY, CACHE_KEYS, SET_MANY_CACHE_KEYS_DICT):
            assert ensure_cache_key_allocates_to_the_same_hash_slot(cache_keys, PATTERN) == cache_keys

    # Redis Cluster enabled: the specified pattern within the cache keys is wrapped in curly brackets
    with override_settings(USE_REDIS_CLUSTER=True):
        # single str cache key
        expected_single = _wrap_pattern_in_brackets(SINGLE_CACHE_KEY)
        assert ensure_cache_key_allocates_to_the_same_hash_slot(SINGLE_CACHE_KEY, PATTERN) == expected_single

        # list of cache keys (useful for cache.get_many operations)
        expected_list = [_wrap_pattern_in_brackets(k) for k in CACHE_KEYS]
        assert ensure_cache_key_allocates_to_the_same_hash_slot(CACHE_KEYS, PATTERN) == expected_list

        # dict of cache keys -> values (useful for cache.set_many operations)
        expected_dict = {_wrap_pattern_in_brackets(k): v for k, v in SET_MANY_CACHE_KEYS_DICT.items()}
        assert ensure_cache_key_allocates_to_the_same_hash_slot(SET_MANY_CACHE_KEYS_DICT, PATTERN) == expected_dict

        # a pattern that never occurs in the keys means nothing gets wrapped
        for cache_keys in (SINGLE_CACHE_KEY, CACHE_KEYS, SET_MANY_CACHE_KEYS_DICT):
            assert ensure_cache_key_allocates_to_the_same_hash_slot(cache_keys, NON_EXISTENT_PATTERN) == cache_keys
|
||||
|
|
@ -194,6 +194,7 @@ REDIS_URI = os.getenv("REDIS_URI")
|
|||
if not REDIS_URI:
|
||||
REDIS_URI = f"{REDIS_PROTOCOL}://{REDIS_USERNAME}:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DATABASE}"
|
||||
|
||||
USE_REDIS_CLUSTER = getenv_boolean("USE_REDIS_CLUSTER", default=False)
|
||||
REDIS_USE_SSL = os.getenv("REDIS_USE_SSL")
|
||||
REDIS_SSL_CONFIG = {}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue