From 3bcf5efc2444131112c2a2df6609ff205ea9ea3f Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Mon, 8 Jan 2024 19:13:15 -0500 Subject: [PATCH] manually retry for `requests.exceptions.Timeout` exceptions when sending outgoing webhooks (#3632) # Which issue(s) this PR fixes Fixes https://github.com/grafana/oncall-private/issues/2439 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required) --- CHANGELOG.md | 1 + engine/apps/webhooks/tasks/trigger_webhook.py | 55 ++++++++++++++++--- .../webhooks/tests/test_trigger_webhook.py | 54 ++++++++++++++++++ 3 files changed, 101 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ab47fc7..df63c0c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fixed schedule timezone issues ([#3576](https://github.com/grafana/oncall/issues/3576)) +- Ignore `requests.exceptions.Timeout` exceptions when attempting to send outgoing webhook requests by @joeyorlando ([#3632](https://github.com/grafana/oncall/pull/3632)) ## v1.3.82 (2024-01-04) diff --git a/engine/apps/webhooks/tasks/trigger_webhook.py b/engine/apps/webhooks/tasks/trigger_webhook.py index dcc0f960..8e2ffa83 100644 --- a/engine/apps/webhooks/tasks/trigger_webhook.py +++ b/engine/apps/webhooks/tasks/trigger_webhook.py @@ -1,7 +1,10 @@ import json import logging +import typing +from datetime import datetime from json import JSONDecodeError +import requests from celery.utils.log import get_task_logger from django.conf import settings from django.db.models import Prefetch @@ -28,6 +31,10 @@ logger = get_task_logger(__name__) logger.setLevel(logging.DEBUG) +EXECUTE_WEBHOOK_RETRIES = 3 +# these exceptions are fully out of our control (e.g. customer's network issues) +# let's manually retry them without raising an exception +EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY = (requests.exceptions.Timeout,) TRIGGER_TYPE_TO_LABEL = { Webhook.TRIGGER_ALERT_GROUP_CREATED: "alert group created", Webhook.TRIGGER_ACKNOWLEDGE: "acknowledge", @@ -40,6 +47,17 @@ TRIGGER_TYPE_TO_LABEL = { } +class WebhookRequestStatus(typing.TypedDict): + url: typing.Optional[str] + request_trigger: typing.Optional[str] + request_headers: typing.Optional[str] + request_data: typing.Optional[str] + status_code: typing.Optional[int] + content: typing.Optional[str] + webhook: Webhook + event_data: str + + @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None ) @@ -52,15 +70,14 @@ def send_webhook_event(trigger_type, alert_group_id, organization_id=None, user_ ).exclude(is_webhook_enabled=False) for webhook in webhooks_qs: - print(webhook.name) execute_webhook.apply_async((webhook.pk, alert_group_id, user_id, None)) -def _isoformat_date(date_value): +def _isoformat_date(date_value: datetime) -> typing.Optional[str]: return date_value.isoformat() if date_value else None -def _build_payload(webhook, alert_group, user): +def _build_payload(webhook: Webhook, alert_group: AlertGroup, user: User) -> typing.Dict[str, typing.Any]: trigger_type = webhook.trigger_type event = { "type": TRIGGER_TYPE_TO_LABEL[trigger_type], @@ -96,7 +113,9 @@ def _build_payload(webhook, alert_group, user): return data -def mask_authorization_header(headers, header_keys_to_mask): +def mask_authorization_header( + headers: typing.Dict[str, str], header_keys_to_mask: typing.List[str] +) -> typing.Dict[str, str]: masked_headers = headers.copy() lower_keys = set(k.lower() for k in header_keys_to_mask) for k in headers.keys(): @@ -105,8 +124,10 @@ def mask_authorization_header(headers, header_keys_to_mask): return masked_headers -def make_request(webhook, alert_group, data): - status = { +def make_request( + webhook: Webhook, alert_group: AlertGroup, data: typing.Dict[str, typing.Any] +) -> typing.Tuple[bool, WebhookRequestStatus, typing.Optional[str], typing.Optional[Exception]]: + status: WebhookRequestStatus = { "url": None, "request_trigger": None, "request_headers": None, @@ -172,9 +193,9 @@ def make_request(webhook, alert_group, data): @shared_dedicated_queue_retry_task( - autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else 3 + autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else EXECUTE_WEBHOOK_RETRIES ) -def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id): +def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id, manual_retry_num=0): from apps.webhooks.models import Webhook try: @@ -244,5 +265,21 @@ def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id): escalation_error_code=error_code, ) - if exception: + if isinstance(exception, EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY): + msg_details = ( + f"webhook={webhook_pk} alert_group={alert_group_id} user={user_id} escalation_policy={escalation_policy_id}" + ) + + if manual_retry_num < EXECUTE_WEBHOOK_RETRIES: + retry_num = manual_retry_num + 1 + logger.warning(f"Manually retrying execute_webhook for {msg_details} manual_retry_num={retry_num}") + execute_webhook.apply_async( + (webhook_pk, alert_group_id, user_id, escalation_policy_id, retry_num), + countdown=10, + ) + else: + # don't raise an exception if we've exhausted retries for + # exceptions within EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY, simply give up trying + logger.warning(f"Exhausted execute_webhook retries for {msg_details}") + elif exception: raise exception diff --git a/engine/apps/webhooks/tests/test_trigger_webhook.py b/engine/apps/webhooks/tests/test_trigger_webhook.py index 9930f7fb..9cb78577 100644 --- a/engine/apps/webhooks/tests/test_trigger_webhook.py +++ b/engine/apps/webhooks/tests/test_trigger_webhook.py @@ -2,6 +2,7 @@ import json from unittest.mock import call, patch import pytest +import requests from django.utils import timezone from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy @@ -555,3 +556,56 @@ def test_response_content_limit( assert log.status_code == 200 assert log.content == f"Response content {content_length} exceeds {WEBHOOK_RESPONSE_LIMIT} character limit" assert log.url == "https://test/" + + +@patch("apps.webhooks.tasks.trigger_webhook.execute_webhook", wraps=execute_webhook) +@patch("apps.webhooks.models.webhook.requests") +@patch("apps.webhooks.utils.socket.gethostbyname", return_value="8.8.8.8") +@pytest.mark.django_db +@pytest.mark.parametrize("exception", [requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout]) +def test_manually_retried_exceptions( + _mock_gethostbyname, + mock_requests, + spy_execute_webhook, + make_organization, + make_user_for_organization, + make_alert_receive_channel, + make_alert_group, + make_custom_webhook, + exception, +): + mock_requests.post.side_effect = exception("foo bar") + + organization = make_organization() + user = make_user_for_organization(organization) + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group( + alert_receive_channel, acknowledged_at=timezone.now(), acknowledged=True, acknowledged_by=user.pk + ) + webhook = make_custom_webhook( + organization=organization, + url="https://test/", + http_method="POST", + trigger_type=Webhook.TRIGGER_ACKNOWLEDGE, + forward_all=False, + ) + + execute_webhook_args = webhook.pk, alert_group.pk, user.pk, None + + # should retry + execute_webhook(*execute_webhook_args) + + mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={}) + spy_execute_webhook.apply_async.assert_called_once_with((*execute_webhook_args, 1), countdown=10) + + mock_requests.reset_mock() + spy_execute_webhook.reset_mock() + + # should stop retrying after 3 attempts without raising issue + try: + execute_webhook(*execute_webhook_args, manual_retry_num=3) + except Exception: + pytest.fail() + + mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={}) + spy_execute_webhook.apply_async.assert_not_called()