manually retry for requests.exceptions.Timeout exceptions when sending outgoing webhooks (#3632)

# Which issue(s) this PR fixes Fixes https://github.com/grafana/oncall-private/issues/2439 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required)
2024-01-08 19:13:15 -05:00 · 2024-01-08 19:13:15 -05:00 · 3bcf5efc24
commit 3bcf5efc24
parent d57b41b758
3 changed files with 101 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed

 - Fixed schedule timezone issues ([#3576](https://github.com/grafana/oncall/issues/3576))
+- Ignore `requests.exceptions.Timeout` exceptions when attempting to send outgoing webhook requests by @joeyorlando ([#3632](https://github.com/grafana/oncall/pull/3632))

 ## v1.3.82 (2024-01-04)

--- a/engine/apps/webhooks/tasks/trigger_webhook.py
+++ b/engine/apps/webhooks/tasks/trigger_webhook.py
@ -1,7 +1,10 @@
 import json
 import logging
+import typing
+from datetime import datetime
 from json import JSONDecodeError

+import requests
 from celery.utils.log import get_task_logger
 from django.conf import settings
 from django.db.models import Prefetch
@ -28,6 +31,10 @@ logger = get_task_logger(__name__)
 logger.setLevel(logging.DEBUG)


+EXECUTE_WEBHOOK_RETRIES = 3
+# these exceptions are fully out of our control (e.g. customer's network issues)
+# let's manually retry them without raising an exception
+EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY = (requests.exceptions.Timeout,)
 TRIGGER_TYPE_TO_LABEL = {
    Webhook.TRIGGER_ALERT_GROUP_CREATED: "alert group created",
    Webhook.TRIGGER_ACKNOWLEDGE: "acknowledge",
@ -40,6 +47,17 @@ TRIGGER_TYPE_TO_LABEL = {
 }


+class WebhookRequestStatus(typing.TypedDict):
+    url: typing.Optional[str]
+    request_trigger: typing.Optional[str]
+    request_headers: typing.Optional[str]
+    request_data: typing.Optional[str]
+    status_code: typing.Optional[int]
+    content: typing.Optional[str]
+    webhook: Webhook
+    event_data: str
+
+
@shared_dedicated_queue_retry_task(
    autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
 )
@ -52,15 +70,14 @@ def send_webhook_event(trigger_type, alert_group_id, organization_id=None, user_
    ).exclude(is_webhook_enabled=False)

    for webhook in webhooks_qs:
-        print(webhook.name)
        execute_webhook.apply_async((webhook.pk, alert_group_id, user_id, None))


-def _isoformat_date(date_value):
+def _isoformat_date(date_value: datetime) -> typing.Optional[str]:
    return date_value.isoformat() if date_value else None


-def _build_payload(webhook, alert_group, user):
+def _build_payload(webhook: Webhook, alert_group: AlertGroup, user: User) -> typing.Dict[str, typing.Any]:
    trigger_type = webhook.trigger_type
    event = {
        "type": TRIGGER_TYPE_TO_LABEL[trigger_type],
@ -96,7 +113,9 @@ def _build_payload(webhook, alert_group, user):
    return data


-def mask_authorization_header(headers, header_keys_to_mask):
+def mask_authorization_header(
+    headers: typing.Dict[str, str], header_keys_to_mask: typing.List[str]
+) -> typing.Dict[str, str]:
    masked_headers = headers.copy()
    lower_keys = set(k.lower() for k in header_keys_to_mask)
    for k in headers.keys():
@ -105,8 +124,10 @@ def mask_authorization_header(headers, header_keys_to_mask):
    return masked_headers


-def make_request(webhook, alert_group, data):
-    status = {
+def make_request(
+    webhook: Webhook, alert_group: AlertGroup, data: typing.Dict[str, typing.Any]
+) -> typing.Tuple[bool, WebhookRequestStatus, typing.Optional[str], typing.Optional[Exception]]:
+    status: WebhookRequestStatus = {
        "url": None,
        "request_trigger": None,
        "request_headers": None,
@ -172,9 +193,9 @@ def make_request(webhook, alert_group, data):


@shared_dedicated_queue_retry_task(
-    autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else 3
+    autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else EXECUTE_WEBHOOK_RETRIES
 )
-def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id):
+def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id, manual_retry_num=0):
    from apps.webhooks.models import Webhook

    try:
@ -244,5 +265,21 @@ def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id):
            escalation_error_code=error_code,
        )

-    if exception:
+    if isinstance(exception, EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY):
+        msg_details = (
+            f"webhook={webhook_pk} alert_group={alert_group_id} user={user_id} escalation_policy={escalation_policy_id}"
+        )
+
+        if manual_retry_num < EXECUTE_WEBHOOK_RETRIES:
+            retry_num = manual_retry_num + 1
+            logger.warning(f"Manually retrying execute_webhook for {msg_details} manual_retry_num={retry_num}")
+            execute_webhook.apply_async(
+                (webhook_pk, alert_group_id, user_id, escalation_policy_id, retry_num),
+                countdown=10,
+            )
+        else:
+            # don't raise an exception if we've exhausted retries for
+            # exceptions within EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY, simply give up trying
+            logger.warning(f"Exhausted execute_webhook retries for {msg_details}")
+    elif exception:
        raise exception
--- a/engine/apps/webhooks/tests/test_trigger_webhook.py
+++ b/engine/apps/webhooks/tests/test_trigger_webhook.py
@ -2,6 +2,7 @@ import json
 from unittest.mock import call, patch

 import pytest
+import requests
 from django.utils import timezone

 from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy
@ -555,3 +556,56 @@ def test_response_content_limit(
    assert log.status_code == 200
    assert log.content == f"Response content {content_length} exceeds {WEBHOOK_RESPONSE_LIMIT} character limit"
    assert log.url == "https://test/"
+
+
+@patch("apps.webhooks.tasks.trigger_webhook.execute_webhook", wraps=execute_webhook)
+@patch("apps.webhooks.models.webhook.requests")
+@patch("apps.webhooks.utils.socket.gethostbyname", return_value="8.8.8.8")
+@pytest.mark.django_db
+@pytest.mark.parametrize("exception", [requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout])
+def test_manually_retried_exceptions(
+    _mock_gethostbyname,
+    mock_requests,
+    spy_execute_webhook,
+    make_organization,
+    make_user_for_organization,
+    make_alert_receive_channel,
+    make_alert_group,
+    make_custom_webhook,
+    exception,
+):
+    mock_requests.post.side_effect = exception("foo bar")
+
+    organization = make_organization()
+    user = make_user_for_organization(organization)
+    alert_receive_channel = make_alert_receive_channel(organization)
+    alert_group = make_alert_group(
+        alert_receive_channel, acknowledged_at=timezone.now(), acknowledged=True, acknowledged_by=user.pk
+    )
+    webhook = make_custom_webhook(
+        organization=organization,
+        url="https://test/",
+        http_method="POST",
+        trigger_type=Webhook.TRIGGER_ACKNOWLEDGE,
+        forward_all=False,
+    )
+
+    execute_webhook_args = webhook.pk, alert_group.pk, user.pk, None
+
+    # should retry
+    execute_webhook(*execute_webhook_args)
+
+    mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={})
+    spy_execute_webhook.apply_async.assert_called_once_with((*execute_webhook_args, 1), countdown=10)
+
+    mock_requests.reset_mock()
+    spy_execute_webhook.reset_mock()
+
+    # should stop retrying after 3 attempts without raising issue
+    try:
+        execute_webhook(*execute_webhook_args, manual_retry_num=3)
+    except Exception:
+        pytest.fail()
+
+    mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={})
+    spy_execute_webhook.apply_async.assert_not_called()