manually retry for requests.exceptions.Timeout exceptions when sending outgoing webhooks (#3632)
# Which issue(s) this PR fixes Fixes https://github.com/grafana/oncall-private/issues/2439 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required)
This commit is contained in:
parent
d57b41b758
commit
3bcf5efc24
3 changed files with 101 additions and 9 deletions
|
|
@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
### Fixed
|
||||
|
||||
- Fixed schedule timezone issues ([#3576](https://github.com/grafana/oncall/issues/3576))
|
||||
- Ignore `requests.exceptions.Timeout` exceptions when attempting to send outgoing webhook requests by @joeyorlando ([#3632](https://github.com/grafana/oncall/pull/3632))
|
||||
|
||||
## v1.3.82 (2024-01-04)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,10 @@
|
|||
import json
|
||||
import logging
|
||||
import typing
|
||||
from datetime import datetime
|
||||
from json import JSONDecodeError
|
||||
|
||||
import requests
|
||||
from celery.utils.log import get_task_logger
|
||||
from django.conf import settings
|
||||
from django.db.models import Prefetch
|
||||
|
|
@ -28,6 +31,10 @@ logger = get_task_logger(__name__)
|
|||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
EXECUTE_WEBHOOK_RETRIES = 3
|
||||
# these exceptions are fully out of our control (e.g. customer's network issues)
|
||||
# let's manually retry them without raising an exception
|
||||
EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY = (requests.exceptions.Timeout,)
|
||||
TRIGGER_TYPE_TO_LABEL = {
|
||||
Webhook.TRIGGER_ALERT_GROUP_CREATED: "alert group created",
|
||||
Webhook.TRIGGER_ACKNOWLEDGE: "acknowledge",
|
||||
|
|
@ -40,6 +47,17 @@ TRIGGER_TYPE_TO_LABEL = {
|
|||
}
|
||||
|
||||
|
||||
class WebhookRequestStatus(typing.TypedDict):
|
||||
url: typing.Optional[str]
|
||||
request_trigger: typing.Optional[str]
|
||||
request_headers: typing.Optional[str]
|
||||
request_data: typing.Optional[str]
|
||||
status_code: typing.Optional[int]
|
||||
content: typing.Optional[str]
|
||||
webhook: Webhook
|
||||
event_data: str
|
||||
|
||||
|
||||
@shared_dedicated_queue_retry_task(
|
||||
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
|
||||
)
|
||||
|
|
@ -52,15 +70,14 @@ def send_webhook_event(trigger_type, alert_group_id, organization_id=None, user_
|
|||
).exclude(is_webhook_enabled=False)
|
||||
|
||||
for webhook in webhooks_qs:
|
||||
print(webhook.name)
|
||||
execute_webhook.apply_async((webhook.pk, alert_group_id, user_id, None))
|
||||
|
||||
|
||||
def _isoformat_date(date_value):
|
||||
def _isoformat_date(date_value: datetime) -> typing.Optional[str]:
|
||||
return date_value.isoformat() if date_value else None
|
||||
|
||||
|
||||
def _build_payload(webhook, alert_group, user):
|
||||
def _build_payload(webhook: Webhook, alert_group: AlertGroup, user: User) -> typing.Dict[str, typing.Any]:
|
||||
trigger_type = webhook.trigger_type
|
||||
event = {
|
||||
"type": TRIGGER_TYPE_TO_LABEL[trigger_type],
|
||||
|
|
@ -96,7 +113,9 @@ def _build_payload(webhook, alert_group, user):
|
|||
return data
|
||||
|
||||
|
||||
def mask_authorization_header(headers, header_keys_to_mask):
|
||||
def mask_authorization_header(
|
||||
headers: typing.Dict[str, str], header_keys_to_mask: typing.List[str]
|
||||
) -> typing.Dict[str, str]:
|
||||
masked_headers = headers.copy()
|
||||
lower_keys = set(k.lower() for k in header_keys_to_mask)
|
||||
for k in headers.keys():
|
||||
|
|
@ -105,8 +124,10 @@ def mask_authorization_header(headers, header_keys_to_mask):
|
|||
return masked_headers
|
||||
|
||||
|
||||
def make_request(webhook, alert_group, data):
|
||||
status = {
|
||||
def make_request(
|
||||
webhook: Webhook, alert_group: AlertGroup, data: typing.Dict[str, typing.Any]
|
||||
) -> typing.Tuple[bool, WebhookRequestStatus, typing.Optional[str], typing.Optional[Exception]]:
|
||||
status: WebhookRequestStatus = {
|
||||
"url": None,
|
||||
"request_trigger": None,
|
||||
"request_headers": None,
|
||||
|
|
@ -172,9 +193,9 @@ def make_request(webhook, alert_group, data):
|
|||
|
||||
|
||||
@shared_dedicated_queue_retry_task(
|
||||
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else 3
|
||||
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else EXECUTE_WEBHOOK_RETRIES
|
||||
)
|
||||
def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id):
|
||||
def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id, manual_retry_num=0):
|
||||
from apps.webhooks.models import Webhook
|
||||
|
||||
try:
|
||||
|
|
@ -244,5 +265,21 @@ def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id):
|
|||
escalation_error_code=error_code,
|
||||
)
|
||||
|
||||
if exception:
|
||||
if isinstance(exception, EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY):
|
||||
msg_details = (
|
||||
f"webhook={webhook_pk} alert_group={alert_group_id} user={user_id} escalation_policy={escalation_policy_id}"
|
||||
)
|
||||
|
||||
if manual_retry_num < EXECUTE_WEBHOOK_RETRIES:
|
||||
retry_num = manual_retry_num + 1
|
||||
logger.warning(f"Manually retrying execute_webhook for {msg_details} manual_retry_num={retry_num}")
|
||||
execute_webhook.apply_async(
|
||||
(webhook_pk, alert_group_id, user_id, escalation_policy_id, retry_num),
|
||||
countdown=10,
|
||||
)
|
||||
else:
|
||||
# don't raise an exception if we've exhausted retries for
|
||||
# exceptions within EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY, simply give up trying
|
||||
logger.warning(f"Exhausted execute_webhook retries for {msg_details}")
|
||||
elif exception:
|
||||
raise exception
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import json
|
|||
from unittest.mock import call, patch
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from django.utils import timezone
|
||||
|
||||
from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy
|
||||
|
|
@ -555,3 +556,56 @@ def test_response_content_limit(
|
|||
assert log.status_code == 200
|
||||
assert log.content == f"Response content {content_length} exceeds {WEBHOOK_RESPONSE_LIMIT} character limit"
|
||||
assert log.url == "https://test/"
|
||||
|
||||
|
||||
@patch("apps.webhooks.tasks.trigger_webhook.execute_webhook", wraps=execute_webhook)
|
||||
@patch("apps.webhooks.models.webhook.requests")
|
||||
@patch("apps.webhooks.utils.socket.gethostbyname", return_value="8.8.8.8")
|
||||
@pytest.mark.django_db
|
||||
@pytest.mark.parametrize("exception", [requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout])
|
||||
def test_manually_retried_exceptions(
|
||||
_mock_gethostbyname,
|
||||
mock_requests,
|
||||
spy_execute_webhook,
|
||||
make_organization,
|
||||
make_user_for_organization,
|
||||
make_alert_receive_channel,
|
||||
make_alert_group,
|
||||
make_custom_webhook,
|
||||
exception,
|
||||
):
|
||||
mock_requests.post.side_effect = exception("foo bar")
|
||||
|
||||
organization = make_organization()
|
||||
user = make_user_for_organization(organization)
|
||||
alert_receive_channel = make_alert_receive_channel(organization)
|
||||
alert_group = make_alert_group(
|
||||
alert_receive_channel, acknowledged_at=timezone.now(), acknowledged=True, acknowledged_by=user.pk
|
||||
)
|
||||
webhook = make_custom_webhook(
|
||||
organization=organization,
|
||||
url="https://test/",
|
||||
http_method="POST",
|
||||
trigger_type=Webhook.TRIGGER_ACKNOWLEDGE,
|
||||
forward_all=False,
|
||||
)
|
||||
|
||||
execute_webhook_args = webhook.pk, alert_group.pk, user.pk, None
|
||||
|
||||
# should retry
|
||||
execute_webhook(*execute_webhook_args)
|
||||
|
||||
mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={})
|
||||
spy_execute_webhook.apply_async.assert_called_once_with((*execute_webhook_args, 1), countdown=10)
|
||||
|
||||
mock_requests.reset_mock()
|
||||
spy_execute_webhook.reset_mock()
|
||||
|
||||
# should stop retrying after 3 attempts without raising issue
|
||||
try:
|
||||
execute_webhook(*execute_webhook_args, manual_retry_num=3)
|
||||
except Exception:
|
||||
pytest.fail()
|
||||
|
||||
mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={})
|
||||
spy_execute_webhook.apply_async.assert_not_called()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue