2022-06-03 08:09:47 -06:00
|
|
|
import logging
|
|
|
|
|
import random
|
2024-01-30 13:07:19 -05:00
|
|
|
import typing
|
2022-06-03 08:09:47 -06:00
|
|
|
|
|
|
|
|
from celery import shared_task
|
|
|
|
|
from celery.utils.log import get_task_logger
|
|
|
|
|
from django.conf import settings
|
|
|
|
|
from django.core.cache import cache
|
|
|
|
|
|
|
|
|
|
from apps.alerts.models.alert_group_counter import ConcurrentUpdateError
|
|
|
|
|
from apps.alerts.tasks import resolve_alert_group_by_source_if_needed
|
2023-09-12 10:49:16 +01:00
|
|
|
from apps.slack.client import SlackClient
|
|
|
|
|
from apps.slack.errors import SlackAPIError
|
2022-06-03 08:09:47 -06:00
|
|
|
from common.custom_celery_tasks import shared_dedicated_queue_retry_task
|
|
|
|
|
from common.custom_celery_tasks.create_alert_base_task import CreateAlertBaseTask
|
|
|
|
|
|
2024-01-30 13:07:19 -05:00
|
|
|
if typing.TYPE_CHECKING:
|
|
|
|
|
from apps.alerts.models import Alert
|
|
|
|
|
|
2022-06-03 08:09:47 -06:00
|
|
|
logger = get_task_logger(__name__)
|
|
|
|
|
logger.setLevel(logging.DEBUG)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@shared_task(
    base=CreateAlertBaseTask,
    autoretry_for=(Exception,),
    retry_backoff=True,
    max_retries=1 if settings.DEBUG else None,
)
def create_alertmanager_alerts(alert_receive_channel_pk, alert, is_demo=False, received_at=None):
    """Create an Alert (and, implicitly, its alert group) from a raw Alertmanager-style payload.

    Args:
        alert_receive_channel_pk: PK of the AlertReceiveChannel the payload was posted to.
        alert: the raw request payload dict; stored verbatim as ``raw_request_data``.
        is_demo: whether this is a demo alert (forwarded to ``Alert.create``).
        received_at: timestamp string of when the payload was received, if known.
    """
    from apps.alerts.models import Alert, AlertReceiveChannel

    # Deleted channels are still fetched (objects_with_deleted) so we can distinguish
    # "deleted" from "does not exist" and log accordingly.
    alert_receive_channel = AlertReceiveChannel.objects_with_deleted.get(pk=alert_receive_channel_pk)
    # NOTE: "is_maintenace_integration" (sic) is the attribute name as defined on the model.
    if alert_receive_channel.deleted_at is not None or alert_receive_channel.is_maintenace_integration:
        logger.info("AlertReceiveChannel alert ignored if deleted/maintenance")
        return

    try:
        alert = Alert.create(
            title=None,
            message=None,
            image_url=None,
            link_to_upstream_details=None,
            alert_receive_channel=alert_receive_channel,
            integration_unique_data=None,
            raw_request_data=alert,
            enable_autoresolve=False,
            is_demo=is_demo,
            received_at=received_at,
        )
    except ConcurrentUpdateError:
        # This error is raised when there are concurrent updates on AlertGroupCounter due to optimistic lock on it.
        # The idea is to not block the worker with a database lock and retry the task in case of concurrent updates.
        countdown = random.randint(1, 10)
        # Bug fix: previously the retry dropped is_demo/received_at, so a retried demo alert
        # became a real one and lost its original received timestamp. Forward both, matching
        # the retry behavior of the sibling create_alert task.
        create_alertmanager_alerts.apply_async(
            (alert_receive_channel_pk, alert),
            kwargs={"is_demo": is_demo, "received_at": received_at},
            countdown=countdown,
        )
        logger.warning(f"Retrying the task gracefully in {countdown} seconds due to ConcurrentUpdateError")
        return

    if alert_receive_channel.allow_source_based_resolving:
        alert_group = alert.group
        # Only schedule source-based resolution while the group is still eligible for autoresolve.
        if alert_group.resolved_by != alert_group.NOT_YET_STOP_AUTORESOLVE:
            task = resolve_alert_group_by_source_if_needed.apply_async((alert.group.pk,), countdown=5)
            alert.group.active_resolve_calculation_id = task.id
            alert.group.save(update_fields=["active_resolve_calculation_id"])

    logger.debug(
        f"Created alertmanager alert alert_id={alert.pk} alert_group_id={alert.group.pk} channel_id={alert_receive_channel.pk}"
    )
|
2022-06-03 08:09:47 -06:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@shared_task(
    base=CreateAlertBaseTask,
    autoretry_for=(Exception,),
    retry_backoff=True,
    max_retries=1 if settings.DEBUG else None,
)
def create_alert(
    title: typing.Optional[str],
    message: typing.Optional[str],
    image_url: typing.Optional[str],
    link_to_upstream_details: typing.Optional[str],
    alert_receive_channel_pk: int,
    integration_unique_data: typing.Optional[typing.Dict],
    raw_request_data: "Alert.RawRequestData",
    is_demo: bool = False,
    received_at: typing.Optional[str] = None,
) -> None:
    """Create an Alert with explicit, pre-parsed fields.

    Silently drops the payload when the channel no longer exists. On a
    ConcurrentUpdateError (optimistic lock on AlertGroupCounter) the task
    re-enqueues itself with a small random countdown instead of holding a
    database lock.
    """
    from apps.alerts.models import Alert, AlertReceiveChannel

    try:
        alert_receive_channel = AlertReceiveChannel.objects.get(pk=alert_receive_channel_pk)
    except AlertReceiveChannel.DoesNotExist:
        # Channel was deleted between enqueue and execution; nothing to do.
        return

    if image_url is not None:
        # Truncate to fit the model field — presumably a 300-char column; TODO confirm limit.
        image_url = str(image_url)[:299]

    try:
        alert = Alert.create(
            title=title,
            message=message,
            image_url=image_url,
            link_to_upstream_details=link_to_upstream_details,
            alert_receive_channel=alert_receive_channel,
            integration_unique_data=integration_unique_data,
            raw_request_data=raw_request_data,
            is_demo=is_demo,
            received_at=received_at,
        )
        logger.debug(
            f"Created alert alert_id={alert.pk} alert_group_id={alert.group.pk} channel_id={alert_receive_channel.pk}"
        )
    except ConcurrentUpdateError:
        # This error is raised when there are concurrent updates on AlertGroupCounter due to optimistic lock on it.
        # The idea is to not block the worker with a database lock and retry the task in case of concurrent updates.
        countdown = random.randint(1, 10)
        create_alert.apply_async(
            (
                title,
                message,
                image_url,
                link_to_upstream_details,
                alert_receive_channel_pk,
                integration_unique_data,
                raw_request_data,
            ),
            kwargs={
                # Bug fix: is_demo was previously dropped on retry, so a retried demo
                # alert would be re-created as a real alert.
                "is_demo": is_demo,
                "received_at": received_at,
            },
            countdown=countdown,
        )
        logger.warning(
            f"Retrying the task gracefully in {countdown} seconds due to ConcurrentUpdateError for alert_receive_channel={alert_receive_channel_pk}"
        )
|
2022-06-03 08:09:47 -06:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@shared_dedicated_queue_retry_task()
def start_notify_about_integration_ratelimit(team_id, text, **kwargs):
    """Fan out a Slack notification about an integration ratelimit.

    Thin dispatcher: forwards its arguments to
    notify_about_integration_ratelimit_in_slack, letting the enqueued task
    expire if it has not started within five minutes.
    """
    notify_about_integration_ratelimit_in_slack.apply_async(
        args=(team_id, text),
        kwargs=kwargs,
        expires=60 * 5,  # stale ratelimit notices are not worth delivering
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@shared_dedicated_queue_retry_task(
    autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else 5
)
def notify_about_integration_ratelimit_in_slack(organization_id, text, **kwargs):
    """Post a ratelimit notice to the organization's default Slack channel.

    Debounced via the cache (at most one message per organization per 15
    minutes). Skips silently when the organization is gone, a message was
    sent recently, or Slack is not fully configured.
    """
    # TODO: Review ratelimits
    from apps.user_management.models import Organization

    try:
        organization = Organization.objects.get(pk=organization_id)
    except Organization.DoesNotExist:
        logger.warning(f"Organization {organization_id} does not exist")
        return

    cache_key = f"notify_about_integration_ratelimit_in_slack_{organization.pk}"
    if cache.get(cache_key):
        logger.debug(f"Message was sent recently for organization {organization_id}")
        return
    cache.set(cache_key, True, 60 * 15)  # Set cache before sending message to make sure we don't ratelimit slack

    slack_team_identity = organization.slack_team_identity
    org_default_slack_channel_id = organization.default_slack_channel_slack_id

    # Both a workspace identity and a default channel are required to post.
    if slack_team_identity is None or org_default_slack_channel_id is None:
        logger.info(
            f"Slack team identity or general log channel is not set for organization {organization_id} "
            f"skipping rest of notify_about_integration_ratelimit_in_slack "
            f"slack_team_identity={slack_team_identity} org_default_slack_channel_id={org_default_slack_channel_id}"
        )
        return

    try:
        client = SlackClient(slack_team_identity, enable_ratelimit_retry=True)
        client.chat_postMessage(channel=org_default_slack_channel_id, text=text)
    except SlackAPIError as e:
        # Best-effort notification: log and move on rather than retrying the whole task.
        logger.warning(f"Slack exception {e} while sending message for organization {organization_id}")
|