2022-06-03 08:09:47 -06:00
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
import time
|
2023-06-27 12:23:08 +02:00
|
|
|
import typing
|
2022-06-03 08:09:47 -06:00
|
|
|
from urllib.parse import urljoin
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
from django.conf import settings
|
|
|
|
|
from rest_framework import status
|
|
|
|
|
|
2024-10-10 15:02:21 -04:00
|
|
|
from apps.api.permissions import GrafanaAPIPermission, GrafanaAPIPermissions
|
2024-10-02 13:39:49 -04:00
|
|
|
from common.constants.plugin_ids import PluginID
|
2022-11-29 09:41:56 +01:00
|
|
|
|
2024-10-11 14:57:59 -04:00
|
|
|
if typing.TYPE_CHECKING:
|
|
|
|
|
from apps.user_management.models import Organization
|
|
|
|
|
|
2022-06-03 08:09:47 -06:00
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
2023-06-27 12:23:08 +02:00
|
|
|
class GrafanaUser(typing.TypedDict):
    """
    Typed shape of a single Grafana user record.

    NOTE(review): presumably mirrors the entries returned by Grafana's
    `api/org/users` endpoint (see GrafanaAPIClient.get_users) - confirm against the Grafana API docs.
    """

    orgId: int
    userId: int
    email: str
    name: str
    avatarUrl: str
    login: str
    role: str
    lastSeenAt: str
    lastSeenAtAge: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GrafanaUserWithPermissions(GrafanaUser):
    """A GrafanaUser record extended with the list of permissions granted to that user."""

    permissions: typing.List[GrafanaAPIPermission]
|
2022-11-29 09:41:56 +01:00
|
|
|
|
|
|
|
|
|
2023-06-27 12:23:08 +02:00
|
|
|
# Alias for a list of GrafanaUserWithPermissions records.
GrafanaUsersWithPermissions = typing.List[GrafanaUserWithPermissions]
|
|
|
|
|
# Alias for a mapping of string key -> list of GrafanaAPIPermission.
# GrafanaAPIClient.get_users_permissions fills it keyed by the user id strings of the Grafana response.
UserPermissionsDict = typing.Dict[str, typing.List[GrafanaAPIPermission]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GCOMInstanceInfoConfigFeatureToggles(typing.TypedDict):
|
Refactor how RBAC enabled/disabled status is determined for Grafana Cloud stacks (#4279)
# What this PR does
In cloud we are currently (somewhat) improperly determining whether or
not a Grafana stack had the `accessControlOnCall` feature flag enabled.
At first things worked fine. We would enable this feature toggle via the
Grafana Admin UI, and then the OnCall backend would read this value from
GCOM's `GET /instance/<stack_id>` endpoint (via
`config.feature_toggles`), and everything worked as expected.
There was a recent change made in `grafana/deployment_tools` to set this
feature flag to True for all stacks. However, for some reason, the GCOM
endpoint above doesn't return the `accessControlOnCall` feature toggle
value in `config.feature_toggles` if it is set in this manner (it only
returns the value if it is set via the Grafana Admin UI).
So what we should instead be doing is such instead of asking GCOM for
this feature toggle, infer whether RBAC is enabled on the stack by doing
a `HEAD /api/access-control/users/permissions/search` (this endpoint _is
only_ available on a Grafana stack if `accessControlOnCall` is enabled).
**Few caveats to this ☝️**
1. we first have to make sure that the cloud stack is in an `active`
state (ie. not paused). This is because, no matter if the
`accessControlOnCall` is enabled or not, if the stack is in a `paused`
state it will ALWAYS return `HTTP 200` which can be misleading and lead
to bugs (this feels like a bug on the Grafana API, will follow up with
core grafana team)
2. Once we roll out this change we will effectively **actually** be
enabling RBAC for OnCall for all orgs. The Identity Access team would
prefer a progressive rollout, which is why I decided to introduce the
concept of
[`settings.CLOUD_RBAC_ROLLOUT_PERCENTAGE`](https://github.com/grafana/oncall/pull/4279/files#diff-3383aef931e41e44d95829ad971641eeb98fe001be2f5da92217446d300ea1b3R918)
(see also [`Organization.
should_be_considered_for_rbac_permissioning`](https://github.com/grafana/oncall/pull/4279/files#diff-2ca9917f4f56349be39545ee8abd459be5076295d02ca3a7ec545152fcddccdfR348-R362))
## Which issue(s) this PR closes
Related to https://github.com/grafana/identity-access-team/issues/667
## Checklist
- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
show up in the autogenerated release notes.
2024-05-14 12:30:16 -04:00
|
|
|
accessControlOnCall: typing.NotRequired[str]
|
2023-01-11 12:48:30 +01:00
|
|
|
|
|
|
|
|
|
2023-06-27 12:23:08 +02:00
|
|
|
class GCOMInstanceInfoConfig(typing.TypedDict):
    """Typed shape of the `config` object of a GCOM instance; carries the instance's feature toggles."""

    feature_toggles: GCOMInstanceInfoConfigFeatureToggles
|
|
|
|
|
|
|
|
|
|
|
2023-06-27 12:23:08 +02:00
|
|
|
class GCOMInstanceInfo(typing.TypedDict):
    """
    Typed shape of the instance (stack) record returned by GcomAPIClient.get_instance_info.

    `config` is optional and may be absent from the payload.
    """

    id: int
    orgId: int
    slug: str
    orgSlug: str
    orgName: str
    url: str
    status: str
    clusterSlug: str
    config: typing.NotRequired[GCOMInstanceInfoConfig]
|
2023-06-27 12:23:08 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class ApiClientResponseCallStatus(typing.TypedDict):
    """
    Outcome metadata that APIClient.call_api returns alongside the decoded response body.
    """

    # full URL that was requested (api_url joined with the endpoint)
    url: str
    # True only once the response passed raise_for_status()
    connected: bool
    # HTTP status code of the response; stays 503 when no response was received
    status_code: int
    # response reason on success, stringified exception on failure
    message: str
|
|
|
|
|
|
|
|
|
|
|
2023-08-03 11:43:03 +02:00
|
|
|
# Type variable for the decoded response payload type carried by APIClientResponse.
_RT = typing.TypeVar("_RT")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class APIClientResponse(typing.Generic[_RT], typing.Tuple[typing.Optional[_RT], ApiClientResponseCallStatus]):
    """
    Typing helper for the `(payload, call_status)` pair returned by the API client methods.

    The payload slot is `Optional[_RT]`; the second slot is the ApiClientResponseCallStatus.
    """

    pass
|
2023-06-27 12:23:08 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# Can't define this using class syntax because one of the keys ("User-Agent") contains a dash.
# The functional TypedDict syntax must be used when a key is not a valid identifier, see
# https://docs.python.org/3/library/typing.html#typing.TypedDict
APIRequestHeaders = typing.TypedDict(
    "APIRequestHeaders",
    {
        "User-Agent": str,
        "Authorization": str,
    },
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class HttpMethod(typing.Protocol):
    """
    Structural type for the HTTP callables handed to APIClient.call_api
    (e.g. requests.get / requests.post): callable, returns a requests.Response, and exposes `__name__`
    (call_api uses it to log the HTTP method).

    TODO: can probably replace this with something from the requests library?
    https://github.com/psf/requests/blob/main/requests/api.py#L14
    """

    @property
    def __name__(self) -> str:
        ...

    def __call__(self, *args, **kwargs) -> requests.Response:
        ...
|
2022-11-29 09:41:56 +01:00
|
|
|
|
|
|
|
|
|
2022-06-03 08:09:47 -06:00
|
|
|
class APIClient:
    """
    Minimal bearer-token HTTP JSON client.

    Every `api_*` helper funnels into call_api, which returns a `(payload, call_status)` pair and
    never raises for the request errors it explicitly catches.
    """

    def __init__(self, api_url: str, api_token: str) -> None:
        """Store the base URL that endpoints are joined onto and the token sent as a Bearer credential."""
        self.api_url = api_url
        self.api_token = api_token

    def api_head(self, endpoint: str, body: typing.Optional[typing.Dict] = None, **kwargs) -> APIClientResponse[_RT]:
        """Issue a HEAD request to `endpoint`; extra kwargs are forwarded to `requests.head`."""
        return self.call_api(endpoint, requests.head, body, **kwargs)

    def api_get(self, endpoint: str, **kwargs) -> APIClientResponse[_RT]:
        """Issue a GET request to `endpoint`; extra kwargs are forwarded to `requests.get`."""
        return self.call_api(endpoint, requests.get, **kwargs)

    def api_post(self, endpoint: str, body: typing.Optional[typing.Dict] = None, **kwargs) -> APIClientResponse[_RT]:
        """Issue a POST request to `endpoint` with `body` sent as JSON."""
        return self.call_api(endpoint, requests.post, body, **kwargs)

    def api_put(self, endpoint: str, body: typing.Optional[typing.Dict] = None, **kwargs) -> APIClientResponse[_RT]:
        """Issue a PUT request to `endpoint` with `body` sent as JSON."""
        return self.call_api(endpoint, requests.put, body, **kwargs)

    def call_api(
        self, endpoint: str, http_method: HttpMethod, body: typing.Optional[typing.Dict] = None, **kwargs
    ) -> APIClientResponse[_RT]:
        """
        Perform one HTTP call and report its outcome.

        Args:
            endpoint: path joined onto `self.api_url` with `urljoin`.
            http_method: the callable performing the request (e.g. `requests.get`).
            body: optional payload, sent via the `json=` argument.
            **kwargs: forwarded verbatim to `http_method`.

        Returns:
            `(payload, call_status)` where payload is:
            - `{}` for an HTTP 204 response,
            - the decoded JSON body when the response has content,
            - `None` when the response body is empty or when one of the caught errors occurred.

        Connection errors, HTTP error statuses, too many redirects, timeouts and JSON decoding errors
        are caught, logged as warnings and reported through `call_status["message"]`.
        """
        request_start = time.perf_counter()
        # pessimistic defaults: they are what gets returned if the request never produces a response
        call_status: ApiClientResponseCallStatus = {
            "url": urljoin(self.api_url, endpoint),
            "connected": False,
            "status_code": status.HTTP_503_SERVICE_UNAVAILABLE,
            "message": "",
        }
        try:
            response = http_method(call_status["url"], json=body, headers=self.request_headers, **kwargs)
            # record the real status code before raise_for_status() so failures still report it
            call_status["status_code"] = response.status_code
            response.raise_for_status()

            call_status["connected"] = True
            call_status["message"] = response.reason

            if response.status_code == status.HTTP_204_NO_CONTENT:
                return {}, call_status

            # ex. a HEAD call (self.api_head) would have a response.content of b''
            # and hence calling response.json() throws a json.JSONDecodeError
            return response.json() if response.content else None, call_status
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.HTTPError,
            requests.exceptions.TooManyRedirects,
            requests.exceptions.Timeout,
            json.JSONDecodeError,
        ) as e:
            logger.warning("Error connecting to api instance " + str(e))
            call_status["message"] = "{0}".format(e)
        finally:
            # runs on success, handled failure and unhandled failure alike, so every call gets a latency line
            request_end = time.perf_counter()
            status_code = call_status["status_code"]
            url = call_status["url"]
            seconds = request_end - request_start
            # NOTE(review): this goes through the root logger (logging.info) rather than the module-level
            # `logger` used above - confirm whether that is intentional.
            logging.info(
                f"outbound latency={str(seconds)} status={status_code} "
                f"method={http_method.__name__.upper()} url={url} "
                f"slow={int(seconds > settings.SLOW_THRESHOLD_SECONDS)} "
            )
        # only reached when one of the exceptions above was caught
        return None, call_status

    @property
    def request_headers(self) -> APIRequestHeaders:
        """Headers sent with every request: the configured User-Agent and the Bearer token."""
        return {"User-Agent": settings.GRAFANA_COM_USER_AGENT, "Authorization": f"Bearer {self.api_token}"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GrafanaAPIClient(APIClient):
    """APIClient specialised for the HTTP API of a single Grafana instance."""

    GRAFANA_INCIDENT_PLUGIN_BACKEND_URL_KEY = "backendUrl"

    USER_PERMISSION_ENDPOINT = f"api/access-control/users/permissions/search?actionPrefix={PluginID.ONCALL}"

    MIN_GRAFANA_TOKEN_LENGTH = 16

    class Types:
        """Namespace for the typed shapes of the Grafana API responses used by this client."""

        class _BaseGrafanaAPIResponse(typing.TypedDict):
            totalCount: int
            page: int
            perPage: int

        class GrafanaTeam(typing.TypedDict):
            id: int
            orgId: int
            name: str
            email: str
            avatarUrl: str
            memberCount: int

        class GrafanaServiceAccount(typing.TypedDict):
            id: int
            name: str
            login: str
            orgId: int
            isDisabled: bool
            role: str
            tokens: int
            avatarUrl: str

        class GrafanaServiceAccountToken(typing.TypedDict):
            id: int
            name: str
            key: str

        class PluginSettings(typing.TypedDict):
            enabled: bool
            jsonData: typing.NotRequired[typing.Dict[str, str]]

        class TeamsResponse(_BaseGrafanaAPIResponse):
            teams: typing.List["GrafanaAPIClient.Types.GrafanaTeam"]

        class ServiceAccountResponse(_BaseGrafanaAPIResponse):
            serviceAccounts: typing.List["GrafanaAPIClient.Types.GrafanaServiceAccount"]

    def __init__(self, api_url: str, api_token: str) -> None:
        super().__init__(api_url, api_token)

    def check_token(self) -> APIClientResponse:
        """Probe `api/org` with a HEAD request; the returned call status tells whether the token was accepted."""
        return self.api_head("api/org")

    def get_users_permissions(self) -> typing.Optional[UserPermissionsDict]:
        """
        Fetch the permissions of every user, keyed by the user id string used in the Grafana response.

        Returns `None` when the request failed or the response was a list instead of an object.

        It is possible that this endpoint may not be available for certain Grafana orgs.
        Ex: for Grafana Cloud orgs that have pinned their Grafana version to an earlier version
        where this endpoint is not available

        The response from the Grafana endpoint will look something like this:
        {
            "1": {
                "grafana-oncall-app.alert-groups:read": [
                    ""
                ],
                "grafana-oncall-app.alert-groups:write": [
                    ""
                ]
            }
        }
        """
        response, _ = self.api_get(self.USER_PERMISSION_ENDPOINT)
        if response is None or isinstance(response, list):
            return None

        data: typing.Dict[str, typing.Dict[str, typing.List[str]]] = response

        all_users_permissions: UserPermissionsDict = {}
        for user_id, user_permissions in data.items():
            # only the action names (the keys) are used; the scope lists are ignored
            all_users_permissions[user_id] = GrafanaAPIPermissions.construct_permissions(user_permissions.keys())

        return all_users_permissions

    def is_rbac_enabled_for_organization(self) -> tuple[bool, bool]:
        """
        Probe the user-permissions endpoint with a HEAD request.

        Returns a pair:
        - whether the request succeeded (`connected`), which is taken as "RBAC is enabled";
        - whether the recorded status code is >= 500. The status code defaults to 503 when no
          response was received, so connection failures are reported as server errors too.
        """
        _, resp_status = self.api_head(self.USER_PERMISSION_ENDPOINT)
        return resp_status["connected"], resp_status["status_code"] >= status.HTTP_500_INTERNAL_SERVER_ERROR

    def get_users(self, rbac_is_enabled_for_org: bool, **kwargs) -> GrafanaUsersWithPermissions:
        """
        Fetch the org's users (`api/org/users`) and attach a `permissions` list to each one.

        Returns `[]` when the users request fails, returns nothing, or returns an object instead of a
        list, and also when RBAC is enabled but the permissions could not be fetched.
        With RBAC disabled every user gets an empty permissions list.
        """
        users_response, _ = self.api_get("api/org/users", **kwargs)

        if not users_response:
            return []
        elif isinstance(users_response, dict):
            return []

        users: GrafanaUsersWithPermissions = users_response

        user_permissions = {}
        if rbac_is_enabled_for_org:
            user_permissions = self.get_users_permissions()
            if user_permissions is None:
                # If we cannot fetch permissions when RBAC is enabled (ex. HTTP 500), we should not return any users
                # to avoid potentially wiping-out OnCall's copy of permissions for all users
                return []

        # merge the users permissions response into the org users response
        for user in users:
            user["permissions"] = user_permissions.get(str(user["userId"]), [])
        return users

    def get_teams(self, **kwargs) -> APIClientResponse["GrafanaAPIClient.Types.TeamsResponse"]:
        """
        [Grafana API Docs](https://grafana.com/docs/grafana/latest/developers/http_api/team/#team-search-with-paging)
        """
        return self.api_get("api/teams/search?perpage=1000000", **kwargs)

    def get_team_members(self, team_id: int) -> APIClientResponse:
        return self.api_get(f"api/teams/{team_id}/members")

    def get_datasources(self) -> APIClientResponse:
        return self.api_get("api/datasources")

    def get_datasource_by_id(self, datasource_id) -> APIClientResponse:
        # This endpoint is deprecated for Grafana version >= 9. Use get_datasource instead
        return self.api_get(f"api/datasources/{datasource_id}")

    def get_datasource(self, datasource_uid) -> APIClientResponse:
        return self.api_get(f"api/datasources/uid/{datasource_uid}")

    def get_alertmanager_status_with_config(self, recipient) -> APIClientResponse:
        return self.api_get(f"api/alertmanager/{recipient}/api/v2/status")

    def get_alerting_config(self, recipient: str) -> APIClientResponse:
        return self.api_get(f"api/alertmanager/{recipient}/config/api/v1/alerts")

    def update_alerting_config(self, recipient, config) -> APIClientResponse:
        return self.api_post(f"api/alertmanager/{recipient}/config/api/v1/alerts", config)

    def get_alerting_notifiers(self):
        return self.api_get("api/alert-notifiers")

    def get_grafana_plugin_settings(self, recipient: str) -> APIClientResponse["GrafanaAPIClient.Types.PluginSettings"]:
        """Fetch the settings of the plugin whose id is `recipient`."""
        return self.api_get(f"api/plugins/{recipient}/settings")

    def get_grafana_incident_plugin_settings(self) -> APIClientResponse["GrafanaAPIClient.Types.PluginSettings"]:
        return self.get_grafana_plugin_settings(PluginID.INCIDENT)

    def get_grafana_labels_plugin_settings(self) -> APIClientResponse["GrafanaAPIClient.Types.PluginSettings"]:
        return self.get_grafana_plugin_settings(PluginID.LABELS)

    def get_grafana_irm_plugin_settings(self) -> APIClientResponse["GrafanaAPIClient.Types.PluginSettings"]:
        return self.get_grafana_plugin_settings(PluginID.IRM)

    def get_current_user(self) -> APIClientResponse[typing.Dict[str, typing.List[str]]]:
        return self.api_get("api/user")

    def get_service_account(self, login: str) -> APIClientResponse["GrafanaAPIClient.Types.ServiceAccountResponse"]:
        # NOTE(review): `login` is interpolated into the query string without URL-encoding -
        # confirm callers only pass URL-safe logins.
        return self.api_get(f"api/serviceaccounts/search?query={login}")

    def create_service_account(
        self, name: str, role: str
    ) -> APIClientResponse["GrafanaAPIClient.Types.GrafanaServiceAccount"]:
        return self.api_post("api/serviceaccounts", {"name": name, "role": role})

    def create_service_account_token(
        self, service_account_id: int, name: str, seconds_to_live: int | None = None
    ) -> APIClientResponse["GrafanaAPIClient.Types.GrafanaServiceAccountToken"]:
        """
        Create a token for the given service account.

        Args:
            service_account_id: id of the service account the token belongs to.
            name: name of the new token.
            seconds_to_live: optional lifetime; `secondsToLive` is only sent when this is truthy.
        """
        # `seconds_to_live` used to be declared as `seconds_to_live=int | None`, which made the *default value*
        # the union type object itself: it is truthy and not JSON serialisable, so omitting the argument
        # broke the request. It is now a proper annotation with a `None` default.
        token_config: typing.Dict[str, typing.Any] = {"name": name}
        if seconds_to_live:
            token_config["secondsToLive"] = seconds_to_live
        return self.api_post(f"api/serviceaccounts/{service_account_id}/tokens", token_config)

    def get_service_account_token_permissions(self) -> APIClientResponse[typing.Dict[str, typing.List[str]]]:
        return self.api_get("api/access-control/user/permissions")

    def setup_organization(self) -> APIClientResponse:
        return self.api_post(f"api/plugins/{PluginID.ONCALL}/resources/plugin/sync?wait=true&force=true")

    def sync(self, organization: "Organization") -> APIClientResponse:
        return self.api_post(f"api/plugins/{organization.active_ui_plugin_id}/resources/plugin/sync")

    @staticmethod
    def validate_grafana_token_format(grafana_token: str) -> bool:
        """Return True when `grafana_token` is a non-empty str of at least MIN_GRAFANA_TOKEN_LENGTH characters."""
        if not grafana_token or not isinstance(grafana_token, str):
            return False
        if len(grafana_token) < GrafanaAPIClient.MIN_GRAFANA_TOKEN_LENGTH:
            return False
        return True
|
|
|
|
|
|
2022-06-03 08:09:47 -06:00
|
|
|
|
|
|
|
|
class GcomAPIClient(APIClient):
|
2022-09-06 10:21:05 -06:00
|
|
|
# query strings for listing instances (see get_instances)
ACTIVE_INSTANCE_QUERY = "instances?status=active"
DELETED_INSTANCE_QUERY = "instances?status=deleted&includeDeleted=true"
# values of an instance's `status` field
STACK_STATUS_DELETED = "deleted"
STACK_STATUS_ACTIVE = "active"
# NOTE(review): presumably the page size callers pass to get_instances - confirm against callers
PAGE_SIZE = 1000
|
2022-06-03 08:09:47 -06:00
|
|
|
|
2023-06-27 12:23:08 +02:00
|
|
|
def __init__(self, api_token: str) -> None:
    """Create a client bound to `settings.GRAFANA_COM_API_URL` that authenticates with `api_token`."""
    super().__init__(settings.GRAFANA_COM_API_URL, api_token)
|
|
|
|
|
|
2023-06-27 12:23:08 +02:00
|
|
|
def get_instance_info(
    self, stack_id: str, include_config_query_param: bool = False
) -> typing.Optional[GCOMInstanceInfo]:
    """
    Fetch `instances/<stack_id>`; returns the decoded payload, or `None` when the call produced none.

    When `include_config_query_param` is True, `?config=true` is appended to the URL.

    NOTE: in order to use ?config=true, an "Admin" GCOM token must be used to make the API call
    """
    url = f"instances/{stack_id}"
    if include_config_query_param:
        url += "?config=true"

    # the call status is deliberately discarded; callers only see the payload (or None)
    data, _ = self.api_get(url)
    return data
|
2022-06-03 08:09:47 -06:00
|
|
|
|
2023-07-28 15:19:27 -06:00
|
|
|
def get_instances(self, query: str, page_size=None):
    """
    Yield pages of instances fetched via ``api_get`` for ``query``.

    Without ``page_size`` a single request is made for ``query`` and its response is yielded as-is.

    With ``page_size`` the listing is walked using cursor-based pagination: requests start at
    cursor ``0`` and follow each response's ``nextCursor`` until it is ``None``. A response that
    is not a dict (e.g. ``None``) or that lacks ``nextCursor`` is NOT yielded; the same cursor is
    requested again instead. At most ``MAX_RETRIES`` such retries are made over the whole
    pagination run, after which iteration stops (the caller may therefore see a truncated listing).
    """
    MAX_RETRIES = 3

    if not page_size:
        page, _ = self.api_get(query)
        yield page
        return

    retry_count = 0
    cursor = 0

    while cursor is not None:
        page, call_status = self.api_get(f"{query}&cursor={cursor}&pageSize={page_size}")

        # The response body may be missing altogether (other methods of this client treat the
        # data returned by api_get as optional), so make sure it is a dict before probing it;
        # a bare `"nextCursor" in None` would raise TypeError.
        if isinstance(page, dict) and "nextCursor" in page:
            cursor = page["nextCursor"]
            yield page
        elif retry_count == MAX_RETRIES:
            # Out of retries: stop paginating, but leave a trace since the listing is incomplete.
            logger.warning(
                f"GcomAPIClient.get_instances giving up after {MAX_RETRIES} retries, "
                f"results are incomplete. cursor={cursor} call_status={call_status}"
            )
            break
        else:
            # nextCursor is missing from the response JSON, let's retry the request.
            #
            # NOTE: this is here because there seems to be a bug in GCOM's API where, when using
            # cursor based pagination, the request is aborted on the GCOM side but still sends
            # HTTP 200 w/ a partial JSON response. This was leading to KeyErrors when trying to
            # read the nextCursor key.
            #
            # How the partial JSON still manages to be decoded is unclear, but for now simply
            # retry the request if this scenario arises. `cursor` has not been advanced, so the
            # next loop iteration re-requests the same page.
            #
            # See this conversation for more context
            # https://raintank-corp.slack.com/archives/C0K031RP1/p1723158123932529
            logger.warning(
                f"GcomAPIClient.get_instances response was missing nextCursor key, likely a decoding error. "
                f"http_response={page} call_status={call_status}"
            )
            retry_count += 1
|
2022-06-03 08:09:47 -06:00
|
|
|
|
Refactor how RBAC enabled/disabled status is determined for Grafana Cloud stacks (#4279)
# What this PR does
In cloud we are currently (somewhat) improperly determining whether or
not a Grafana stack had the `accessControlOnCall` feature flag enabled.
At first things worked fine. We would enable this feature toggle via the
Grafana Admin UI, and then the OnCall backend would read this value from
GCOM's `GET /instance/<stack_id>` endpoint (via
`config.feature_toggles`), and everything worked as expected.
There was a recent change made in `grafana/deployment_tools` to set this
feature flag to True for all stacks. However, for some reason, the GCOM
endpoint above doesn't return the `accessControlOnCall` feature toggle
value in `config.feature_toggles` if it is set in this manner (it only
returns the value if it is set via the Grafana Admin UI).
So, instead of asking GCOM for this feature toggle, we should infer
whether RBAC is enabled on the stack by doing a
`HEAD /api/access-control/users/permissions/search` (this endpoint _is
only_ available on a Grafana stack if `accessControlOnCall` is enabled).
**Few caveats to this ☝️**
1. we first have to make sure that the cloud stack is in an `active`
state (ie. not paused). This is because, no matter if the
`accessControlOnCall` is enabled or not, if the stack is in a `paused`
state it will ALWAYS return `HTTP 200` which can be misleading and lead
to bugs (this feels like a bug on the Grafana API, will follow up with
core grafana team)
2. Once we roll out this change we will effectively **actually** be
enabling RBAC for OnCall for all orgs. The Identity Access team would
prefer a progressive rollout, which is why I decided to introduce the
concept of
[`settings.CLOUD_RBAC_ROLLOUT_PERCENTAGE`](https://github.com/grafana/oncall/pull/4279/files#diff-3383aef931e41e44d95829ad971641eeb98fe001be2f5da92217446d300ea1b3R918)
(see also [`Organization.should_be_considered_for_rbac_permissioning`](https://github.com/grafana/oncall/pull/4279/files#diff-2ca9917f4f56349be39545ee8abd459be5076295d02ca3a7ec545152fcddccdfR348-R362))
## Which issue(s) this PR closes
Related to https://github.com/grafana/identity-access-team/issues/667
## Checklist
- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
show up in the autogenerated release notes.
2024-05-14 12:30:16 -04:00
|
|
|
def _is_stack_in_certain_state(self, stack_id: str, state: str) -> bool:
    """
    Tell whether the stack's ``status`` (as reported by ``get_instance_info``) equals ``state``.

    Returns ``False`` when no instance info could be obtained for ``stack_id``.
    """
    info = self.get_instance_info(stack_id)
    return bool(info) and info.get("status") == state
|
|
|
|
|
|
2022-06-03 08:09:47 -06:00
|
|
|
def is_stack_deleted(self, stack_id: str) -> bool:
    """
    Tell whether GCOM reports the stack ``stack_id`` as deleted.

    Queries ``instances?includeDeleted=true&id=<stack_id>`` and inspects the first entry of the
    response's ``items`` list. Always returns a real ``bool``: ``True`` only when that entry's
    ``status`` equals ``STACK_STATUS_DELETED``; ``False`` when the response body is missing or
    not a dict, when ``items`` is absent or empty, or when the status is anything else.
    """
    url = f"instances?includeDeleted=true&id={stack_id}"
    instance_infos, _ = self.api_get(url)

    # No usable body: we cannot confirm the deletion, so answer False instead of blowing up
    # on the subscript (mirrors how _is_stack_in_certain_state treats missing instance info).
    if not isinstance(instance_infos, dict):
        return False

    items = instance_infos.get("items")
    if not items:
        # Previously the empty list itself leaked out here, contradicting the `-> bool` contract.
        return False

    return items[0].get("status") == self.STACK_STATUS_DELETED
|
2022-06-03 08:09:47 -06:00
|
|
|
|
Refactor how RBAC enabled/disabled status is determined for Grafana Cloud stacks (#4279)
# What this PR does
In cloud we are currently (somewhat) improperly determining whether or
not a Grafana stack had the `accessControlOnCall` feature flag enabled.
At first things worked fine. We would enable this feature toggle via the
Grafana Admin UI, and then the OnCall backend would read this value from
GCOM's `GET /instance/<stack_id>` endpoint (via
`config.feature_toggles`), and everything worked as expected.
There was a recent change made in `grafana/deployment_tools` to set this
feature flag to True for all stacks. However, for some reason, the GCOM
endpoint above doesn't return the `accessControlOnCall` feature toggle
value in `config.feature_toggles` if it is set in this manner (it only
returns the value if it is set via the Grafana Admin UI).
So, instead of asking GCOM for this feature toggle, we should infer
whether RBAC is enabled on the stack by doing a
`HEAD /api/access-control/users/permissions/search` (this endpoint _is
only_ available on a Grafana stack if `accessControlOnCall` is enabled).
**Few caveats to this ☝️**
1. we first have to make sure that the cloud stack is in an `active`
state (ie. not paused). This is because, no matter if the
`accessControlOnCall` is enabled or not, if the stack is in a `paused`
state it will ALWAYS return `HTTP 200` which can be misleading and lead
to bugs (this feels like a bug on the Grafana API, will follow up with
core grafana team)
2. Once we roll out this change we will effectively **actually** be
enabling RBAC for OnCall for all orgs. The Identity Access team would
prefer a progressive rollout, which is why I decided to introduce the
concept of
[`settings.CLOUD_RBAC_ROLLOUT_PERCENTAGE`](https://github.com/grafana/oncall/pull/4279/files#diff-3383aef931e41e44d95829ad971641eeb98fe001be2f5da92217446d300ea1b3R918)
(see also [`Organization.should_be_considered_for_rbac_permissioning`](https://github.com/grafana/oncall/pull/4279/files#diff-2ca9917f4f56349be39545ee8abd459be5076295d02ca3a7ec545152fcddccdfR348-R362))
## Which issue(s) this PR closes
Related to https://github.com/grafana/identity-access-team/issues/667
## Checklist
- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
show up in the autogenerated release notes.
2024-05-14 12:30:16 -04:00
|
|
|
def is_stack_active(self, stack_id: str) -> bool:
    """Tell whether the stack ``stack_id`` is in the ``STACK_STATUS_ACTIVE`` state."""
    active_state = self.STACK_STATUS_ACTIVE
    return self._is_stack_in_certain_state(stack_id, active_state)
|
|
|
|
|
|
2023-06-27 12:23:08 +02:00
|
|
|
def post_active_users(self, body) -> APIClientResponse:
    """POST ``body`` to the ``app-active-users`` endpoint and return the ``api_post`` result."""
    endpoint = "app-active-users"
    return self.api_post(endpoint, body)
|
2022-10-24 21:25:32 -06:00
|
|
|
|
2023-06-27 12:23:08 +02:00
|
|
|
def get_stack_regions(self) -> APIClientResponse:
    """GET the ``stack-regions`` endpoint and return the ``api_get`` result unchanged."""
    endpoint = "stack-regions"
    return self.api_get(endpoint)
|