Use Tilt CI to run e2e tests on Github workflows (#3842)

# What this PR does

- Reuse Tiltfile from local environment and use `tilt ci` to run e2e
tests on Github
- Use Playwright Docker image to get rid of installing Playwright
browsers and system dependencies
- Use ubuntu-latest-16-cores runner for e2e tests job on CI

## Which issue(s) this PR fixes

Closes https://github.com/grafana/oncall/issues/4018

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
    show up in the autogenerated release notes.
This commit is contained in:
Dominik Broj 2024-03-22 13:29:22 +01:00 committed by GitHub
parent 73cfaf25c0
commit 9ff486078f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 79 additions and 300 deletions

View file

@ -1,119 +0,0 @@
base_url: 172.17.0.1:30001
base_url_protocol: http
env:
- name: GRAFANA_CLOUD_NOTIFICATIONS_ENABLED
value: "False"
- name: FEATURE_PROMETHEUS_EXPORTER_ENABLED
value: "True"
image:
repository: oncall/engine
tag: latest
pullPolicy: IfNotPresent
oncall:
devMode: true
broker:
type: redis
redis:
architecture: standalone # don't run replicas, just eats up resources
rabbitmq:
enabled: false
engine:
replicaCount: 1
celery:
replicaCount: 1
worker_beat_enabled: false
grafana:
replicas: 1
extraInitContainers:
- name: create-db-if-not-exists
image: mysql:8.0.32
command:
# yamllint disable rule:line-length
[
"bash",
"-c",
'while ! mysqladmin ping -h "$DATABASE_HOST" --silent; do echo ''awaiting mysql db to be available'' && sleep 1; done && mysql -h "$DATABASE_HOST" -u "$DATABASE_USER" -p"$DATABASE_PASSWORD" -e ''CREATE DATABASE IF NOT EXISTS grafana CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;''',
]
# yamllint enable rule:line-length
env:
- name: DATABASE_HOST
value: oncall-ci-mariadb
- name: DATABASE_USER
value: root
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: oncall-ci-mariadb
key: mariadb-root-password
env:
GF_FEATURE_TOGGLES_ENABLE: topnav
GF_SECURITY_ADMIN_PASSWORD: oncall
GF_SECURITY_ADMIN_USER: oncall
GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS: grafana-oncall-app
GF_DATABASE_TYPE: mysql
GF_DATABASE_HOST: oncall-ci-mariadb:3306
GF_DATABASE_USER: root
GF_DATABASE_SSL_MODE: disable
envValueFrom:
GF_DATABASE_PASSWORD:
secretKeyRef:
name: oncall-ci-mariadb
key: mariadb-root-password
# by setting grafana.plugins to [] and configuring grafana.extraVolumeMounts we are using the locally built
# OnCall plugin rather than the latest published version
plugins: []
extraVolumeMounts:
- name: plugins
mountPath: /var/lib/grafana/plugins/grafana-plugin
# hostPath is defined in .github/kind.yml
hostPath: /oncall-plugin
readOnly: true
service:
type: NodePort
nodePort: 30002
database:
type: mysql
mariadb:
enabled: true
primary:
service:
type: NodePort
nodePort: 30003
extraEnvVars:
# See "Passing extra command line flags to mysqld startup" section
# https://hub.docker.com/r/bitnami/mariadb
#
# max_allowed_packet is set to 128mb in bytes
#
# this avoids "Got an error reading communication packets" errors that arise from the grafana container
# apparently sending too much data to mariadb at once
# https://mariadb.com/docs/skysql-dbaas/ref/mdb/system-variables/max_allowed_packet/
- name: MARIADB_EXTRA_FLAGS
value: "--max_allowed_packet=134217728 --max_connections=1024"
- name: MARIADB_CHARACTER_SET
value: utf8mb4
- name: MARIADB_COLLATE
value: utf8mb4_unicode_ci
ingress:
enabled: false
ingress-nginx:
enabled: false
cert-manager:
enabled: false
service:
enabled: true
type: NodePort
port: 8080
nodePort: 30001
prometheus:
enabled: true
extraScrapeConfigs: |
- job_name: 'oncall-exporter'
metrics_path: /metrics/
static_configs:
- targets:
- oncall-dev-engine.default.svc.cluster.local:8080

19
.github/kind.yml vendored
View file

@ -1,19 +0,0 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
extraPortMappings:
- containerPort: 30001
hostPort: 30001
- containerPort: 30002
hostPort: 30002
# https://stackoverflow.com/a/62695918
extraMounts:
# this basically mounts our local ./grafana-plugin (frontend) directory into the kind node
# so that we can later use a volumeMount to mount from the kind-control-plane Docker container -> grafana
# k8s pod. This will allow us to mount the current frontend source code
#
# NOTE: this is a bit hacky and implies that kind create is run from the root of the project
# but for now it works... alternative would be to use something like $(pwd)/grafana-plugin
- hostPath: ./grafana-plugin
containerPath: /oncall-plugin

View file

@ -33,7 +33,7 @@ jobs:
# default "ubuntu-latest" runners only provide 2 CPU cores + 7GB of RAM. this seems to lead to HTTP 504s from
# the oncall backend, and hence, flaky tests. Let's use CI runners w/ more resources to avoid this (plus
# this will allow us to run more backend containers and parallelize the tests)
runs-on: ubuntu-latest-8-cores
runs-on: ubuntu-latest-16-cores
name: "Grafana: ${{ inputs.grafana-image-tag }}"
environment:
name: github-pages
@ -44,15 +44,6 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
# TODO: re-enable this when we get the docker build build-context caching working.. see other TODO comment below
# - uses: actions/setup-python@v4
# with:
# python-version: "3.11.4"
# cache: "pip"
# cache-dependency-path: |
# engine/requirements.txt
# engine/requirements-dev.txt
- name: Collect Workflow Telemetry
uses: runforesight/workflow-telemetry-action@v1
with:
@ -60,10 +51,11 @@ jobs:
proc_trace_chart_show: false
proc_trace_table_show: false
- name: Create k8s Kind Cluster
- name: Install Kind
uses: helm/kind-action@v1.3.0
with:
config: ./.github/kind.yml
config: ./dev/kind.yml
install_only: true
- uses: actions/setup-node@v3
with:
@ -71,6 +63,17 @@ jobs:
cache: "yarn"
cache-dependency-path: grafana-plugin/yarn.lock
- name: Install Tilt
run: |
curl -fsSL https://raw.githubusercontent.com/tilt-dev/tilt/master/scripts/install.sh | bash
- name: Install ctlptl
run: |
CTLPTL_VERSION="0.8.20"
CTLPTL_FILE_NAME="ctlptl.$CTLPTL_VERSION.linux.x86_64.tar.gz"
curl -fsSL https://github.com/tilt-dev/ctlptl/releases/download/v$CTLPTL_VERSION/$CTLPTL_FILE_NAME | \
tar -xzv -C /usr/local/bin ctlptl
- name: Use cached frontend dependencies
id: cache-frontend-dependencies
uses: actions/cache@v3
@ -95,42 +98,6 @@ jobs:
working-directory: grafana-plugin
run: yarn build:dev
- name: Set up Docker Buildx # We need this step for docker caching
uses: docker/setup-buildx-action@v2
- name: Build engine Docker image locally
uses: docker/build-push-action@v4
with:
context: ./engine
file: ./engine/Dockerfile
push: false
tags: oncall/engine:latest
outputs: type=docker,dest=/tmp/oncall-engine.tar
# TODO: figure out how to get this to work.. this will substantially speed up building our docker image here
# because right now most time is spent building wheels for python dependencies
# (even though they rarely change).. this portion "should" work however I haven't yet figured out how to
# get the cache bind mount in engine/Dockerfile to work optionally (ie. when we don't specify
# the --build-context flag to docker build.. otherwise it fails if pip_cache is not available)
#
# references
# https://github.com/moby/buildkit/blob/master/frontend/dockerfile/docs/reference.md#run---mounttypecache
# https://stackoverflow.com/a/71846527
# build-contexts: pip_cache=/home/runner/.cache/pip
- name: Load engine Docker image on the nodes of the cluster
run: kind load image-archive --name=chart-testing /tmp/oncall-engine.tar
- name: Install helm chart
run: |
helm install oncall-ci \
--values ./.github/helm-values.yml \
--set oncall.twilio.accountSid="${{ secrets.TWILIO_ACCOUNT_SID }}" \
--set oncall.twilio.authToken="${{ secrets.TWILIO_AUTH_TOKEN }}" \
--set oncall.twilio.phoneNumber="\"${{ secrets.TWILIO_PHONE_NUMBER }}"\" \
--set oncall.twilio.verifySid="${{ secrets.TWILIO_VERIFY_SID }}" \
--set grafana.image.tag=${{ inputs.grafana-image-tag }} \
./helm/oncall
# helpful reference for properly caching the playwright binaries/dependencies
# https://playwrightsolutions.com/playwright-github-action-to-cache-the-browser-binaries/
- name: Get installed Playwright version
@ -147,87 +114,34 @@ jobs:
path: "~/.cache/ms-playwright"
key: ${{ runner.os }}-playwright-${{ env.PLAYWRIGHT_VERSION }}-${{ inputs.browsers }}
# For the next two steps, use the binary directly from node_modules/.bin as opposed to npx playwright
# due to this bug (https://github.com/microsoft/playwright/issues/13188)
- name: Install Playwright Browsers
if: steps.playwright-cache.outputs.cache-hit != 'true'
working-directory: grafana-plugin
run: ./node_modules/.bin/playwright install --with-deps ${{ inputs.browsers }}
# use the cached browsers, but we still need to install the necessary system dependencies
# (system deps are installed in the cache-miss step above by the --with-deps flag)
- name: Install Playwright System Dependencies
if: steps.playwright-cache.outputs.cache-hit == 'true'
working-directory: grafana-plugin
run: ./node_modules/.bin/playwright install-deps ${{ inputs.browsers }}
# we could instead use the --wait flag for the helm install command above
# but there's no reason to block on that step
# instead we can let the k8s resources start up behind the scenes and do other
# setup tasks (ex. install playwright + its dependencies)
- name: Wait until k8s resources are ready
- name: Create cluster
run: |
kubectl rollout status deployment/oncall-ci-grafana --timeout=300s
kubectl rollout status deployment/oncall-ci-engine --timeout=300s
kubectl rollout status deployment/oncall-ci-celery --timeout=300s
make cluster/up
- name: Run e2e Tests
- name: Install Playwright deps
uses: docker://mcr.microsoft.com/playwright:next-jammy
- name: Tilt CI
shell: bash
env:
# BASE_URL represents what is accessed via a browser
BASE_URL: http://localhost:30002/grafana
# ONCALL_API_URL is what is configured in the plugin configuration form
# it is what the grafana container uses to communicate with the OnCall backend
#
# 172.17.0.1 is the docker bridge network default gateway. Requests originate in the grafana container
# hit 172.17.0.1 which proxies the request onto the host where port 30001 is the node port that is mapped
# to the OnCall API
ONCALL_API_URL: http://172.17.0.1:30001
GRAFANA_ADMIN_USERNAME: oncall
GRAFANA_ADMIN_PASSWORD: oncall
GRAFANA_EDITOR_USERNAME: editor
GRAFANA_EDITOR_PASSWORD: editor
GRAFANA_VIEWER_USERNAME: viewer
GRAFANA_VIEWER_PASSWORD: viewer
MAILSLURP_API_KEY: ${{ secrets.MAILSLURP_API_KEY }}
GRAFANA_IMAGE_TAG: ${{ inputs.grafana-image-tag }}
BROWSERS: ${{ inputs.browsers }}
working-directory: ./grafana-plugin
run: yarn test:e2e
run: tilt ci
- name: Run expensive e2e Tests
- name: Tilt CI - expensive E2E tests
if: inputs.run-expensive-tests
shell: bash
env:
BASE_URL: http://localhost:30002/grafana
ONCALL_API_URL: http://172.17.0.1:30001
GRAFANA_ADMIN_USERNAME: oncall
GRAFANA_ADMIN_PASSWORD: oncall
GRAFANA_EDITOR_USERNAME: editor
GRAFANA_EDITOR_PASSWORD: editor
GRAFANA_VIEWER_USERNAME: viewer
GRAFANA_VIEWER_PASSWORD: viewer
E2E_TESTS_CMD: "cd grafana-plugin && yarn test:e2e-expensive"
GRAFANA_IMAGE_TAG: ${{ inputs.grafana-image-tag }}
BROWSERS: ${{ inputs.browsers }}
MAILSLURP_API_KEY: ${{ secrets.MAILSLURP_API_KEY }}
working-directory: ./grafana-plugin
run: yarn test:e2e-expensive
# spit out the engine, celery, and grafana logs, if the e2e tests have failed (or were flaky)
# can be helpful for debugging these tests
# GitHub Action reference: https://github.com/jupyterhub/action-k8s-namespace-report
- name: oncall-engine logs
if: always()
uses: jupyterhub/action-k8s-namespace-report@v1
with:
important-workloads: deploy/oncall-ci-engine
- name: oncall-celery logs
if: always()
uses: jupyterhub/action-k8s-namespace-report@v1
with:
important-workloads: deploy/oncall-ci-celery
- name: grafana logs
if: always()
uses: jupyterhub/action-k8s-namespace-report@v1
with:
important-workloads: deploy/oncall-ci-grafana
TWILIO_ACCOUNT_SID: ${{ secrets.TWILIO_ACCOUNT_SID }}
TWILIO_AUTH_TOKEN: ${{ secrets.TWILIO_AUTH_TOKEN }}
# wrapping single quotes are required to prevent stripping leading "+" from the number
TWILIO_PHONE_NUMBER: '"${{ secrets.TWILIO_PHONE_NUMBER }}"'
TWILIO_VERIFY_SID: ${{ secrets.TWILIO_VERIFY_SID }}
run: tilt ci
- name: Setup Pages
if: failure()

View file

@ -2,6 +2,15 @@ load('ext://uibutton', 'cmd_button', 'location', 'text_input', 'bool_input')
running_under_parent_tiltfile = os.getenv("TILT_PARENT", "false") == "true"
# The user/pass that you will login to Grafana with
grafana_admin_user_pass = os.getenv("GRAFANA_ADMIN_USER_PASS", "oncall")
grafana_image_tag = os.getenv("GRAFANA_IMAGE_TAG", "latest")
e2e_tests_cmd=os.getenv("E2E_TESTS_CMD", "cd grafana-plugin && yarn test:e2e")
twilio_values=[
"oncall.twilio.accountSid=" + os.getenv("TWILIO_ACCOUNT_SID", ""),
"oncall.twilio.authToken=" + os.getenv("TWILIO_AUTH_TOKEN", ""),
"oncall.twilio.phoneNumber=" + os.getenv("TWILIO_PHONE_NUMBER", ""),
"oncall.twilio.verifySid=" + os.getenv("TWILIO_VERIFY_SID", ""),
]
is_ci=config.tilt_subcommand == "ci"
# HELM_PREFIX must be "oncall-dev" as it is hardcoded in dev/helm-local.yml
HELM_PREFIX = "oncall-dev"
# Use docker registry generated by ctlptl (dev/kind-config.yaml)
@ -54,7 +63,6 @@ docker_build_sub(
local_resource(
"build-ui",
labels=["OnCallUI"],
cmd="cd grafana-plugin && yarn install && yarn build:dev",
serve_cmd="cd grafana-plugin && yarn watch",
allow_parallel=True,
)
@ -62,10 +70,10 @@ local_resource(
local_resource(
"e2e-tests",
labels=["E2eTests"],
cmd="cd grafana-plugin && yarn test:e2e",
cmd=e2e_tests_cmd,
trigger_mode=TRIGGER_MODE_MANUAL,
auto_init=False,
resource_deps=["build-ui", "grafana", "grafana-oncall-app-provisioning-configmap", "engine"]
auto_init=is_ci,
resource_deps=["build-ui", "grafana", "grafana-oncall-app-provisioning-configmap", "engine", "celery"]
)
cmd_button(
@ -77,7 +85,6 @@ cmd_button(
inputs=[
text_input("BROWSERS", "Browsers (e.g. \"chromium,firefox,webkit\")", "chromium", "chromium,firefox,webkit"),
text_input("TESTS_FILTER", "Test filter (e.g. \"timezones.test quality.test\")", "", "Test file names to run"),
bool_input("REPORTER", "Use HTML reporter", True, 'html', 'line'),
bool_input("STOP_ON_FIRST_FAILURE", "Stop on first failure", True, "-x", ""),
]
)
@ -106,7 +113,7 @@ cmd_button(
icon_name="dangerous",
)
yaml = helm("helm/oncall", name=HELM_PREFIX, values=["./dev/helm-local.yml", "./dev/helm-local.dev.yml"])
yaml = helm("helm/oncall", name=HELM_PREFIX, values=["./dev/helm-local.yml", "./dev/helm-local.dev.yml"], set=twilio_values)
k8s_yaml(yaml)
@ -127,6 +134,7 @@ k8s_resource(
# Use separate grafana helm chart
if not running_under_parent_tiltfile:
grafana(
grafana_version=grafana_image_tag,
context="grafana-plugin",
plugin_files=["grafana-plugin/src/plugin.json"],
namespace="default",
@ -161,5 +169,4 @@ k8s_resource(
def resource_name(id):
return id.name.replace(HELM_PREFIX + "-", "")
workload_to_resource_function(resource_name)

View file

@ -28,14 +28,7 @@ engine:
replicaCount: 1
celery:
replicaCount: 1
ui:
enabled: false
image:
repository: localhost:63628/oncall/ui
env:
ONCALL_API_URL: http://oncall-dev-engine:8080
MOBILE_APP_QR_INTERVAL_QUEUE: 290000 # 4 minutes and 50 seconds
worker_beat_enabled: false
externalGrafana:
url: http://grafana:3000
@ -47,8 +40,6 @@ grafana:
domain: localhost:3000
root_url: "%(protocol)s://%(domain)s"
replicas: 1
image:
tag: 10.0.2
extraInitContainers:
- name: create-db-if-not-exists
image: mysql:8.0.32
@ -137,6 +128,9 @@ service:
nodePort: 30001
prometheus:
enabled: true
server:
global:
scrape_interval: 10s
extraScrapeConfigs: |
- job_name: 'oncall-exporter'
metrics_path: /metrics/

View file

@ -21,7 +21,7 @@ test.skip(
);
test.describe('Insights', () => {
test.beforeAll(async ({ adminRolePage: { page, userName } }) => {
test.beforeAll(async ({ adminRolePage: { page } }) => {
const DATASOURCE_NAME = 'OnCall Prometheus';
const DATASOURCE_URL = 'http://oncall-dev-prometheus-server.default.svc.cluster.local';
@ -37,21 +37,6 @@ test.describe('Insights', () => {
await page.getByPlaceholder('http://localhost:9090').fill(DATASOURCE_URL);
await clickButton({ page, buttonText: 'Save & test' });
}
// send alert and resolve to get some values in insights
const escalationChainName = generateRandomValue();
const integrationName = generateRandomValue();
const onCallScheduleName = generateRandomValue();
await createOnCallScheduleWithRotation(page, onCallScheduleName, userName);
await createEscalationChain(
page,
escalationChainName,
EscalationStep.NotifyUsersFromOnCallSchedule,
onCallScheduleName
);
await createIntegrationAndSendDemoAlert(page, integrationName, escalationChainName);
await resolveFiringAlert(page);
await page.waitForTimeout(5000);
});
test('Viewer can see all the panels in OnCall insights', async ({ viewerRolePage: { page } }) => {
@ -69,11 +54,30 @@ test.describe('Insights', () => {
});
});
test('There is no panel that misses data', async ({ adminRolePage: { page } }) => {
test('There is no panel that misses data', async ({ adminRolePage: { page, userName } }) => {
test.setTimeout(90_000);
// send alert and resolve to get some values in insights
const escalationChainName = generateRandomValue();
const integrationName = generateRandomValue();
const onCallScheduleName = generateRandomValue();
await createOnCallScheduleWithRotation(page, onCallScheduleName, userName);
await createEscalationChain(
page,
escalationChainName,
EscalationStep.NotifyUsersFromOnCallSchedule,
onCallScheduleName
);
await createIntegrationAndSendDemoAlert(page, integrationName, escalationChainName);
await resolveFiringAlert(page);
// wait for Prometheus to scrape the data
await page.waitForTimeout(5000);
// check that we have data in insights panels
await goToOnCallPage(page, 'insights');
await page.getByText('Last 24 hours').click();
await page.getByText('Last 1 hour').click();
await page.waitForTimeout(2000);
await page.waitForTimeout(3000);
await expect(page.getByText('No data')).toBeHidden();
});
});

View file

@ -10,7 +10,8 @@ type OnCallPage =
| 'outgoing_webhooks'
| 'users'
| 'users/me'
| 'insights';
| 'insights'
| 'settings';
const _goToPage = async (page: Page, url = '') => page.goto(`${BASE_URL}${url}`);

View file

@ -17,6 +17,7 @@
"test:e2e": "yarn playwright test --grep-invert @expensive",
"test:e2e-expensive": "yarn playwright test --grep @expensive",
"test:e2e:watch": "yarn test:e2e --ui",
"test:e2e-expensive:watch": "yarn test:e2e-expensive --ui",
"test:e2e:gen": "yarn playwright codegen http://localhost:3000",
"e2e-show-report": "yarn playwright show-report",
"generate-types": "cd ./src/network/oncall-api/types-generator && yarn generate",

View file

@ -13,10 +13,6 @@ export const ADMIN_USER_STORAGE_STATE = path.join(__dirname, 'e2e-tests/.auth/ad
const IS_CI = !!process.env.CI;
const BROWSERS = process.env.BROWSERS || 'chromium';
const REPORTER_WITH_DEFAULT = process.env.REPORTER || 'html';
const REPORTER = (
process.env.REPORTER === 'html' ? [['html', { open: 'never' }]] : REPORTER_WITH_DEFAULT
) as PlaywrightTestConfig['reporter'];
const SETUP_PROJECT_NAME = 'setup';
const getEnabledBrowsers = (browsers: PlaywrightTestProject[]) =>
@ -31,7 +27,7 @@ export default defineConfig({
/* Maximum time all the tests can run for. */
globalTimeout: 20 * 60 * 1_000, // 20 minutes
reporter: REPORTER,
reporter: [['html', { open: IS_CI ? 'never' : 'always' }]],
/* Maximum time one test can run for. */
timeout: 60_000,