e2e tests on CI - actually await k8s resources to be ready before starting tests (#1997)

Occasionally, the Playwright global setup step (which authenticates w/
the Grafana API + configures the plugin) would fail, causing the CI
job to fail instantly (Playwright doesn't retry global setup if it
fails).

My current hypothesis as to why this is happening is that the
`oncall-engine` and `oncall-celery` pods aren't _actually_ ready in
these cases, based on the way the `jupyterhub/action-k8s-await-workloads`
action awaits k8s workloads:

<img width="1076" alt="Screenshot 2023-05-23 at 18 24 36"
src="https://github.com/grafana/oncall/assets/9406895/68d8d2d9-4274-4749-8788-e0a9a3dbad83">


By using the `kubectl rollout status deployment/<deployment-name>
--timeout=300s` instead, we can be sure that these pods are _actually_
ready to receive traffic before we start the tests.
```bash
❯ kubectl rollout status --help
Show the status of the rollout.

 By default 'rollout status' will watch the status of the latest rollout until it's done. If you don't want to wait for
the rollout to finish then you can use --watch=false. Note that if a new rollout starts in-between, then 'rollout
status' will continue watching the latest revision. If you want to pin to a specific revision and abort if it is rolled
over by another revision, use --revision=N where N is the revision you need to watch for.
```

Lastly, even despite this, sometimes the `POST
/api/internal/v1/plugin/sync` endpoint will return HTTP 500 ([example
logs](https://github.com/grafana/oncall/actions/runs/5062712137/jobs/9088529416#step:19:2536)
from a failed CI job). In this case, let's set up the Playwright global
setup to retry 3 times.
This commit is contained in:
Joey Orlando 2023-05-23 20:20:46 -04:00 committed by GitHub
parent c793e550c6
commit eefe7be56a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 14 deletions

View file

@ -287,6 +287,13 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
- name: Collect Workflow Telemetry
uses: runforesight/workflow-telemetry-action@v1
with:
comment_on_pr: false
proc_trace_chart_show: false
proc_trace_table_show: false
- name: Create k8s Kind Cluster
uses: helm/kind-action@v1.3.0
with:
@ -340,9 +347,12 @@ jobs:
- name: Load engine Docker image on the nodes of the cluster
run: kind load image-archive --name=chart-testing /tmp/oncall-engine.tar
# spin up 3 engine, 3 celery, and 3 grafana pods, this will allow us to parralelize the integration tests,
# spin up 3 engine and 3 celery pods, this will allow us to parralelize the integration tests,
# and complete them much faster by using multiple test processes
# With just 1 engine/celery/grafana pod, the backend crawls to a halt when there is > 1 parallelized integration test process
# NOTE: it appears that using > 1 grafana container w/ SQLite as the database sometimes leads to failed
# grafana database migrations (this is documented in this GitHub issue
# https://github.com/bitnami/charts/issues/10905)
#
# by settings grafana.plugins to [] and configuring grafana.extraVolumeMounts we are using the locally built
# OnCall plugin rather than the latest published version
@ -361,7 +371,7 @@ jobs:
--set oncall.twilio.authToken="${{ secrets.TWILIO_AUTH_TOKEN }}" \
--set oncall.twilio.phoneNumber="\"${{ secrets.TWILIO_PHONE_NUMBER }}"\" \
--set oncall.twilio.verifySid="${{ secrets.TWILIO_VERIFY_SID }}" \
--set grafana.replicas=3 \
--set grafana.replicas=1 \
--set grafana.image.tag=${{ matrix.grafana-image-tag }} \
--set grafana.env.GF_SECURITY_ADMIN_USER=oncall \
--set grafana.env.GF_SECURITY_ADMIN_PASSWORD=oncall \
@ -400,13 +410,15 @@ jobs:
working-directory: grafana-plugin
run: ./node_modules/.bin/playwright install-deps chromium firefox webkit
- name: Await k8s pods and other resources up
uses: jupyterhub/action-k8s-await-workloads@v1
with:
workloads: "" # all
namespace: "" # default
timeout: 300
max-restarts: -1
# we could instead use the --wait flag for the helm install command above
# but there's no reason to block on that step
# instead we can let the k8s resources start up behind the scenes and do other
# setup tasks (ex. install playwright + its dependencies)
- name: Wait until k8s resources are ready
run: |
kubectl rollout status deployment/helm-testing-grafana --timeout=300s
kubectl rollout status deployment/helm-testing-oncall-engine --timeout=300s
kubectl rollout status deployment/helm-testing-oncall-celery --timeout=300s
- name: Run Integration Tests
env:
@ -425,14 +437,14 @@ jobs:
working-directory: ./grafana-plugin
run: yarn test:integration
# always spit out the engine and celery logs, AFTER the e2e tests have completed
# can be helpful for debugging failing/flaky tests
# spit out the engine, celery, and grafana logs, if the the e2e tests have failed
# can be helpful for debugging failing tests
# GitHub Action reference: https://github.com/jupyterhub/action-k8s-namespace-report
- name: Kubernetes namespace report
uses: jupyterhub/action-k8s-namespace-report@v1
if: failure()
with:
important-workloads: "deploy/helm-testing-oncall-engine deploy/helm-testing-oncall-celery"
important-workloads: "deploy/helm-testing-oncall-engine deploy/helm-testing-oncall-celery deploy/helm-testing-grafana"
- uses: actions/upload-artifact@v3
if: failure()

View file

@ -4,6 +4,8 @@ import { BASE_URL, GRAFANA_PASSWORD, GRAFANA_USERNAME, IS_OPEN_SOURCE, ONCALL_AP
import { clickButton, getInputByName } from './utils/forms';
import { goToGrafanaPage } from './utils/navigation';
const GLOBAL_SETUP_RETRIES = 3;
/**
* go to config page and wait for plugin icon to be available on left-hand navigation
*/
@ -67,4 +69,23 @@ const globalSetup = async (config: FullConfig): Promise<void> => {
await browserContext.close();
};
export default globalSetup;
/**
 * Retry wrapper around globalSetup, in the event that it fails due to an
 * oncall-engine/oncall-celery backend error. Sometimes the
 * POST /api/internal/v1/plugin/sync endpoint will randomly return HTTP 500.
 * See here for an example CI job which failed global setup
 * https://github.com/grafana/oncall/actions/runs/5062712137/jobs/9088529416#step:19:2536
 *
 * Makes up to GLOBAL_SETUP_RETRIES total attempts. Intermediate failures are
 * logged (instead of being silently swallowed) so flaky-setup root causes show
 * up in CI logs; the final attempt's error, if any, propagates to Playwright.
 *
 * References on retrying playwright global setup
 * https://github.com/microsoft/playwright/discussions/11371
 */
const globalSetupWithRetries = async (config: FullConfig): Promise<void> => {
  for (let attempt = 1; attempt < GLOBAL_SETUP_RETRIES; attempt++) {
    try {
      return await globalSetup(config);
    } catch (e: unknown) {
      // Log rather than swallow — otherwise a flaky setup leaves no trace of
      // why the earlier attempts failed.
      console.error(`Playwright global setup attempt ${attempt}/${GLOBAL_SETUP_RETRIES} failed, retrying`, e);
    }
  }
  // One last time, letting any error propagate so Playwright reports the failure.
  await globalSetup(config);
};
export default globalSetupWithRetries;

View file

@ -33,7 +33,7 @@ const config: PlaywrightTestConfig = {
* to flaky tests.. let's just retry failed tests. If the same test fails 3 times, you know something must be up
*/
retries: !!process.env.CI ? 3 : 0,
workers: 1,
workers: !!process.env.CI ? 2 : 1,
/* Reporter to use. See https://playwright.dev/docs/test-reporters */
reporter: 'html',
/* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */