e2e tests on CI - actually await k8s resources to be ready before starting tests (#1997)

Occasionally, the Playwright global setup step (which authenticates w/
the Grafana API + configures the plugin) would fail, causing the CI
job to fail instantly (Playwright doesn't retry global setup if it
fails).

My current hypothesis as to why this is happening is that the
`oncall-engine` and `oncall-celery` pods aren't _actually_ ready in
these cases, based on the way the `jupyterhub/action-k8s-await-workloads`
action awaits k8s workloads:

<img width="1076" alt="Screenshot 2023-05-23 at 18 24 36"
src="https://github.com/grafana/oncall/assets/9406895/68d8d2d9-4274-4749-8788-e0a9a3dbad83">


By using the `kubectl rollout status deployment/<deployment-name>
--timeout=300s` instead, we can be sure that these pods are _actually_
ready to receive traffic before we start the tests.
```bash
❯ kubectl rollout status --help
Show the status of the rollout.

 By default 'rollout status' will watch the status of the latest rollout until it's done. If you don't want to wait for
the rollout to finish then you can use --watch=false. Note that if a new rollout starts in-between, then 'rollout
status' will continue watching the latest revision. If you want to pin to a specific revision and abort if it is rolled
over by another revision, use --revision=N where N is the revision you need to watch for.
```

Lastly, even despite this, sometimes the `POST
/api/internal/v1/plugin/sync` endpoint will return HTTP 500 ([example
logs](https://github.com/grafana/oncall/actions/runs/5062712137/jobs/9088529416#step:19:2536)
from a failed CI job). In this case, let's set up the Playwright global
setup to retry 3 times.
This commit is contained in:
Joey Orlando 2023-05-23 20:20:46 -04:00 committed by GitHub
parent c793e550c6
commit eefe7be56a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 14 deletions

View file

@ -287,6 +287,13 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
- name: Collect Workflow Telemetry
uses: runforesight/workflow-telemetry-action@v1
with:
comment_on_pr: false
proc_trace_chart_show: false
proc_trace_table_show: false
- name: Create k8s Kind Cluster
uses: helm/kind-action@v1.3.0
with:
@ -340,9 +347,12 @@ jobs:
- name: Load engine Docker image on the nodes of the cluster
run: kind load image-archive --name=chart-testing /tmp/oncall-engine.tar
# spin up 3 engine, 3 celery, and 3 grafana pods, this will allow us to parralelize the integration tests,
# spin up 3 engine and 3 celery pods, this will allow us to parralelize the integration tests,
# and complete them much faster by using multiple test processes
# With just 1 engine/celery/grafana pod, the backend crawls to a halt when there is > 1 parallelized integration test process
# NOTE: it appears that using > 1 grafana container w/ SQLite as the database sometimes leads to failed
# grafana database migrations (this is documented in this GitHub issue
# https://github.com/bitnami/charts/issues/10905)
#
# by settings grafana.plugins to [] and configuring grafana.extraVolumeMounts we are using the locally built
# OnCall plugin rather than the latest published version
@ -361,7 +371,7 @@ jobs:
--set oncall.twilio.authToken="${{ secrets.TWILIO_AUTH_TOKEN }}" \
--set oncall.twilio.phoneNumber="\"${{ secrets.TWILIO_PHONE_NUMBER }}"\" \
--set oncall.twilio.verifySid="${{ secrets.TWILIO_VERIFY_SID }}" \
--set grafana.replicas=3 \
--set grafana.replicas=1 \
--set grafana.image.tag=${{ matrix.grafana-image-tag }} \
--set grafana.env.GF_SECURITY_ADMIN_USER=oncall \
--set grafana.env.GF_SECURITY_ADMIN_PASSWORD=oncall \
@ -400,13 +410,15 @@ jobs:
working-directory: grafana-plugin
run: ./node_modules/.bin/playwright install-deps chromium firefox webkit
- name: Await k8s pods and other resources up
uses: jupyterhub/action-k8s-await-workloads@v1
with:
workloads: "" # all
namespace: "" # default
timeout: 300
max-restarts: -1
# we could instead use the --wait flag for the helm install command above
# but there's no reason to block on that step
# instead we can let the k8s resources start up behind the scenes and do other
# setup tasks (ex. install playwright + its dependencies)
- name: Wait until k8s resources are ready
run: |
kubectl rollout status deployment/helm-testing-grafana --timeout=300s
kubectl rollout status deployment/helm-testing-oncall-engine --timeout=300s
kubectl rollout status deployment/helm-testing-oncall-celery --timeout=300s
- name: Run Integration Tests
env:
@ -425,14 +437,14 @@ jobs:
working-directory: ./grafana-plugin
run: yarn test:integration
# always spit out the engine and celery logs, AFTER the e2e tests have completed
# can be helpful for debugging failing/flaky tests
# spit out the engine, celery, and grafana logs, if the the e2e tests have failed
# can be helpful for debugging failing tests
# GitHub Action reference: https://github.com/jupyterhub/action-k8s-namespace-report
- name: Kubernetes namespace report
uses: jupyterhub/action-k8s-namespace-report@v1
if: failure()
with:
important-workloads: "deploy/helm-testing-oncall-engine deploy/helm-testing-oncall-celery"
important-workloads: "deploy/helm-testing-oncall-engine deploy/helm-testing-oncall-celery deploy/helm-testing-grafana"
- uses: actions/upload-artifact@v3
if: failure()

View file

@ -4,6 +4,8 @@ import { BASE_URL, GRAFANA_PASSWORD, GRAFANA_USERNAME, IS_OPEN_SOURCE, ONCALL_AP
import { clickButton, getInputByName } from './utils/forms';
import { goToGrafanaPage } from './utils/navigation';
const GLOBAL_SETUP_RETRIES = 3;
/**
* go to config page and wait for plugin icon to be available on left-hand navigation
*/
@ -67,4 +69,23 @@ const globalSetup = async (config: FullConfig): Promise<void> => {
await browserContext.close();
};
export default globalSetup;
/**
 * Retry wrapper around globalSetup, in the event that it fails due to an
 * oncall-engine/oncall-celery backend error. Sometimes the
 * POST /api/internal/v1/plugin/sync endpoint will randomly return HTTP 500.
 * See here for an example CI job which failed global setup
 * https://github.com/grafana/oncall/actions/runs/5062712137/jobs/9088529416#step:19:2536
 *
 * Makes up to GLOBAL_SETUP_RETRIES total attempts. Intermediate failures are
 * logged (instead of being silently swallowed) so flaky-setup root causes show
 * up in CI logs; the final attempt's error, if any, propagates to Playwright.
 *
 * References on retrying playwright global setup
 * https://github.com/microsoft/playwright/discussions/11371
 */
const globalSetupWithRetries = async (config: FullConfig): Promise<void> => {
  for (let attempt = 1; attempt < GLOBAL_SETUP_RETRIES; attempt++) {
    try {
      return await globalSetup(config);
    } catch (e: unknown) {
      // Log rather than swallow — otherwise a flaky setup leaves no trace of
      // why the earlier attempts failed.
      console.error(`Playwright global setup attempt ${attempt}/${GLOBAL_SETUP_RETRIES} failed, retrying`, e);
    }
  }
  // One last time, letting any error propagate so Playwright reports the failure.
  await globalSetup(config);
};
export default globalSetupWithRetries;

View file

@ -33,7 +33,7 @@ const config: PlaywrightTestConfig = {
* to flaky tests.. let's just retry failed tests. If the same test fails 3 times, you know something must be up
*/
retries: !!process.env.CI ? 3 : 0,
workers: 1,
workers: !!process.env.CI ? 2 : 1,
/* Reporter to use. See https://playwright.dev/docs/test-reporters */
reporter: 'html',
/* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */