From eefe7be56a648705f4d48f605ca539d4db956d85 Mon Sep 17 00:00:00 2001
From: Joey Orlando <joey.orlando@grafana.com>
Date: Tue, 23 May 2023 20:20:46 -0400
Subject: [PATCH] e2e tests on CI - actually await k8s resources to be ready
 before starting tests (#1997)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Occasionally, the Playwright global setup step (which authenticates w/
the Grafana API + configures the plugin) would fail, causing the CI
job to fail instantly (Playwright doesn't retry global setup if it
fails).

My current hypothesis is that the `oncall-engine` and `oncall-celery`
pods aren't _actually_ ready in these cases, because of the way the
`jupyterhub/action-k8s-await-workloads` action awaits k8s workloads:

<img width="1076" alt="Screenshot 2023-05-23 at 18 24 36"
src="https://github.com/grafana/oncall/assets/9406895/68d8d2d9-4274-4749-8788-e0a9a3dbad83">


By using `kubectl rollout status deployment/<deployment-name>
--timeout=300s` instead, we can be sure that these pods are _actually_
ready to receive traffic before we start the tests:
```bash
❯ kubectl rollout status --help
Show the status of the rollout.

 By default 'rollout status' will watch the status of the latest rollout until it's done. If you don't want to wait for
the rollout to finish then you can use --watch=false. Note that if a new rollout starts in-between, then 'rollout
status' will continue watching the latest revision. If you want to pin to a specific revision and abort if it is rolled
over by another revision, use --revision=N where N is the revision you need to watch for.
```

Lastly, even with this change, the `POST
/api/internal/v1/plugin/sync` endpoint will sometimes return HTTP 500 ([example
logs](https://github.com/grafana/oncall/actions/runs/5062712137/jobs/9088529416#step:19:2536)
from a failed CI job). To handle this case, let's set up the Playwright
global setup to be attempted up to 3 times.
---
 .github/workflows/linting-and-tests.yml       | 36 ++++++++++++-------
 .../integration-tests/globalSetup.ts          | 23 +++++++++++-
 grafana-plugin/playwright.config.ts           |  2 +-
 3 files changed, 47 insertions(+), 14 deletions(-)
diff --git a/.github/workflows/linting-and-tests.yml b/.github/workflows/linting-and-tests.yml
index d85932ce..456b8d34 100644
--- a/.github/workflows/linting-and-tests.yml
+++ b/.github/workflows/linting-and-tests.yml
@@ -287,6 +287,13 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
 
+      - name: Collect Workflow Telemetry
+        uses: runforesight/workflow-telemetry-action@v1
+        with:
+          comment_on_pr: false
+          proc_trace_chart_show: false
+          proc_trace_table_show: false
+
       - name: Create k8s Kind Cluster
         uses: helm/kind-action@v1.3.0
         with:
@@ -340,9 +347,12 @@ jobs:
       - name: Load engine Docker image on the nodes of the cluster
         run: kind load image-archive --name=chart-testing /tmp/oncall-engine.tar
 
-      # spin up 3 engine, 3 celery, and 3 grafana pods, this will allow us to parralelize the integration tests,
+      # spin up 3 engine and 3 celery pods, this will allow us to parallelize the integration tests,
       # and complete them much faster by using multiple test processes
       # With just 1 engine/celery/grafana pod, the backend crawls to a halt when there is > 1 parallelized integration test process
+      # NOTE: it appears that using > 1 grafana container w/ SQLite as the database sometimes leads to failed
+      # grafana database migrations (this is documented in this GitHub issue
+      # https://github.com/bitnami/charts/issues/10905)
       #
       # by settings grafana.plugins to [] and configuring grafana.extraVolumeMounts we are using the locally built
       # OnCall plugin rather than the latest published version
@@ -361,7 +371,7 @@ jobs:
             --set oncall.twilio.authToken="${{ secrets.TWILIO_AUTH_TOKEN }}" \
             --set oncall.twilio.phoneNumber="\"${{ secrets.TWILIO_PHONE_NUMBER }}"\" \
             --set oncall.twilio.verifySid="${{ secrets.TWILIO_VERIFY_SID }}" \
-            --set grafana.replicas=3 \
+            --set grafana.replicas=1 \
             --set grafana.image.tag=${{ matrix.grafana-image-tag }} \
             --set grafana.env.GF_SECURITY_ADMIN_USER=oncall \
             --set grafana.env.GF_SECURITY_ADMIN_PASSWORD=oncall \
@@ -400,13 +410,15 @@ jobs:
         working-directory: grafana-plugin
         run: ./node_modules/.bin/playwright install-deps chromium firefox webkit
 
-      - name: Await k8s pods and other resources up
-        uses: jupyterhub/action-k8s-await-workloads@v1
-        with:
-          workloads: "" # all
-          namespace: "" # default
-          timeout: 300
-          max-restarts: -1
+      # we could instead use the --wait flag for the helm install command above
+      # but there's no reason to block on that step
+      # instead we can let the k8s resources start up behind the scenes and do other
+      # setup tasks (ex. install playwright + its dependencies)
+      - name: Wait until k8s resources are ready
+        run: |
+          kubectl rollout status deployment/helm-testing-grafana --timeout=300s
+          kubectl rollout status deployment/helm-testing-oncall-engine --timeout=300s
+          kubectl rollout status deployment/helm-testing-oncall-celery --timeout=300s
 
       - name: Run Integration Tests
         env:
@@ -425,14 +437,14 @@ jobs:
         working-directory: ./grafana-plugin
         run: yarn test:integration
 
-      # always spit out the engine and celery logs, AFTER the e2e tests have completed
-      # can be helpful for debugging failing/flaky tests
+      # spit out the engine, celery, and grafana logs, if the e2e tests have failed
+      # can be helpful for debugging failing tests
       # GitHub Action reference: https://github.com/jupyterhub/action-k8s-namespace-report
       - name: Kubernetes namespace report
         uses: jupyterhub/action-k8s-namespace-report@v1
         if: failure()
         with:
-          important-workloads: "deploy/helm-testing-oncall-engine deploy/helm-testing-oncall-celery"
+          important-workloads: "deploy/helm-testing-oncall-engine deploy/helm-testing-oncall-celery deploy/helm-testing-grafana"
 
       - uses: actions/upload-artifact@v3
         if: failure()
diff --git a/grafana-plugin/integration-tests/globalSetup.ts b/grafana-plugin/integration-tests/globalSetup.ts
index 04af1cfc..79fb4b0c 100644
--- a/grafana-plugin/integration-tests/globalSetup.ts
+++ b/grafana-plugin/integration-tests/globalSetup.ts
@@ -4,6 +4,8 @@ import { BASE_URL, GRAFANA_PASSWORD, GRAFANA_USERNAME, IS_OPEN_SOURCE, ONCALL_AP
 import { clickButton, getInputByName } from './utils/forms';
 import { goToGrafanaPage } from './utils/navigation';
 
+const GLOBAL_SETUP_RETRIES = 3;
+
 /**
  * go to config page and wait for plugin icon to be available on left-hand navigation
  */
@@ -67,4 +69,23 @@ const globalSetup = async (config: FullConfig): Promise<void> => {
   await browserContext.close();
 };
 
-export default globalSetup;
+/**
+ * Let's retry global setup, in the event that it fails due to an oncall-engine/oncall-celery backend error.
+ * Sometimes the sync endpoint will randomly return HTTP 500.
+ * See here for an example CI job which failed global setup
+ * https://github.com/grafana/oncall/actions/runs/5062712137/jobs/9088529416#step:19:2536
+ *
+ * References on retrying playwright global setup
+ * https://github.com/microsoft/playwright/discussions/11371
+ */
+const globalSetupWithRetries = async (config: FullConfig): Promise<void> => {
+  for (let i = 0; i < GLOBAL_SETUP_RETRIES - 1; i++) {
+    try {
+      return await globalSetup(config);
+    } catch (e) {}
+  }
+  // One last time, throwing an error if it fails.
+  await globalSetup(config);
+};
+
+export default globalSetupWithRetries;
diff --git a/grafana-plugin/playwright.config.ts b/grafana-plugin/playwright.config.ts
index 2ca27677..5cae7ef8 100644
--- a/grafana-plugin/playwright.config.ts
+++ b/grafana-plugin/playwright.config.ts
@@ -33,7 +33,7 @@ const config: PlaywrightTestConfig = {
    * to flaky tests.. let's just retry failed tests. If the same test fails 3 times, you know something must be up
    */
   retries: !!process.env.CI ? 3 : 0,
-  workers: 1,
+  workers: !!process.env.CI ? 2 : 1,
   /* Reporter to use. See https://playwright.dev/docs/test-reporters */
   reporter: 'html',
   /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */