diff --git a/.github/workflows/linting-and-tests.yml b/.github/workflows/linting-and-tests.yml index d85932ce..456b8d34 100644 --- a/.github/workflows/linting-and-tests.yml +++ b/.github/workflows/linting-and-tests.yml @@ -287,6 +287,13 @@ jobs: - name: Checkout uses: actions/checkout@v3 + - name: Collect Workflow Telemetry + uses: runforesight/workflow-telemetry-action@v1 + with: + comment_on_pr: false + proc_trace_chart_show: false + proc_trace_table_show: false + - name: Create k8s Kind Cluster uses: helm/kind-action@v1.3.0 with: @@ -340,9 +347,12 @@ jobs: - name: Load engine Docker image on the nodes of the cluster run: kind load image-archive --name=chart-testing /tmp/oncall-engine.tar - # spin up 3 engine, 3 celery, and 3 grafana pods, this will allow us to parralelize the integration tests, + # spin up 3 engine and 3 celery pods, this will allow us to parallelize the integration tests, # and complete them much faster by using multiple test processes # With just 1 engine/celery/grafana pod, the backend crawls to a halt when there is > 1 parallelized integration test process + # NOTE: it appears that using > 1 grafana container w/ SQLite as the database sometimes leads to failed + # grafana database migrations (this is documented in this GitHub issue + # https://github.com/bitnami/charts/issues/10905) # # by settings grafana.plugins to [] and configuring grafana.extraVolumeMounts we are using the locally built # OnCall plugin rather than the latest published version @@ -361,7 +371,7 @@ jobs: --set oncall.twilio.authToken="${{ secrets.TWILIO_AUTH_TOKEN }}" \ --set oncall.twilio.phoneNumber="\"${{ secrets.TWILIO_PHONE_NUMBER }}"\" \ --set oncall.twilio.verifySid="${{ secrets.TWILIO_VERIFY_SID }}" \ - --set grafana.replicas=3 \ + --set grafana.replicas=1 \ --set grafana.image.tag=${{ matrix.grafana-image-tag }} \ --set grafana.env.GF_SECURITY_ADMIN_USER=oncall \ --set grafana.env.GF_SECURITY_ADMIN_PASSWORD=oncall \ @@ -400,13 +410,15 @@ jobs: working-directory: 
grafana-plugin run: ./node_modules/.bin/playwright install-deps chromium firefox webkit - - name: Await k8s pods and other resources up - uses: jupyterhub/action-k8s-await-workloads@v1 - with: - workloads: "" # all - namespace: "" # default - timeout: 300 - max-restarts: -1 + # we could instead use the --wait flag for the helm install command above + # but there's no reason to block on that step + # instead we can let the k8s resources start up behind the scenes and do other + # setup tasks (ex. install playwright + its dependencies) + - name: Wait until k8s resources are ready + run: | + kubectl rollout status deployment/helm-testing-grafana --timeout=300s + kubectl rollout status deployment/helm-testing-oncall-engine --timeout=300s + kubectl rollout status deployment/helm-testing-oncall-celery --timeout=300s - name: Run Integration Tests env: @@ -425,14 +437,14 @@ jobs: working-directory: ./grafana-plugin run: yarn test:integration - # always spit out the engine and celery logs, AFTER the e2e tests have completed - # can be helpful for debugging failing/flaky tests + # spit out the engine, celery, and grafana logs, if the e2e tests have failed + # can be helpful for debugging failing tests # GitHub Action reference: https://github.com/jupyterhub/action-k8s-namespace-report - name: Kubernetes namespace report uses: jupyterhub/action-k8s-namespace-report@v1 if: failure() with: - important-workloads: "deploy/helm-testing-oncall-engine deploy/helm-testing-oncall-celery" + important-workloads: "deploy/helm-testing-oncall-engine deploy/helm-testing-oncall-celery deploy/helm-testing-grafana" - uses: actions/upload-artifact@v3 if: failure() diff --git a/grafana-plugin/integration-tests/globalSetup.ts b/grafana-plugin/integration-tests/globalSetup.ts index 04af1cfc..79fb4b0c 100644 --- a/grafana-plugin/integration-tests/globalSetup.ts +++ b/grafana-plugin/integration-tests/globalSetup.ts @@ -4,6 +4,8 @@ import { BASE_URL, GRAFANA_PASSWORD, GRAFANA_USERNAME, 
IS_OPEN_SOURCE, ONCALL_AP import { clickButton, getInputByName } from './utils/forms'; import { goToGrafanaPage } from './utils/navigation'; +const GLOBAL_SETUP_RETRIES = 3; + /** * go to config page and wait for plugin icon to be available on left-hand navigation */ @@ -67,4 +69,28 @@ const globalSetup = async (config: FullConfig): Promise<void> => { await browserContext.close(); }; -export default globalSetup; +/** + * Let's retry global setup, in the event that it fails due to an oncall-engine/oncall-celery backend error. + * Sometimes the sync endpoint will randomly return HTTP 500. + * See here for an example CI job which failed global setup + * https://github.com/grafana/oncall/actions/runs/5062712137/jobs/9088529416#step:19:2536 + * + * Makes up to GLOBAL_SETUP_RETRIES attempts: every failed attempt is logged, and the error from the final attempt is thrown. + * + * References on retrying playwright global setup + * https://github.com/microsoft/playwright/discussions/11371 + */ +const globalSetupWithRetries = async (config: FullConfig): Promise<void> => { + for (let attempt = 1; attempt < GLOBAL_SETUP_RETRIES; attempt++) { + try { + return await globalSetup(config); + } catch (e) { + // log instead of silently swallowing, so flaky backend errors stay visible in the CI output + console.warn(`global setup attempt ${attempt} of ${GLOBAL_SETUP_RETRIES} failed, retrying`, e); + } + } + // One last time, throwing an error if it fails. + await globalSetup(config); +}; + +export default globalSetupWithRetries; diff --git a/grafana-plugin/playwright.config.ts b/grafana-plugin/playwright.config.ts index 2ca27677..5cae7ef8 100644 --- a/grafana-plugin/playwright.config.ts +++ b/grafana-plugin/playwright.config.ts @@ -33,7 +33,7 @@ const config: PlaywrightTestConfig = { * to flaky tests.. let's just retry failed tests. If the same test fails 3 times, you know something must be up */ retries: !!process.env.CI ? 3 : 0, - workers: 1, + workers: !!process.env.CI ? 2 : 1, /* Reporter to use. See https://playwright.dev/docs/test-reporters */ reporter: 'html', /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */