# oncall-engine/.github/workflows/linting-and-tests.yml

name: Linting and Unit/e2e Tests

# Events that start this workflow
on:
  # pushes that land on main or dev
  push:
    branches: [main, dev]
  # pull requests (no activity-type or branch filters are configured)
  pull_request:
  # merge_group fires when a pull request is added to a merge queue, so the
  # jobs below also run for queued merges
  # https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/configuring-pull-request-merges/managing-a-merge-queue#triggering-merge-group-checks-with-github-actions
  merge_group:

concurrency:
  # When new commits are pushed, the run already in progress for the same
  # group is cancelled in favour of the new one.
  cancel-in-progress: true
  # The group key joins ref_name (available when CI is triggered by a push to
  # a branch/tag) with head_ref (available when CI is triggered by a PR).
  group: ${{ github.ref_name }}-${{ github.head_ref }}

jobs:
  lint-entire-project:
    # Runs the pre-commit action over the checkout, after preparing the Python
    # and Node toolchains that the hooks rely on.
    runs-on: ubuntu-latest
    name: "Lint entire project"
    steps:
      - uses: actions/checkout@v3
      # pip's cache is keyed on the engine requirements file
      - uses: actions/setup-python@v4
        with:
          cache: pip
          cache-dependency-path: engine/requirements.txt
          python-version: "3.11.3"
      # the node setup + dependency steps below exist because the
      # eslint/prettier/stylelint steps need the frontend dependencies installed
      - uses: actions/setup-node@v3
        with:
          cache: yarn
          cache-dependency-path: grafana-plugin/yarn.lock
          node-version: "18.16.0"
      # restore node_modules for the current yarn.lock, if a copy was cached
      - id: cache-frontend-dependencies
        name: Use cached frontend dependencies
        uses: actions/cache@v3
        with:
          key: ${{ runner.os }}-frontend-node-modules-${{ hashFiles('grafana-plugin/yarn.lock') }}
          path: grafana-plugin/node_modules
      # skipped when the cache step above reported an exact hit
      - name: Install frontend dependencies
        if: steps.cache-frontend-dependencies.outputs.cache-hit != 'true'
        run: yarn install --frozen-lockfile --prefer-offline --network-timeout 500000
        working-directory: grafana-plugin
      - uses: pre-commit/action@v3.0.0

  lint-test-and-build-frontend:
    # Installs the grafana-plugin dependencies (from cache when possible) and
    # runs `yarn build`.
    runs-on: ubuntu-latest
    name: "Lint, test, and build frontend"
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-node@v3
        with:
          cache: yarn
          cache-dependency-path: grafana-plugin/yarn.lock
          node-version: "18.16.0"
      # restore node_modules for the current yarn.lock, if a copy was cached
      - id: cache-frontend-dependencies
        name: Use cached frontend dependencies
        uses: actions/cache@v3
        with:
          key: ${{ runner.os }}-frontend-node-modules-${{ hashFiles('grafana-plugin/yarn.lock') }}
          path: grafana-plugin/node_modules
      # skipped when the cache step above reported an exact hit
      - name: Install frontend dependencies
        if: steps.cache-frontend-dependencies.outputs.cache-hit != 'true'
        run: yarn install --frozen-lockfile --prefer-offline --network-timeout 500000
        working-directory: grafana-plugin
      # per the step name, the build script also runs the linter and the tests
      - name: Build frontend (will run linter and tests)
        run: yarn build
        working-directory: grafana-plugin

  test-technical-documentation:
    # Builds the docs with the grafana/docs-base image; docs/sources is
    # mounted into the container as the oncall "latest" content directory.
    runs-on: ubuntu-latest
    name: "Test technical documentation"
    steps:
      - name: Check out code
        uses: actions/checkout@v3
      # HUGO_REFLINKSERRORLEVEL=ERROR stops broken refs from being merged. The
      # trade-off is that refs to external content cannot be used, because they
      # will not resolve inside the docs-base image.
      #
      # The folded scalar below is joined into a single-line command.
      - name: Build website
        run: >-
          docker run
          -v ${PWD}/docs/sources:/hugo/content/docs/oncall/latest
          -e HUGO_REFLINKSERRORLEVEL=ERROR
          --rm grafana/docs-base:latest
          /bin/bash -c 'make hugo'

  lint-migrations-backend-mysql-rabbitmq:
    # Runs `python manage.py lintmigrations` for the engine, with MySQL and
    # RabbitMQ service containers published on localhost.
    name: "Lint database migrations"
    runs-on: ubuntu-latest
    env:
      DATABASE_HOST: localhost
      RABBITMQ_URI: amqp://rabbitmq:rabbitmq@localhost:5672
      DJANGO_SETTINGS_MODULE: settings.ci-test
      SLACK_CLIENT_OAUTH_ID: 1
    services:
      rabbit_test:
        image: rabbitmq:3.7.19
        env:
          RABBITMQ_DEFAULT_USER: rabbitmq
          RABBITMQ_DEFAULT_PASS: rabbitmq
        ports:
          - 5672:5672
      mysql_test:
        image: mysql:8.0.32
        env:
          MYSQL_DATABASE: oncall_local_dev
          MYSQL_ROOT_PASSWORD: local_dev_pwd
        ports:
          - 3306:3306
        # Set health checks to wait until MySQL has started, the same way the
        # PostgreSQL and Redis services in this workflow are gated. This job
        # runs no wait script, so without the check the lint command could
        # start before the server accepts connections (presumably
        # lintmigrations needs a live database - TODO confirm).
        # `-h 127.0.0.1` makes the probe connect over TCP; `mysqladmin ping`
        # exits 0 as soon as the server answers, even if it rejects the login.
        options: >-
          --health-cmd "mysqladmin ping -h 127.0.0.1"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 10
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.11.3"
          cache: "pip"
          cache-dependency-path: engine/requirements.txt
      - name: Lint migrations
        working-directory: engine
        run: |
          pip install -r requirements.txt
          python manage.py lintmigrations

  unit-test-helm-chart:
    # Runs the helm-unittest action against the chart in ./helm/oncall.
    runs-on: ubuntu-latest
    name: "Helm Chart Unit Tests"
    steps:
      - uses: actions/checkout@v3
      - uses: d3adb5/helm-unittest-action@v2
        with:
          charts: ./helm/oncall
          helm-version: v3.8.0

  unit-test-backend-mysql-rabbitmq:
    # Runs the engine's pytest suite against MySQL + RabbitMQ service
    # containers, once per rbac_enabled matrix value.
    name: "Backend Tests: MySQL + RabbitMQ (RBAC enabled: ${{ matrix.rbac_enabled }})"
    runs-on: ubuntu-latest
    strategy:
      matrix:
        rbac_enabled: ["True", "False"]
    env:
      DJANGO_SETTINGS_MODULE: settings.ci-test
      DATABASE_HOST: localhost
      RABBITMQ_URI: amqp://rabbitmq:rabbitmq@localhost:5672
      SLACK_CLIENT_OAUTH_ID: 1
      ONCALL_TESTING_RBAC_ENABLED: ${{ matrix.rbac_enabled }}
    services:
      rabbit_test:
        image: rabbitmq:3.7.19
        env:
          RABBITMQ_DEFAULT_USER: rabbitmq
          RABBITMQ_DEFAULT_PASS: rabbitmq
        ports:
          - 5672:5672
      mysql_test:
        image: mysql:8.0.32
        env:
          MYSQL_DATABASE: oncall_local_dev
          MYSQL_ROOT_PASSWORD: local_dev_pwd
        ports:
          - 3306:3306
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.11.3"
          cache: "pip"
          cache-dependency-path: engine/requirements.txt
      # apt-get needs root on the hosted runner, hence sudo. The two apt-get
      # commands sit on separate lines so that a failure stops the step: under
      # `bash -e`, a failing non-final command of an `a && b` list does not.
      # netcat-openbsd is named explicitly because `netcat` is only a virtual
      # package on newer Ubuntu images.
      # NOTE(review): presumably wait_for_test_mysql_start.sh relies on `nc` -
      # confirm against the script.
      - name: Unit Test Backend
        working-directory: engine
        run: |
          sudo apt-get update
          sudo apt-get install -y netcat-openbsd
          pip install -r requirements.txt
          ./wait_for_test_mysql_start.sh && pytest -x

  unit-test-backend-postgresql-rabbitmq:
    # Runs the engine's pytest suite against PostgreSQL + RabbitMQ service
    # containers, once per rbac_enabled matrix value.
    runs-on: ubuntu-latest
    name: "Backend Tests: PostgreSQL + RabbitMQ (RBAC enabled: ${{ matrix.rbac_enabled }})"
    strategy:
      matrix:
        rbac_enabled:
          - "True"
          - "False"
    env:
      DJANGO_SETTINGS_MODULE: settings.ci-test
      ONCALL_TESTING_RBAC_ENABLED: ${{ matrix.rbac_enabled }}
      SLACK_CLIENT_OAUTH_ID: "1"
      DATABASE_TYPE: postgresql
      DATABASE_HOST: localhost
      RABBITMQ_URI: amqp://rabbitmq:rabbitmq@localhost:5672
    services:
      postgresql_test:
        image: postgres:14.4
        env:
          POSTGRES_DB: oncall_local_dev
          POSTGRES_PASSWORD: local_dev_pwd
        ports:
          - "5432:5432"
        # health check: the job's steps wait until pg_isready succeeds,
        # i.e. until postgres has started
        options: '--health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5'
      rabbit_test:
        image: rabbitmq:3.7.19
        env:
          RABBITMQ_DEFAULT_USER: rabbitmq
          RABBITMQ_DEFAULT_PASS: rabbitmq
        ports:
          - "5672:5672"
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          cache: pip
          cache-dependency-path: engine/requirements.txt
          python-version: "3.11.3"
      # -x makes pytest stop at the first failing test
      - name: Unit Test Backend
        run: |
          pip install -r requirements.txt
          pytest -x
        working-directory: engine

  unit-test-backend-sqlite-redis:
    # Runs the engine's pytest suite with SQLite as the database and a Redis
    # service container as the broker, once per rbac_enabled matrix value.
    name: "Backend Tests: SQLite + Redis (RBAC enabled: ${{ matrix.rbac_enabled }})"
    runs-on: ubuntu-latest
    strategy:
      matrix:
        rbac_enabled: ["True", "False"]
    env:
      DATABASE_TYPE: sqlite3
      BROKER_TYPE: redis
      REDIS_URI: redis://localhost:6379
      DJANGO_SETTINGS_MODULE: settings.ci-test
      SLACK_CLIENT_OAUTH_ID: 1
      ONCALL_TESTING_RBAC_ENABLED: ${{ matrix.rbac_enabled }}
    services:
      redis_test:
        image: redis:7.0.5
        ports:
          - 6379:6379
        # Set health checks to wait until redis answers `redis-cli ping`
        options: >-
          --health-cmd "redis-cli ping"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.11.3"
          cache: "pip"
          cache-dependency-path: engine/requirements.txt
      # No system packages are installed here: this step runs no wait script,
      # and the Redis service is already gated by its health check.
      - name: Unit Test Backend
        working-directory: engine
        run: |
          pip install -r requirements.txt
          pytest -x

  unit-test-pd-migrator:
    # Runs pytest for the tool that lives in tools/pagerduty-migrator.
    runs-on: ubuntu-latest
    name: "Unit tests - PagerDuty Migrator"
    steps:
      - uses: actions/checkout@v3
      # pip's cache is keyed on the migrator's own requirements file
      - uses: actions/setup-python@v4
        with:
          cache: pip
          cache-dependency-path: tools/pagerduty-migrator/requirements.txt
          python-version: "3.11.3"
      # -x makes pytest stop at the first failing test
      - name: Unit Test PD Migrator
        run: |
          pip install -r requirements.txt
          pytest -x
        working-directory: tools/pagerduty-migrator

  end-to-end-tests:
    # Builds the plugin frontend and the engine Docker image, installs the helm chart into a kind cluster
    # and runs `yarn test:integration` against it, once per Grafana image tag in the matrix.
    #
    # default "ubuntu-latest" runners only provide 2 CPU cores + 7GB of RAM. this seems to lead to HTTP 504s from
    # the oncall backend, and hence, flaky tests. Let's use CI runners w/ more resources to avoid this (plus
    # this will allow us to run more backend containers and parallelize the tests)
    runs-on: ubuntu-latest-8-cores
    name: "End to end tests - Grafana: ${{ matrix.grafana-image-tag }}"
    strategy:
      matrix:
        grafana-image-tag:
          # OnCall doesn't work on the following versions of Grafana
          # - 8.5.22
          # - 9.0.0
          # - 9.1.0

          # 9.2.0 is the earliest version where things work
          - 9.2.13
          - 9.3.14
          - 9.4.10
          - 9.5.2
          - main
          - latest
      # let the remaining Grafana versions finish even when one of them fails
      fail-fast: false
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Collect Workflow Telemetry
        uses: runforesight/workflow-telemetry-action@v1
        with:
          comment_on_pr: false
          proc_trace_chart_show: false
          proc_trace_table_show: false

      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.3.0
        with:
          config: ./helm/kind.yml

      - uses: actions/setup-node@v3
        with:
          node-version: 18.16.0
          cache: "yarn"
          cache-dependency-path: grafana-plugin/yarn.lock

      # restore node_modules for the current yarn.lock, if a copy was cached
      - name: Use cached frontend dependencies
        id: cache-frontend-dependencies
        uses: actions/cache@v3
        with:
          path: grafana-plugin/node_modules
          key: ${{ runner.os }}-frontend-node-modules-${{ hashFiles('grafana-plugin/yarn.lock') }}

      # skipped when the cache step above reported an exact hit
      - name: Install frontend dependencies
        if: steps.cache-frontend-dependencies.outputs.cache-hit != 'true'
        working-directory: grafana-plugin
        run: yarn install --frozen-lockfile --prefer-offline --network-timeout 500000

      # the built plugin (grafana-plugin/dist) is cached on a hash of the plugin sources + yarn.lock
      - name: Use cached plugin frontend build
        id: cache-plugin-frontend
        uses: actions/cache@v3
        with:
          path: grafana-plugin/dist
          key: ${{ runner.os }}-plugin-frontend-${{ hashFiles('grafana-plugin/src/**/*', 'grafana-plugin/yarn.lock') }}

      # skipped when the cached build above was restored
      - name: Build plugin frontend
        if: steps.cache-plugin-frontend.outputs.cache-hit != 'true'
        working-directory: grafana-plugin
        run: yarn build:dev

      - name: Set up Docker Buildx # We need this step for docker caching
        uses: docker/setup-buildx-action@v2

      - name: Build engine Docker image locally # using github actions docker cache
        uses: docker/build-push-action@v2
        with:
          context: ./engine
          file: ./engine/Dockerfile
          push: false
          load: true
          tags: oncall/engine:latest
          cache-from: type=gha
          cache-to: type=gha,mode=max
          # the image is written to a tarball so that the next step can load it into the kind cluster
          outputs: type=docker,dest=/tmp/oncall-engine.tar

      # NOTE(review): "chart-testing" is presumably the cluster name created by the kind-action step above - confirm
      - name: Load engine Docker image on the nodes of the cluster
        run: kind load image-archive --name=chart-testing /tmp/oncall-engine.tar

      # spin up 3 engine and 3 celery pods, this will allow us to parallelize the integration tests,
      # and complete them much faster by using multiple test processes
      # With just 1 engine/celery/grafana pod, the backend crawls to a halt when there is > 1 parallelized integration test process
      # NOTE: it appears that using > 1 grafana container w/ SQLite as the database sometimes leads to failed
      # grafana database migrations (this is documented in this GitHub issue
      # https://github.com/bitnami/charts/issues/10905)
      #
      # by setting grafana.plugins to [] and configuring grafana.extraVolumeMounts we are using the locally built
      # OnCall plugin rather than the latest published version
      # the /oncall-plugin hostPath refers to the kind volumeMount that points to the ./grafana-plugin dir
      # see ./helm/kind.yml for more details
      #
      # NOTE(review): the oncall.twilio.phoneNumber value is passed wrapped in literal double quotes (the escaped
      # \" on each side) - presumably so that helm treats the number as a string; confirm before changing the quoting
      - name: Install helm chart
        run: |
          helm install helm-testing \
            --values ./helm/simple.yml \
            --values ./helm/values-local-image.yml \
            --set-json 'env=[{"name":"GRAFANA_CLOUD_NOTIFICATIONS_ENABLED","value":"False"}]' \
            --set engine.replicaCount=3 \
            --set celery.replicaCount=3 \
            --set celery.worker_beat_enabled="False" \
            --set oncall.twilio.accountSid="${{ secrets.TWILIO_ACCOUNT_SID }}" \
            --set oncall.twilio.authToken="${{ secrets.TWILIO_AUTH_TOKEN }}" \
            --set oncall.twilio.phoneNumber="\"${{ secrets.TWILIO_PHONE_NUMBER }}"\" \
            --set oncall.twilio.verifySid="${{ secrets.TWILIO_VERIFY_SID }}" \
            --set grafana.replicas=1 \
            --set grafana.image.tag=${{ matrix.grafana-image-tag }} \
            --set grafana.env.GF_SECURITY_ADMIN_USER=oncall \
            --set grafana.env.GF_SECURITY_ADMIN_PASSWORD=oncall \
            --set grafana.env.GF_FEATURE_TOGGLES_ENABLE=topnav \
            --set grafana.env.GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS=grafana-oncall-app \
            --set-json "grafana.plugins=[]" \
            --set-json 'grafana.extraVolumeMounts=[{"name":"plugins","mountPath":"/var/lib/grafana/plugins/grafana-plugin","hostPath":"/oncall-plugin","readOnly":true}]' \
            ./helm/oncall

      # helpful reference for properly caching the playwright binaries/dependencies
      # https://playwrightsolutions.com/playwright-github-action-to-cache-the-browser-binaries/
      #
      # the version is read from package.json's devDependencies and exported as the PLAYWRIGHT_VERSION
      # environment variable, which the cache key in the next step uses
      - name: Get installed Playwright version
        id: playwright-version
        working-directory: grafana-plugin
        run: echo "PLAYWRIGHT_VERSION=$(cat ./package.json | jq -r '.devDependencies["@playwright/test"]')" >> $GITHUB_ENV

      - name: Cache Playwright binaries/dependencies
        id: playwright-cache
        uses: actions/cache@v3
        with:
          path: "~/.cache/ms-playwright"
          key: ${{ runner.os }}-playwright-${{ env.PLAYWRIGHT_VERSION }}-chromium-firefox-webkit

      # For the next two steps, use the binary directly from node_modules/.bin as opposed to npx playwright
      # due to this bug (https://github.com/microsoft/playwright/issues/13188)
      - name: Install Playwright Browsers
        if: steps.playwright-cache.outputs.cache-hit != 'true'
        working-directory: grafana-plugin
        run: ./node_modules/.bin/playwright install --with-deps chromium firefox webkit

      # use the cached browsers, but we still need to install the necessary system dependencies
      # (system deps are installed in the cache-miss step above by the --with-deps flag)
      - name: Install Playwright System Dependencies
        if: steps.playwright-cache.outputs.cache-hit == 'true'
        working-directory: grafana-plugin
        run: ./node_modules/.bin/playwright install-deps chromium firefox webkit

      # we could instead use the --wait flag for the helm install command above
      # but there's no reason to block on that step
      # instead we can let the k8s resources start up behind the scenes and do other
      # setup tasks (ex. install playwright + its dependencies)
      - name: Wait until k8s resources are ready
        run: |
          kubectl rollout status deployment/helm-testing-grafana --timeout=300s
          kubectl rollout status deployment/helm-testing-oncall-engine --timeout=300s
          kubectl rollout status deployment/helm-testing-oncall-celery --timeout=300s

      - name: Run Integration Tests
        env:
          # BASE_URL represents what is accessed via a browser
          BASE_URL: http://localhost:30002/grafana
          # ONCALL_API_URL is what is configured in the plugin configuration form
          # it is what the grafana container uses to communicate with the OnCall backend
          #
          # 172.17.0.1 is the docker bridge network default gateway. Requests originate in the grafana container
          # hit 172.17.0.1 which proxies the request onto the host where port 30001 is the node port that is mapped
          # to the OnCall API
          ONCALL_API_URL: http://172.17.0.1:30001
          GRAFANA_USERNAME: oncall
          GRAFANA_PASSWORD: oncall
          MAILSLURP_API_KEY: ${{ secrets.MAILSLURP_API_KEY }}
        working-directory: ./grafana-plugin
        run: yarn test:integration

      # spit out the engine, celery, and grafana logs, if the e2e tests have failed
      # can be helpful for debugging failing tests
      # GitHub Action reference: https://github.com/jupyterhub/action-k8s-namespace-report
      - name: Kubernetes namespace report
        uses: jupyterhub/action-k8s-namespace-report@v1
        if: failure()
        with:
          important-workloads: "deploy/helm-testing-oncall-engine deploy/helm-testing-oncall-celery deploy/helm-testing-grafana"

      # on failure, keep the playwright report as a per-Grafana-version artifact for 30 days
      - uses: actions/upload-artifact@v3
        if: failure()
        with:
          name: playwright-report-grafana-${{ matrix.grafana-image-tag }}
          path: ./grafana-plugin/playwright-report/
          retention-days: 30