diff --git a/.github/helm-values.yml b/.github/helm-values.yml deleted file mode 100644 index 31ac441f..00000000 --- a/.github/helm-values.yml +++ /dev/null @@ -1,119 +0,0 @@ -base_url: 172.17.0.1:30001 -base_url_protocol: http - -env: - - name: GRAFANA_CLOUD_NOTIFICATIONS_ENABLED - value: "False" - - name: FEATURE_PROMETHEUS_EXPORTER_ENABLED - value: "True" -image: - repository: oncall/engine - tag: latest - pullPolicy: IfNotPresent -oncall: - devMode: true -broker: - type: redis -redis: - architecture: standalone # don't run replicas, just eats up resources -rabbitmq: - enabled: false -engine: - replicaCount: 1 -celery: - replicaCount: 1 - worker_beat_enabled: false - -grafana: - replicas: 1 - extraInitContainers: - - name: create-db-if-not-exists - image: mysql:8.0.32 - command: - # yamllint disable rule:line-length - [ - "bash", - "-c", - 'while ! mysqladmin ping -h "$DATABASE_HOST" --silent; do echo ''awaiting mysql db to be available'' && sleep 1; done && mysql -h "$DATABASE_HOST" -u "$DATABASE_USER" -p"$DATABASE_PASSWORD" -e ''CREATE DATABASE IF NOT EXISTS grafana CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;''', - ] - # yamllint enable rule:line-length - env: - - name: DATABASE_HOST - value: oncall-ci-mariadb - - name: DATABASE_USER - value: root - - name: DATABASE_PASSWORD - valueFrom: - secretKeyRef: - name: oncall-ci-mariadb - key: mariadb-root-password - env: - GF_FEATURE_TOGGLES_ENABLE: topnav - GF_SECURITY_ADMIN_PASSWORD: oncall - GF_SECURITY_ADMIN_USER: oncall - GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS: grafana-oncall-app - GF_DATABASE_TYPE: mysql - GF_DATABASE_HOST: oncall-ci-mariadb:3306 - GF_DATABASE_USER: root - GF_DATABASE_SSL_MODE: disable - envValueFrom: - GF_DATABASE_PASSWORD: - secretKeyRef: - name: oncall-ci-mariadb - key: mariadb-root-password - # by settings grafana.plugins to [] and configuring grafana.extraVolumeMounts we are using the locally built - # OnCall plugin rather than the latest published version - plugins: [] - extraVolumeMounts: - - name: plugins - mountPath: /var/lib/grafana/plugins/grafana-plugin - # hostPath is defined in .github/kind.yml - hostPath: /oncall-plugin - readOnly: true - service: - type: NodePort - nodePort: 30002 - -database: - type: mysql -mariadb: - enabled: true - primary: - service: - type: NodePort - nodePort: 30003 - extraEnvVars: - # See "Passing extra command line flags to mysqld startup" section - # https://hub.docker.com/r/bitnami/mariadb - # - # max_allowed_packet is set to 128mb in bytes - # - # this avoids "Got an error reading communication packets" errors that arise from the grafana container - # apparently sending too much data to mariadb at once - # https://mariadb.com/docs/skysql-dbaas/ref/mdb/system-variables/max_allowed_packet/ - - name: MARIADB_EXTRA_FLAGS - value: "--max_allowed_packet=134217728 --max_connections=1024" - - name: MARIADB_CHARACTER_SET - value: utf8mb4 - - name: MARIADB_COLLATE - value: utf8mb4_unicode_ci - -ingress: - enabled: false -ingress-nginx: - enabled: false -cert-manager: - enabled: false -service: - enabled: true - type: NodePort - port: 8080 - nodePort: 30001 -prometheus: - enabled: true - extraScrapeConfigs: | - - job_name: 'oncall-exporter' - metrics_path: /metrics/ - static_configs: - - targets: - - oncall-dev-engine.default.svc.cluster.local:8080 diff --git a/.github/kind.yml b/.github/kind.yml deleted file mode 100644 index c61ac446..00000000 --- a/.github/kind.yml +++ /dev/null @@ -1,19 +0,0 @@ -kind: Cluster -apiVersion: kind.x-k8s.io/v1alpha4 -nodes: - - role: control-plane - extraPortMappings: - - containerPort: 30001 - hostPort: 30001 - - containerPort: 30002 - hostPort: 30002 - # https://stackoverflow.com/a/62695918 - extraMounts: - # this basically mounts our local ./grafana-plugin (frontend) directory into the kind node - # so that we can later use a volumeMount to mount from the kind-control-plane Docker container -> grafana - # k8s pod. This will allow us to mount the current frontend source code - # - # NOTE: this is a bit hacky and implies that kind create is run from the root of the project - # but for now it works... alternative would be to use something like $(pwd)/grafana-plugin - - hostPath: ./grafana-plugin - containerPath: /oncall-plugin diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index f881e152..4aa5df57 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -33,7 +33,7 @@ jobs: # default "ubuntu-latest" runners only provide 2 CPU cores + 7GB of RAM. this seems to lead to HTTP 504s from # the oncall backend, and hence, flaky tests. Let's use CI runners w/ more resources to avoid this (plus # this will allow us to run more backend containers and parralelize the tests) - runs-on: ubuntu-latest-8-cores + runs-on: ubuntu-latest-16-cores name: "Grafana: ${{ inputs.grafana-image-tag }}" environment: name: github-pages @@ -44,15 +44,6 @@ jobs: - name: Checkout uses: actions/checkout@v3 - # TODO: re-enable this when we get the docker build build-context caching working.. see other TODO comment below - # - uses: actions/setup-python@v4 - # with: - # python-version: "3.11.4" - # cache: "pip" - # cache-dependency-path: | - # engine/requirements.txt - # engine/requirements-dev.txt - - name: Collect Workflow Telemetry uses: runforesight/workflow-telemetry-action@v1 with: @@ -60,10 +51,11 @@ jobs: proc_trace_chart_show: false proc_trace_table_show: false - - name: Create k8s Kind Cluster + - name: Install Kind uses: helm/kind-action@v1.3.0 with: - config: ./.github/kind.yml + config: ./dev/kind.yml + install_only: true - uses: actions/setup-node@v3 with: @@ -71,6 +63,17 @@ jobs: cache: "yarn" cache-dependency-path: grafana-plugin/yarn.lock + - name: Install Tilt + run: | + curl -fsSL https://raw.githubusercontent.com/tilt-dev/tilt/master/scripts/install.sh | bash + + - name: Install ctlptl + run: | + CTLPTL_VERSION="0.8.20" + CTLPTL_FILE_NAME="ctlptl.$CTLPTL_VERSION.linux.x86_64.tar.gz" + curl -fsSL https://github.com/tilt-dev/ctlptl/releases/download/v$CTLPTL_VERSION/$CTLPTL_FILE_NAME | \ + tar -xzv -C /usr/local/bin ctlptl + - name: Use cached frontend dependencies id: cache-frontend-dependencies uses: actions/cache@v3 @@ -95,42 +98,6 @@ jobs: working-directory: grafana-plugin run: yarn build:dev - - name: Set up Docker Buildx # We need this step for docker caching - uses: docker/setup-buildx-action@v2 - - - name: Build engine Docker image locally - uses: docker/build-push-action@v4 - with: - context: ./engine - file: ./engine/Dockerfile - push: false - tags: oncall/engine:latest - outputs: type=docker,dest=/tmp/oncall-engine.tar - # TODO: figure out how to get this to work.. this will substantially speed up building our docker image here - # because right now most time is spent building wheels for python dependencies - # (even though they rarely change).. this portion "should" work however I haven't yet figured out how to - # get the cache bind mount in engine/Dockerfile to work optionally (ie. when we don't specify - # the --build-context flag to docker build.. otherwise it fails if pip_cache is not available) - # - # references - # https://github.com/moby/buildkit/blob/master/frontend/dockerfile/docs/reference.md#run---mounttypecache - # https://stackoverflow.com/a/71846527 - # build-contexts: pip_cache=/home/runner/.cache/pip - - - name: Load engine Docker image on the nodes of the cluster - run: kind load image-archive --name=chart-testing /tmp/oncall-engine.tar - - - name: Install helm chart - run: | - helm install oncall-ci \ - --values ./.github/helm-values.yml \ - --set oncall.twilio.accountSid="${{ secrets.TWILIO_ACCOUNT_SID }}" \ - --set oncall.twilio.authToken="${{ secrets.TWILIO_AUTH_TOKEN }}" \ - --set oncall.twilio.phoneNumber="\"${{ secrets.TWILIO_PHONE_NUMBER }}"\" \ - --set oncall.twilio.verifySid="${{ secrets.TWILIO_VERIFY_SID }}" \ - --set grafana.image.tag=${{ inputs.grafana-image-tag }} \ - ./helm/oncall - # helpful reference for properly caching the playwright binaries/dependencies # https://playwrightsolutions.com/playwright-github-action-to-cache-the-browser-binaries/ - name: Get installed Playwright version @@ -147,87 +114,34 @@ jobs: path: "~/.cache/ms-playwright" key: ${{ runner.os }}-playwright-${{ env.PLAYWRIGHT_VERSION }}-${{ inputs.browsers }} - # For the next two steps, use the binary directly from node_modules/.bin as opposed to npx playwright - # due to this bug (https://github.com/microsoft/playwright/issues/13188) - - name: Install Playwright Browsers - if: steps.playwright-cache.outputs.cache-hit != 'true' - working-directory: grafana-plugin - run: ./node_modules/.bin/playwright install --with-deps ${{ inputs.browsers }} - - # use the cached browsers, but we still need to install the necessary system dependencies - # (system deps are installed in the cache-miss step above by the --with-deps flag) - - name: Install Playwright System Dependencies - if: steps.playwright-cache.outputs.cache-hit == 'true' - working-directory: grafana-plugin - run: ./node_modules/.bin/playwright install-deps ${{ inputs.browsers }} - - # we could instead use the --wait flag for the helm install command above - # but there's no reason to block on that step - # instead we can let the k8s resources start up behind the scenes and do other - # setup tasks (ex. install playwright + its dependencies) - - name: Wait until k8s resources are ready + - name: Create cluster run: | - kubectl rollout status deployment/oncall-ci-grafana --timeout=300s - kubectl rollout status deployment/oncall-ci-engine --timeout=300s - kubectl rollout status deployment/oncall-ci-celery --timeout=300s + make cluster/up - - name: Run e2e Tests + - name: Install Playwright deps + uses: docker://mcr.microsoft.com/playwright:next-jammy + + - name: Tilt CI + shell: bash env: - # BASE_URL represents what is accessed via a browser - BASE_URL: http://localhost:30002/grafana - # ONCALL_API_URL is what is configured in the plugin configuration form - # it is what the grafana container uses to communicate with the OnCall backend - # - # 172.17.0.1 is the docker bridge network default gateway. Requests originate in the grafana container - # hit 172.17.0.1 which proxies the request onto the host where port 30001 is the node port that is mapped - # to the OnCall API - ONCALL_API_URL: http://172.17.0.1:30001 - GRAFANA_ADMIN_USERNAME: oncall - GRAFANA_ADMIN_PASSWORD: oncall - GRAFANA_EDITOR_USERNAME: editor - GRAFANA_EDITOR_PASSWORD: editor - GRAFANA_VIEWER_USERNAME: viewer - GRAFANA_VIEWER_PASSWORD: viewer - MAILSLURP_API_KEY: ${{ secrets.MAILSLURP_API_KEY }} + GRAFANA_IMAGE_TAG: ${{ inputs.grafana-image-tag }} BROWSERS: ${{ inputs.browsers }} - working-directory: ./grafana-plugin - run: yarn test:e2e + run: tilt ci - - name: Run expensive e2e Tests + - name: Tilt CI - expensive E2E tests if: inputs.run-expensive-tests + shell: bash env: - BASE_URL: http://localhost:30002/grafana - ONCALL_API_URL: http://172.17.0.1:30001 - GRAFANA_ADMIN_USERNAME: oncall - GRAFANA_ADMIN_PASSWORD: oncall - GRAFANA_EDITOR_USERNAME: editor - GRAFANA_EDITOR_PASSWORD: editor - GRAFANA_VIEWER_USERNAME: viewer - GRAFANA_VIEWER_PASSWORD: viewer + E2E_TESTS_CMD: "cd grafana-plugin && yarn test:e2e-expensive" + GRAFANA_IMAGE_TAG: ${{ inputs.grafana-image-tag }} + BROWSERS: ${{ inputs.browsers }} MAILSLURP_API_KEY: ${{ secrets.MAILSLURP_API_KEY }} - working-directory: ./grafana-plugin - run: yarn test:e2e-expensive - - # spit out the engine, celery, and grafana logs, if the the e2e tests have failed (or were flaky) - # can be helpful for debugging these tests - # GitHub Action reference: https://github.com/jupyterhub/action-k8s-namespace-report - - name: oncall-engine logs - if: always() - uses: jupyterhub/action-k8s-namespace-report@v1 - with: - important-workloads: deploy/oncall-ci-engine - - - name: oncall-celery logs - if: always() - uses: jupyterhub/action-k8s-namespace-report@v1 - with: - important-workloads: deploy/oncall-ci-celery - - - name: grafana logs - if: always() - uses: jupyterhub/action-k8s-namespace-report@v1 - with: - important-workloads: deploy/oncall-ci-grafana + TWILIO_ACCOUNT_SID: ${{ secrets.TWILIO_ACCOUNT_SID }} + TWILIO_AUTH_TOKEN: ${{ secrets.TWILIO_AUTH_TOKEN }} + # wrapping single quotes are required to prevent stripping leading "+" from the number + TWILIO_PHONE_NUMBER: '"${{ secrets.TWILIO_PHONE_NUMBER }}"' + TWILIO_VERIFY_SID: ${{ secrets.TWILIO_VERIFY_SID }} + run: tilt ci - name: Setup Pages if: failure() diff --git a/Tiltfile b/Tiltfile index 9cfeb003..a366862f 100644 --- a/Tiltfile +++ b/Tiltfile @@ -2,6 +2,15 @@ load('ext://uibutton', 'cmd_button', 'location', 'text_input', 'bool_input') running_under_parent_tiltfile = os.getenv("TILT_PARENT", "false") == "true" # The user/pass that you will login to Grafana with grafana_admin_user_pass = os.getenv("GRAFANA_ADMIN_USER_PASS", "oncall") +grafana_image_tag = os.getenv("GRAFANA_IMAGE_TAG", "latest") +e2e_tests_cmd=os.getenv("E2E_TESTS_CMD", "cd grafana-plugin && yarn test:e2e") +twilio_values=[ + "oncall.twilio.accountSid=" + os.getenv("TWILIO_ACCOUNT_SID", ""), + "oncall.twilio.authToken=" + os.getenv("TWILIO_AUTH_TOKEN", ""), + "oncall.twilio.phoneNumber=" + os.getenv("TWILIO_PHONE_NUMBER", ""), + "oncall.twilio.verifySid=" + os.getenv("TWILIO_VERIFY_SID", ""), +] +is_ci=config.tilt_subcommand == "ci" # HELM_PREFIX must be "oncall-dev" as it is hardcoded in dev/helm-local.yml HELM_PREFIX = "oncall-dev" # Use docker registery generated by ctlptl (dev/kind-config.yaml) @@ -54,7 +63,6 @@ docker_build_sub( local_resource( "build-ui", labels=["OnCallUI"], - cmd="cd grafana-plugin && yarn install && yarn build:dev", serve_cmd="cd grafana-plugin && yarn watch", allow_parallel=True, ) @@ -62,10 +70,10 @@ local_resource( local_resource( "e2e-tests", labels=["E2eTests"], - cmd="cd grafana-plugin && yarn test:e2e", + cmd=e2e_tests_cmd, trigger_mode=TRIGGER_MODE_MANUAL, - auto_init=False, - resource_deps=["build-ui", "grafana", "grafana-oncall-app-provisioning-configmap", "engine"] + auto_init=is_ci, + resource_deps=["build-ui", "grafana", "grafana-oncall-app-provisioning-configmap", "engine", "celery"] ) cmd_button( @@ -77,7 +85,6 @@ cmd_button( inputs=[ text_input("BROWSERS", "Browsers (e.g. \"chromium,firefox,webkit\")", "chromium", "chromium,firefox,webkit"), text_input("TESTS_FILTER", "Test filter (e.g. \"timezones.test quality.test\")", "", "Test file names to run"), - bool_input("REPORTER", "Use HTML reporter", True, 'html', 'line'), bool_input("STOP_ON_FIRST_FAILURE", "Stop on first failure", True, "-x", ""), ] ) @@ -106,7 +113,7 @@ cmd_button( icon_name="dangerous", ) -yaml = helm("helm/oncall", name=HELM_PREFIX, values=["./dev/helm-local.yml", "./dev/helm-local.dev.yml"]) +yaml = helm("helm/oncall", name=HELM_PREFIX, values=["./dev/helm-local.yml", "./dev/helm-local.dev.yml"], set=twilio_values) k8s_yaml(yaml) @@ -127,6 +134,7 @@ k8s_resource( # Use separate grafana helm chart if not running_under_parent_tiltfile: grafana( + grafana_version=grafana_image_tag, context="grafana-plugin", plugin_files=["grafana-plugin/src/plugin.json"], namespace="default", @@ -161,5 +169,4 @@ k8s_resource( def resource_name(id): return id.name.replace(HELM_PREFIX + "-", "") - workload_to_resource_function(resource_name) diff --git a/dev/helm-local.yml b/dev/helm-local.yml index d33f216c..938b387a 100644 --- a/dev/helm-local.yml +++ b/dev/helm-local.yml @@ -28,14 +28,7 @@ engine: replicaCount: 1 celery: replicaCount: 1 - -ui: - enabled: false - image: - repository: localhost:63628/oncall/ui - env: - ONCALL_API_URL: http://oncall-dev-engine:8080 - MOBILE_APP_QR_INTERVAL_QUEUE: 290000 # 4 minutes and 50 seconds + worker_beat_enabled: false externalGrafana: url: http://grafana:3000 @@ -47,8 +40,6 @@ grafana: domain: localhost:3000 root_url: "%(protocol)s://%(domain)s" replicas: 1 - image: - tag: 10.0.2 extraInitContainers: - name: create-db-if-not-exists image: mysql:8.0.32 @@ -137,6 +128,9 @@ service: nodePort: 30001 prometheus: enabled: true + server: + global: + scrape_interval: 10s extraScrapeConfigs: | - job_name: 'oncall-exporter' metrics_path: /metrics/ diff --git a/grafana-plugin/e2e-tests/insights/insights.test.ts b/grafana-plugin/e2e-tests/insights/insights.test.ts index cf29ee22..9320d290 100644 --- a/grafana-plugin/e2e-tests/insights/insights.test.ts +++ b/grafana-plugin/e2e-tests/insights/insights.test.ts @@ -21,7 +21,7 @@ test.skip( ); test.describe('Insights', () => { - test.beforeAll(async ({ adminRolePage: { page, userName } }) => { + test.beforeAll(async ({ adminRolePage: { page } }) => { const DATASOURCE_NAME = 'OnCall Prometheus'; const DATASOURCE_URL = 'http://oncall-dev-prometheus-server.default.svc.cluster.local'; @@ -37,21 +37,6 @@ test.describe('Insights', () => { await page.getByPlaceholder('http://localhost:9090').fill(DATASOURCE_URL); await clickButton({ page, buttonText: 'Save & test' }); } - - // send alert and resolve to get some values in insights - const escalationChainName = generateRandomValue(); - const integrationName = generateRandomValue(); - const onCallScheduleName = generateRandomValue(); - await createOnCallScheduleWithRotation(page, onCallScheduleName, userName); - await createEscalationChain( - page, - escalationChainName, - EscalationStep.NotifyUsersFromOnCallSchedule, - onCallScheduleName - ); - await createIntegrationAndSendDemoAlert(page, integrationName, escalationChainName); - await resolveFiringAlert(page); - await page.waitForTimeout(5000); }); test('Viewer can see all the panels in OnCall insights', async ({ viewerRolePage: { page } }) => { @@ -69,11 +54,30 @@ test.describe('Insights', () => { }); }); - test('There is no panel that misses data', async ({ adminRolePage: { page } }) => { + test('There is no panel that misses data', async ({ adminRolePage: { page, userName } }) => { + test.setTimeout(90_000); + + // send alert and resolve to get some values in insights + const escalationChainName = generateRandomValue(); + const integrationName = generateRandomValue(); + const onCallScheduleName = generateRandomValue(); + await createOnCallScheduleWithRotation(page, onCallScheduleName, userName); + await createEscalationChain( + page, + escalationChainName, + EscalationStep.NotifyUsersFromOnCallSchedule, + onCallScheduleName + ); + await createIntegrationAndSendDemoAlert(page, integrationName, escalationChainName); + await resolveFiringAlert(page); + // wait for Prometheus to scrape the data + await page.waitForTimeout(5000); + + // check that we have data in insights panels await goToOnCallPage(page, 'insights'); await page.getByText('Last 24 hours').click(); await page.getByText('Last 1 hour').click(); - await page.waitForTimeout(2000); + await page.waitForTimeout(3000); await expect(page.getByText('No data')).toBeHidden(); }); }); diff --git a/grafana-plugin/e2e-tests/utils/navigation.ts b/grafana-plugin/e2e-tests/utils/navigation.ts index 44c042b8..af9435df 100644 --- a/grafana-plugin/e2e-tests/utils/navigation.ts +++ b/grafana-plugin/e2e-tests/utils/navigation.ts @@ -10,7 +10,8 @@ type OnCallPage = | 'outgoing_webhooks' | 'users' | 'users/me' - | 'insights'; + | 'insights' + | 'settings'; const _goToPage = async (page: Page, url = '') => page.goto(`${BASE_URL}${url}`); diff --git a/grafana-plugin/package.json b/grafana-plugin/package.json index 3bc45f0a..ff70411c 100644 --- a/grafana-plugin/package.json +++ b/grafana-plugin/package.json @@ -17,6 +17,7 @@ "test:e2e": "yarn playwright test --grep-invert @expensive", "test:e2e-expensive": "yarn playwright test --grep @expensive", "test:e2e:watch": "yarn test:e2e --ui", + "test:e2e-expensive:watch": "yarn test:e2e-expensive --ui", "test:e2e:gen": "yarn playwright codegen http://localhost:3000", "e2e-show-report": "yarn playwright show-report", "generate-types": "cd ./src/network/oncall-api/types-generator && yarn generate", diff --git a/grafana-plugin/playwright.config.ts b/grafana-plugin/playwright.config.ts index 3d3dff3e..a4bb0316 100644 --- a/grafana-plugin/playwright.config.ts +++ b/grafana-plugin/playwright.config.ts @@ -13,10 +13,6 @@ export const ADMIN_USER_STORAGE_STATE = path.join(__dirname, 'e2e-tests/.auth/ad const IS_CI = !!process.env.CI; const BROWSERS = process.env.BROWSERS || 'chromium'; -const REPORTER_WITH_DEFAULT = process.env.REPORTER || 'html'; -const REPORTER = ( - process.env.REPORTER === 'html' ? [['html', { open: 'never' }]] : REPORTER_WITH_DEFAULT -) as PlaywrightTestConfig['reporter']; const SETUP_PROJECT_NAME = 'setup'; const getEnabledBrowsers = (browsers: PlaywrightTestProject[]) => @@ -31,7 +27,7 @@ export default defineConfig({ /* Maximum time all the tests can run for. */ globalTimeout: 20 * 60 * 1_000, // 20 minutes - reporter: REPORTER, + reporter: [['html', { open: IS_CI ? 'never' : 'always' }]], /* Maximum time one test can run for. */ timeout: 60_000,