From fa3413d2a90bf8690f77f373337ac8c51caf7199 Mon Sep 17 00:00:00 2001 From: Ildar Iskhakov Date: Mon, 19 Dec 2022 17:13:52 +0800 Subject: [PATCH] Add tracing support --- .markdownlintignore | 8 ++++ dev/.env.traces.dev | 3 ++ dev/README.md | 9 +++++ dev/conf/agent.yaml | 16 ++++++++ dev/conf/grafana-datasources.yaml | 30 +++++++++++++++ dev/conf/tempo-local.yaml | 62 +++++++++++++++++++++++++++++++ docker-compose-developer.yml | 30 +++++++++++++++ engine/engine/celery.py | 17 +++++++++ engine/engine/wsgi.py | 18 +++++++++ engine/requirements.txt | 4 ++ engine/settings/base.py | 3 ++ 11 files changed, 200 insertions(+) create mode 100644 .markdownlintignore create mode 100644 dev/.env.traces.dev create mode 100644 dev/conf/agent.yaml create mode 100644 dev/conf/grafana-datasources.yaml create mode 100644 dev/conf/tempo-local.yaml diff --git a/.markdownlintignore b/.markdownlintignore new file mode 100644 index 00000000..0e47d8af --- /dev/null +++ b/.markdownlintignore @@ -0,0 +1,8 @@ +*.pyc +venv +.python-version + +.vscode +.idea +.DS_Store +.env diff --git a/dev/.env.traces.dev b/dev/.env.traces.dev new file mode 100644 index 00000000..bc2f6093 --- /dev/null +++ b/dev/.env.traces.dev @@ -0,0 +1,3 @@ +OTEL_TRACING_ENABLED=True +OTEL_SERVICE_NAME=oncall +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \ No newline at end of file diff --git a/dev/README.md b/dev/README.md index 4ffbfebc..33f18380 100644 --- a/dev/README.md +++ b/dev/README.md @@ -69,6 +69,7 @@ The possible profiles values are: - `rabbitmq` - `postgres` - `mysql` +- `tracing` The default is `engine,oncall_ui,redis,grafana`. This runs: @@ -172,6 +173,14 @@ export DRONE_TOKEN= drone sign --save grafana/oncall .drone.yml ``` +## Tracing setup + +Run these steps to enable tracing in your local deployment + +1. Add `tracing` to COMPOSE_PROFILES variable (more in [`COMPOSE_PROFILES`](#compose_profiles)) +2. Copy content of `.env.traces.dev` to your env.dev +3. 
Start the application and check tracing spans at [Grafana Explore Tab](http://localhost:3000/explore), datasource Tempo + ## Troubleshooting ### ld: library not found for -lssl diff --git a/dev/conf/agent.yaml b/dev/conf/agent.yaml new file mode 100644 index 00000000..24945c99 --- /dev/null +++ b/dev/conf/agent.yaml @@ -0,0 +1,16 @@ +server: + log_level: debug + +traces: + configs: + - name: default + receivers: + otlp: + protocols: + grpc: + remote_write: + - endpoint: tempo:4317 + insecure: true + batch: + timeout: 5s + send_batch_size: 100 \ No newline at end of file diff --git a/dev/conf/grafana-datasources.yaml b/dev/conf/grafana-datasources.yaml new file mode 100644 index 00000000..4a3bc2c4 --- /dev/null +++ b/dev/conf/grafana-datasources.yaml @@ -0,0 +1,30 @@ +apiVersion: 1 + +datasources: +- name: Prometheus + type: prometheus + uid: prometheus + access: proxy + orgId: 1 + url: http://prometheus:9090 + basicAuth: false + isDefault: false + version: 1 + editable: false + jsonData: + httpMethod: GET +- name: Tempo + type: tempo + access: proxy + orgId: 1 + url: http://tempo:3200 + basicAuth: false + isDefault: true + version: 1 + editable: false + apiVersion: 1 + uid: tempo + jsonData: + httpMethod: GET + serviceMap: + datasourceUid: prometheus diff --git a/dev/conf/tempo-local.yaml b/dev/conf/tempo-local.yaml new file mode 100644 index 00000000..21440817 --- /dev/null +++ b/dev/conf/tempo-local.yaml @@ -0,0 +1,62 @@ +search_enabled: true +metrics_generator_enabled: true + +server: + http_listen_port: 3200 + +distributor: + receivers: # this configuration will listen on all ports and protocols that tempo is capable of. + jaeger: # the receivers all come from the OpenTelemetry collector. more configuration information can + protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver + thrift_http: # + grpc: # for a production deployment you should only enable the receivers you need! 
+ thrift_binary: + thrift_compact: + zipkin: + otlp: + protocols: + http: + grpc: + opencensus: + +ingester: + trace_idle_period: 10s # the length of time after a trace has not received spans to consider it complete and flush it + max_block_bytes: 1_000_000 # cut the head block when it hits this size or ... + max_block_duration: 5m # this much time passes + +compactor: + compaction: + compaction_window: 1h # blocks in this time window will be compacted together + max_block_bytes: 100_000_000 # maximum size of compacted blocks + block_retention: 1h + compacted_block_retention: 10m + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: docker-compose + storage: + path: /tmp/tempo/generator/wal + remote_write: + - url: http://prometheus:9090/api/v1/write + send_exemplars: true + +storage: + trace: + backend: local # backend configuration to use + block: + bloom_filter_false_positive: .05 # bloom filter false positive rate. lower values create larger filters but fewer false positives + index_downsample_bytes: 1000 # number of bytes per index record + encoding: zstd # block encoding/compression. options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + wal: + path: /tmp/tempo/wal # where to store the wal locally + encoding: snappy # wal encoding/compression. 
options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd, s2 + local: + path: /tmp/tempo/blocks + pool: + max_workers: 100 # worker pool determines the number of parallel requests to the object store backend + queue_depth: 10000 + +overrides: + metrics_generator_processors: [service-graphs, span-metrics] \ No newline at end of file diff --git a/docker-compose-developer.yml b/docker-compose-developer.yml index 55884d05..8f00e0f2 100644 --- a/docker-compose-developer.yml +++ b/docker-compose-developer.yml @@ -278,6 +278,7 @@ services: volumes: - grafanadata_dev:/var/lib/grafana - ./grafana-plugin:/var/lib/grafana/plugins/grafana-plugin + - ./dev/conf/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml depends_on: postgres: condition: service_healthy @@ -286,6 +287,33 @@ services: profiles: - grafana + agent: + image: grafana/agent:v0.27.1 + volumes: + - ./dev/conf/agent.yaml:/etc/agent.yaml + entrypoint: + - /bin/agent + - -config.file=/etc/agent.yaml + ports: + - "4317:4317" + profiles: + - tracing + + tempo: + image: grafana/tempo:latest + command: [ "-config.file=/etc/tempo.yaml" ] + volumes: + - ./dev/conf/tempo-local.yaml:/etc/tempo.yaml + - tempodata_dev:/tmp/tempo + ports: + - "14268" # jaeger ingest + - "3200" # tempo + - "4317" # otlp grpc + - "4318" # otlp http + - "9411" # zipkin + profiles: + - tracing + volumes: redisdata_dev: labels: *oncall-labels @@ -297,6 +325,8 @@ volumes: labels: *oncall-labels mysqldata_dev: labels: *oncall-labels + tempodata_dev: + labels: *oncall-labels networks: default: diff --git a/engine/engine/celery.py b/engine/engine/celery.py index c023eb6e..39a4f03f 100644 --- a/engine/engine/celery.py +++ b/engine/engine/celery.py @@ -7,6 +7,12 @@ from celery.app.log import TaskFormatter from celery.utils.debug import memdump, sample_mem from celery.utils.log import get_task_logger from django.conf import settings +from opentelemetry import trace +from 
opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.celery import CeleryInstrumentor +from opentelemetry.instrumentation.pymysql import PyMySQLInstrumentor +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.prod") @@ -50,6 +56,18 @@ def on_after_setup_logger(logger, **kwargs): ) +if settings.OTEL_TRACING_ENABLED and settings.OTEL_EXPORTER_OTLP_ENDPOINT: + + @celery.signals.worker_process_init.connect(weak=False) + def init_celery_tracing(*args, **kwargs): + """Set up OTLP span export and PyMySQL/Celery instrumentation on worker process init.""" + trace.set_tracer_provider(TracerProvider()) + span_processor = BatchSpanProcessor(OTLPSpanExporter()) + trace.get_tracer_provider().add_span_processor(span_processor) + PyMySQLInstrumentor().instrument() + CeleryInstrumentor().instrument() + + if settings.DEBUG_CELERY_TASKS_PROFILING: @celery.signals.task_prerun.connect diff --git a/engine/engine/wsgi.py b/engine/engine/wsgi.py index 05f06afd..9dcae1f9 100644 --- a/engine/engine/wsgi.py +++ b/engine/engine/wsgi.py @@ -9,10 +9,28 @@ https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/ import os +from django.conf import settings from django.core.wsgi import get_wsgi_application +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.pymysql import PyMySQLInstrumentor +from opentelemetry.instrumentation.wsgi import OpenTelemetryMiddleware +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from uwsgidecorators import postfork from whitenoise import WhiteNoise os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.prod") application = get_wsgi_application() application = WhiteNoise(application) + +if settings.OTEL_TRACING_ENABLED and settings.OTEL_EXPORTER_OTLP_ENDPOINT: + application 
= OpenTelemetryMiddleware(application) + + @postfork + def init_tracing(): + trace.set_tracer_provider(TracerProvider()) + span_processor = BatchSpanProcessor(OTLPSpanExporter()) + trace.get_tracer_provider().add_span_processor(span_processor) + PyMySQLInstrumentor().instrument() diff --git a/engine/requirements.txt b/engine/requirements.txt index 3e45d9a4..f179c8d9 100644 --- a/engine/requirements.txt +++ b/engine/requirements.txt @@ -41,3 +41,7 @@ psycopg2-binary==2.9.3 emoji==1.7.0 regex==2021.11.2 psutil==5.9.4 +opentelemetry-instrumentation-celery==0.36b0 +opentelemetry-instrumentation-pymysql==0.36b0 +opentelemetry-instrumentation-wsgi==0.36b0 +opentelemetry-exporter-otlp-proto-grpc==1.15.0 diff --git a/engine/settings/base.py b/engine/settings/base.py index e5aac426..96737486 100644 --- a/engine/settings/base.py +++ b/engine/settings/base.py @@ -41,6 +41,9 @@ DEBUG = False DEBUG_CELERY_TASKS_PROFILING = getenv_boolean("DEBUG_CELERY_TASKS_PROFILING", False) +OTEL_TRACING_ENABLED = getenv_boolean("OTEL_TRACING_ENABLED", False) +OTEL_EXPORTER_OTLP_ENDPOINT = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") + ALLOWED_HOSTS = [item.strip() for item in os.environ.get("ALLOWED_HOSTS", "*").split(",")] # TODO: update link to up-to-date docs