Skip to main content
Advertisement

Monitoring and Observability

Track errors with Sentry and collect metrics with Prometheus/Grafana.


Installation

pip install sentry-sdk prometheus-client opentelemetry-sdk opentelemetry-exporter-jaeger opentelemetry-instrumentation-fastapi opentelemetry-instrumentation-sqlalchemy

Sentry — Error Tracking

import sentry_sdk
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.sqlalchemy import SqlalchemyIntegration
from sentry_sdk.integrations.redis import RedisIntegration


def setup_sentry(dsn: str, environment: str) -> None:
    """Initialise the Sentry SDK with tracing, profiling and integrations.

    Args:
        dsn: Project DSN from the Sentry dashboard.
        environment: Deployment name, e.g. "production" or "staging".
    """
    sentry_sdk.init(
        dsn=dsn,
        environment=environment,  # "production", "staging"
        # Sample 10% of transactions/profiles to bound overhead and quota use.
        traces_sample_rate=0.1,
        profiles_sample_rate=0.1,
        integrations=[
            # "endpoint" groups transactions by route handler rather than raw URL.
            FastApiIntegration(
                transaction_style="endpoint",
            ),
            SqlalchemyIntegration(),
            RedisIntegration(),
        ],
        # Scrub credential-bearing headers before events leave the process.
        before_send=filter_sensitive_data,
    )


def filter_sensitive_data(event, hint):
    """Sentry ``before_send`` hook: strip credential-bearing request headers.

    Args:
        event: Sentry event dict; mutated in place.
        hint: Extra context supplied by the SDK (unused).

    Returns:
        The event (so it is still sent after scrubbing).
    """
    request = event.get("request")
    if request is not None:
        headers = request.get("headers", {})
        # Header casing varies by framework/normalization; the original only
        # removed the canonical spellings and leaked lower-cased variants.
        for name in ("Authorization", "Cookie", "authorization", "cookie"):
            headers.pop(name, None)
    return event


# FastAPI integration
from fastapi import FastAPI
import os

app = FastAPI()

# Configure Sentry once at import/startup time.
# SENTRY_DSN is required: os.environ[...] raises KeyError if unset, failing fast.
# ENVIRONMENT is optional and defaults to "production".
setup_sentry(
dsn=os.environ["SENTRY_DSN"],
environment=os.getenv("ENVIRONMENT", "production"),
)


# Manual error capture
@app.get("/risky")
async def risky_endpoint():
    """Demo endpoint: report a handled exception to Sentry without failing the request."""
    try:
        result = 1 / 0  # always raises; stand-in for fallible work
    except ZeroDivisionError as e:
        # Send the exception to Sentry but return a graceful payload to the client.
        sentry_sdk.capture_exception(e)
        return {"error": "Calculation failed"}
    # Reached only if the computation above succeeds.
    return {"result": result}


# Add custom context
def process_order(order_id: int, user_id: int) -> None:
    """Process one order, attaching searchable context to any Sentry event emitted inside."""
    with sentry_sdk.new_scope() as scope:
        # Who/what context: scoped to this block only, not leaked globally.
        scope.set_user({"id": user_id})
        scope.set_tag("order_id", order_id)
        scope.set_context("order", {"id": order_id, "status": "processing"})

        # ... order processing logic
        sentry_sdk.capture_message("Order processed successfully", level="info")

Prometheus — Metrics Collection

from prometheus_client import (
Counter, Histogram, Gauge, Summary,
start_http_server, make_asgi_app,
CollectorRegistry, multiprocess,
)
import time
from fastapi import FastAPI, Request, Response


# ── Metric definitions ────────────────────────────────────
# Counter: monotonically increasing total; derive per-second rates with rate() in PromQL.
REQUEST_COUNT = Counter(
    "http_requests_total",
    "Total HTTP request count",
    labelnames=["method", "endpoint", "status_code"],
)

# Histogram: bucketed observations; quantiles come from histogram_quantile() at query time.
REQUEST_DURATION = Histogram(
    "http_request_duration_seconds",
    "HTTP request processing time",
    labelnames=["method", "endpoint"],
    buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
)

# Gauge: a value that can rise and fall, e.g. current in-flight connections.
ACTIVE_CONNECTIONS = Gauge(
    "active_connections",
    "Number of currently active connections",
)

DB_QUERY_DURATION = Histogram(
    "db_query_duration_seconds",
    "DB query execution time",
    labelnames=["operation", "table"],
)

# Cache effectiveness: hit ratio = hits / (hits + misses) per cache_name.
CACHE_HITS = Counter("cache_hits_total", "Cache hit count", labelnames=["cache_name"])
CACHE_MISSES = Counter("cache_misses_total", "Cache miss count", labelnames=["cache_name"])


# ── FastAPI middleware ────────────────────────────────────
app = FastAPI()


@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Record request count, latency and in-flight connections for every HTTP request."""
    # NOTE(review): request.url.path is an unbounded label value (e.g. /users/123);
    # consider labeling with the matched route template to keep cardinality low.
    endpoint = request.url.path
    method = request.method

    ACTIVE_CONNECTIONS.inc()
    start = time.perf_counter()

    try:
        response: Response = await call_next(request)
        duration = time.perf_counter() - start

        REQUEST_COUNT.labels(
            method=method,
            endpoint=endpoint,
            status_code=response.status_code,
        ).inc()

        REQUEST_DURATION.labels(
            method=method,
            endpoint=endpoint,
        ).observe(duration)

        return response
    finally:
        # Runs even when call_next raises, keeping the in-flight gauge balanced.
        # (Exceptions are not counted in REQUEST_COUNT; only completed responses are.)
        ACTIVE_CONNECTIONS.dec()


# Expose /metrics for Prometheus to scrape (matches metrics_path in prometheus.yml).
# make_asgi_app() serves the default registry in the text exposition format.
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)


# ── Business metrics ──────────────────────────────────────
# Counters only ever increase; the Gauge tracks a level that moves both ways.
ORDER_CREATED = Counter("orders_created_total", "Number of orders created", labelnames=["type"])
ORDER_REVENUE = Counter("order_revenue_total", "Total revenue")
INVENTORY_LEVEL = Gauge("inventory_level", "Inventory quantity", labelnames=["product_id"])


@app.post("/orders")
async def create_order(order_type: str, amount: float, product_id: str):
    """Create an order and update the business metrics accordingly."""
    # ... order processing
    ORDER_CREATED.labels(type=order_type).inc()
    ORDER_REVENUE.inc(amount)  # Counter.inc() accepts a float increment
    INVENTORY_LEVEL.labels(product_id=product_id).dec()
    return {"status": "created"}

prometheus.yml — Scrape Configuration

# prometheus.yml
global:
  scrape_interval: 15s       # how often targets are scraped
  evaluation_interval: 15s   # how often alert rules are evaluated

scrape_configs:
  - job_name: "fastapi-app"
    static_configs:
      - targets: ["app:8000"]
    metrics_path: /metrics   # must match the path mounted by make_asgi_app()

  - job_name: "postgres"
    static_configs:
      - targets: ["postgres-exporter:9187"]

  - job_name: "redis"
    static_configs:
      - targets: ["redis-exporter:9121"]

# Where Prometheus sends firing alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Rule files evaluated every evaluation_interval.
rule_files:
  - "alerts.yml"

Prometheus Alert Rules (referenced from prometheus.yml's rule_files; visualized and managed in Grafana)

# alerts.yml
groups:
  - name: app_alerts
    rules:
      - alert: HighErrorRate
        # Ratio of 5xx responses to all responses over the last 5 minutes.
        expr: |
          rate(http_requests_total{status_code=~"5.."}[5m]) /
          rate(http_requests_total[5m]) > 0.05
        for: 2m          # condition must hold for 2 minutes before firing
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "5xx error rate exceeded 5% (current: {{ $value | humanizePercentage }})"

      - alert: SlowResponseTime
        # P95 latency computed from the request-duration histogram buckets.
        expr: |
          histogram_quantile(0.95,
            rate(http_request_duration_seconds_bucket[5m])
          ) > 1.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Response time degraded"
          description: "P95 response time exceeded 1 second"

OpenTelemetry — Distributed Tracing

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor


def setup_tracing(service_name: str, jaeger_host: str = "localhost") -> None:
    """Configure OpenTelemetry tracing exported to a Jaeger agent.

    Args:
        service_name: Name this service reports to Jaeger.
        jaeger_host: Hostname of the Jaeger agent (compact-thrift UDP, port 6831).
    """
    # Imported locally so this edit is self-contained within the function.
    from opentelemetry.sdk.resources import SERVICE_NAME, Resource

    # Bug fix: service_name was accepted but never used, so every service
    # appeared in Jaeger under the SDK's default name. Attach it as a Resource.
    provider = TracerProvider(
        resource=Resource.create({SERVICE_NAME: service_name})
    )
    jaeger_exporter = JaegerExporter(
        agent_host_name=jaeger_host,
        agent_port=6831,
    )
    # Batch spans in a background thread instead of exporting one at a time.
    provider.add_span_processor(BatchSpanProcessor(jaeger_exporter))
    trace.set_tracer_provider(provider)

    # Auto-instrumentation for the web framework and the ORM.
    FastAPIInstrumentor.instrument_app(app)
    SQLAlchemyInstrumentor().instrument()


# Module-level tracer for manual spans; named after this module.
tracer = trace.get_tracer(__name__)


@app.get("/users/{user_id}")
async def get_user(user_id: int):
    """Fetch a user, recording a manual span with a nested DB-query child span."""
    with tracer.start_as_current_span("fetch-user") as span:
        # Attribute makes the span searchable by user id in the tracing UI.
        span.set_attribute("user.id", user_id)

        with tracer.start_as_current_span("db-query"):
            # DB query (auto-instrumented)
            user = {"id": user_id, "name": "Alice"}

        return user

Summary

| Tool          | Observability Pillar     | Role                                        |
|---------------|--------------------------|---------------------------------------------|
| Sentry        | Error tracking           | Exception capture, stack traces, alerts     |
| Prometheus    | Metrics                  | Numeric time-series data                    |
| Grafana       | Visualization + alerts   | Dashboards, threshold alerts                |
| OpenTelemetry | Distributed tracing      | Request flow across microservices           |

The three pillars of observability: Logs + Metrics + Traces

Advertisement