Skip to main content
Advertisement

Monitoring and Observability

Track errors with Sentry and collect metrics with Prometheus/Grafana.


Installation

pip install sentry-sdk prometheus-client opentelemetry-sdk opentelemetry-exporter-jaeger opentelemetry-instrumentation-fastapi opentelemetry-instrumentation-sqlalchemy

Sentry — Error Tracking

import sentry_sdk
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.sqlalchemy import SqlalchemyIntegration
from sentry_sdk.integrations.redis import RedisIntegration


def setup_sentry(dsn: str, environment: str) -> None:
    """Initialise the Sentry SDK with tracing, profiling and integrations.

    Args:
        dsn: Project DSN from the Sentry dashboard.
        environment: Deployment name, e.g. "production" or "staging".
    """
    sentry_sdk.init(
        dsn=dsn,
        environment=environment,  # "production", "staging"
        # Sample 10% of transactions/profiles to bound overhead and quota use.
        traces_sample_rate=0.1,
        profiles_sample_rate=0.1,
        integrations=[
            # "endpoint" groups transactions by route handler rather than raw URL.
            FastApiIntegration(
                transaction_style="endpoint",
            ),
            SqlalchemyIntegration(),
            RedisIntegration(),
        ],
        # Scrub credential-bearing headers before events leave the process.
        before_send=filter_sensitive_data,
    )


def filter_sensitive_data(event, hint):
    """Sentry ``before_send`` hook: strip credential-bearing request headers.

    Args:
        event: Sentry event dict; mutated in place.
        hint: Extra context supplied by the SDK (unused).

    Returns:
        The event (so it is still sent after scrubbing).
    """
    request = event.get("request")
    if request is not None:
        headers = request.get("headers", {})
        # Header casing varies by framework/normalization; the original only
        # removed the canonical spellings and leaked lower-cased variants.
        for name in ("Authorization", "Cookie", "authorization", "cookie"):
            headers.pop(name, None)
    return event


# FastAPI integration
from fastapi import FastAPI
import os

app = FastAPI()

# Configure Sentry once at import/startup time.
# SENTRY_DSN is required: os.environ[...] raises KeyError if unset, failing fast.
# ENVIRONMENT is optional and defaults to "production".
setup_sentry(
dsn=os.environ["SENTRY_DSN"],
environment=os.getenv("ENVIRONMENT", "production"),
)


# Manual error capture
@app.get("/risky")
async def risky_endpoint():
    """Demo endpoint: report a handled exception to Sentry without failing the request."""
    try:
        result = 1 / 0  # always raises; stand-in for fallible work
    except ZeroDivisionError as e:
        # Send the exception to Sentry but return a graceful payload to the client.
        sentry_sdk.capture_exception(e)
        return {"error": "Calculation failed"}
    # Reached only if the computation above succeeds.
    return {"result": result}


# Add custom context
def process_order(order_id: int, user_id: int) -> None:
    """Process one order, attaching searchable context to any Sentry event emitted inside."""
    with sentry_sdk.new_scope() as scope:
        # Who/what context: scoped to this block only, not leaked globally.
        scope.set_user({"id": user_id})
        scope.set_tag("order_id", order_id)
        scope.set_context("order", {"id": order_id, "status": "processing"})

        # ... order processing logic
        sentry_sdk.capture_message("Order processed successfully", level="info")

Prometheus — Metrics Collection

from prometheus_client import (
Counter, Histogram, Gauge, Summary,
start_http_server, make_asgi_app,
CollectorRegistry, multiprocess,
)
import time
from fastapi import FastAPI, Request, Response


# ── Metric definitions ────────────────────────────────────
# Counter: monotonically increasing total; derive per-second rates with rate() in PromQL.
REQUEST_COUNT = Counter(
    "http_requests_total",
    "Total HTTP request count",
    labelnames=["method", "endpoint", "status_code"],
)

# Histogram: bucketed observations; quantiles come from histogram_quantile() at query time.
REQUEST_DURATION = Histogram(
    "http_request_duration_seconds",
    "HTTP request processing time",
    labelnames=["method", "endpoint"],
    buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
)

# Gauge: a value that can rise and fall, e.g. current in-flight connections.
ACTIVE_CONNECTIONS = Gauge(
    "active_connections",
    "Number of currently active connections",
)

DB_QUERY_DURATION = Histogram(
    "db_query_duration_seconds",
    "DB query execution time",
    labelnames=["operation", "table"],
)

# Cache effectiveness: hit ratio = hits / (hits + misses) per cache_name.
CACHE_HITS = Counter("cache_hits_total", "Cache hit count", labelnames=["cache_name"])
CACHE_MISSES = Counter("cache_misses_total", "Cache miss count", labelnames=["cache_name"])


# ── FastAPI middleware ────────────────────────────────────
app = FastAPI()


@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Record request count, latency and in-flight connections for every HTTP request."""
    # NOTE(review): request.url.path is an unbounded label value (e.g. /users/123);
    # consider labeling with the matched route template to keep cardinality low.
    endpoint = request.url.path
    method = request.method

    ACTIVE_CONNECTIONS.inc()
    start = time.perf_counter()

    try:
        response: Response = await call_next(request)
        duration = time.perf_counter() - start

        REQUEST_COUNT.labels(
            method=method,
            endpoint=endpoint,
            status_code=response.status_code,
        ).inc()

        REQUEST_DURATION.labels(
            method=method,
            endpoint=endpoint,
        ).observe(duration)

        return response
    finally:
        # Runs even when call_next raises, keeping the in-flight gauge balanced.
        # (Exceptions are not counted in REQUEST_COUNT; only completed responses are.)
        ACTIVE_CONNECTIONS.dec()


# Expose /metrics for Prometheus to scrape (matches metrics_path in prometheus.yml).
# make_asgi_app() serves the default registry in the text exposition format.
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)


# ── Business metrics ──────────────────────────────────────
# Counters only ever increase; the Gauge tracks a level that moves both ways.
ORDER_CREATED = Counter("orders_created_total", "Number of orders created", labelnames=["type"])
ORDER_REVENUE = Counter("order_revenue_total", "Total revenue")
INVENTORY_LEVEL = Gauge("inventory_level", "Inventory quantity", labelnames=["product_id"])


@app.post("/orders")
async def create_order(order_type: str, amount: float, product_id: str):
    """Create an order and update the business metrics accordingly."""
    # ... order processing
    ORDER_CREATED.labels(type=order_type).inc()
    ORDER_REVENUE.inc(amount)  # Counter.inc() accepts a float increment
    INVENTORY_LEVEL.labels(product_id=product_id).dec()
    return {"status": "created"}

prometheus.yml — Scrape Configuration

# prometheus.yml
global:
  scrape_interval: 15s       # how often targets are scraped
  evaluation_interval: 15s   # how often alert rules are evaluated

scrape_configs:
  - job_name: "fastapi-app"
    static_configs:
      - targets: ["app:8000"]
    metrics_path: /metrics   # must match the path mounted by make_asgi_app()

  - job_name: "postgres"
    static_configs:
      - targets: ["postgres-exporter:9187"]

  - job_name: "redis"
    static_configs:
      - targets: ["redis-exporter:9121"]

# Where Prometheus sends firing alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Rule files evaluated every evaluation_interval.
rule_files:
  - "alerts.yml"

Prometheus Alert Rules (referenced from prometheus.yml's rule_files; visualized and managed in Grafana)

# alerts.yml
groups:
  - name: app_alerts
    rules:
      - alert: HighErrorRate
        # Ratio of 5xx responses to all responses over the last 5 minutes.
        expr: |
          rate(http_requests_total{status_code=~"5.."}[5m]) /
          rate(http_requests_total[5m]) > 0.05
        for: 2m          # condition must hold for 2 minutes before firing
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "5xx error rate exceeded 5% (current: {{ $value | humanizePercentage }})"

      - alert: SlowResponseTime
        # P95 latency computed from the request-duration histogram buckets.
        expr: |
          histogram_quantile(0.95,
            rate(http_request_duration_seconds_bucket[5m])
          ) > 1.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Response time degraded"
          description: "P95 response time exceeded 1 second"

OpenTelemetry — Distributed Tracing

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor


def setup_tracing(service_name: str, jaeger_host: str = "localhost") -> None:
    """Configure OpenTelemetry tracing exported to a Jaeger agent.

    Args:
        service_name: Name this service reports to Jaeger.
        jaeger_host: Hostname of the Jaeger agent (compact-thrift UDP, port 6831).
    """
    # Imported locally so this edit is self-contained within the function.
    from opentelemetry.sdk.resources import SERVICE_NAME, Resource

    # Bug fix: service_name was accepted but never used, so every service
    # appeared in Jaeger under the SDK's default name. Attach it as a Resource.
    provider = TracerProvider(
        resource=Resource.create({SERVICE_NAME: service_name})
    )
    jaeger_exporter = JaegerExporter(
        agent_host_name=jaeger_host,
        agent_port=6831,
    )
    # Batch spans in a background thread instead of exporting one at a time.
    provider.add_span_processor(BatchSpanProcessor(jaeger_exporter))
    trace.set_tracer_provider(provider)

    # Auto-instrumentation for the web framework and the ORM.
    FastAPIInstrumentor.instrument_app(app)
    SQLAlchemyInstrumentor().instrument()


# Module-level tracer for manual spans; named after this module.
tracer = trace.get_tracer(__name__)


@app.get("/users/{user_id}")
async def get_user(user_id: int):
    """Fetch a user, recording a manual span with a nested DB-query child span."""
    with tracer.start_as_current_span("fetch-user") as span:
        # Attribute makes the span searchable by user id in the tracing UI.
        span.set_attribute("user.id", user_id)

        with tracer.start_as_current_span("db-query"):
            # DB query (auto-instrumented)
            user = {"id": user_id, "name": "Alice"}

        return user

Summary

| Tool          | Observability Pillar     | Role                                        |
|---------------|--------------------------|---------------------------------------------|
| Sentry        | Error tracking           | Exception capture, stack traces, alerts     |
| Prometheus    | Metrics                  | Numeric time-series data                    |
| Grafana       | Visualization + alerts   | Dashboards, threshold alerts                |
| OpenTelemetry | Distributed tracing      | Request flow across microservices           |

The three pillars of observability: Logs + Metrics + Traces

Advertisement