모니터링과 관찰 가능성
Sentry로 에러를 추적하고 Prometheus/Grafana로 메트릭을 수집합니다.
설치
pip install sentry-sdk prometheus-client opentelemetry-sdk opentelemetry-instrumentation-fastapi opentelemetry-instrumentation-sqlalchemy opentelemetry-exporter-jaeger
Sentry — 에러 추적
import sentry_sdk
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.sqlalchemy import SqlalchemyIntegration
from sentry_sdk.integrations.redis import RedisIntegration
def setup_sentry(dsn: str, environment: str):
    """Initialize the Sentry SDK with FastAPI/SQLAlchemy/Redis integrations.

    Args:
        dsn: Sentry project DSN.
        environment: Deployment environment, e.g. "production" or "staging".
    """
    sentry_sdk.init(
        dsn=dsn,
        environment=environment,
        # Sample 10% of transactions/profiles to limit overhead and quota use.
        traces_sample_rate=0.1,
        profiles_sample_rate=0.1,
        integrations=[
            # Name transactions after the route template, not the raw URL.
            FastApiIntegration(transaction_style="endpoint"),
            SqlalchemyIntegration(),
            RedisIntegration(),
        ],
        # Strip secrets from every event before it leaves the process.
        before_send=filter_sensitive_data,
    )
def filter_sensitive_data(event, hint):
    """Scrub sensitive request data from a Sentry event before sending.

    HTTP header names are case-insensitive, and ASGI servers commonly
    report them lowercased — the original exact-case pop of
    "Authorization"/"Cookie" let lowercase variants through. This version
    matches case-insensitively and also drops the separate ``cookies``
    field Sentry may attach to the request payload.

    Args:
        event: Sentry event dict (mutated in place).
        hint: Extra context supplied by the SDK (unused).

    Returns:
        The (scrubbed) event, as required by the ``before_send`` hook.
    """
    request = event.get("request")
    if request:
        headers = request.get("headers", {})
        # Iterate over a snapshot since we mutate the dict while scanning.
        for name in list(headers):
            if name.lower() in ("authorization", "cookie"):
                headers.pop(name, None)
        # Cookies can also appear outside the headers mapping.
        request.pop("cookies", None)
    return event
# Wire Sentry into the FastAPI application.
from fastapi import FastAPI
import os

app = FastAPI()

setup_sentry(
    dsn=os.environ["SENTRY_DSN"],  # required — fail fast if unset
    environment=os.getenv("ENVIRONMENT", "production"),
)
# Manual error capture
@app.get("/risky")
async def risky_endpoint():
    """Demonstrate capturing a handled exception in Sentry."""
    try:
        result = 1 / 0
    except ZeroDivisionError as exc:
        # Report to Sentry, but return a friendly payload to the client.
        sentry_sdk.capture_exception(exc)
        return {"error": "계산 실패"}
    return {"result": result}
# Attaching custom context to Sentry events
def process_order(order_id: int, user_id: int):
    """Process a single order, tagging all Sentry events in this scope."""
    with sentry_sdk.new_scope() as scope:
        # Everything captured within this scope carries user/order metadata.
        scope.set_user({"id": user_id})
        scope.set_tag("order_id", order_id)
        scope.set_context("order", {"id": order_id, "status": "processing"})
        # ... order-processing logic
        sentry_sdk.capture_message("주문 처리 완료", level="info")
Prometheus — 메트릭 수집
from prometheus_client import (
Counter, Histogram, Gauge, Summary,
start_http_server, make_asgi_app,
CollectorRegistry, multiprocess,
)
import time
from fastapi import FastAPI, Request, Response
# ── Metric definitions ─────────────────────────────────────
REQUEST_COUNT = Counter(
    "http_requests_total",
    "총 HTTP 요청 수",
    labelnames=["method", "endpoint", "status_code"],
)

REQUEST_DURATION = Histogram(
    "http_request_duration_seconds",
    "HTTP 요청 처리 시간",
    labelnames=["method", "endpoint"],
    # Sub-second buckets; tune to the service's latency SLOs.
    buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
)

ACTIVE_CONNECTIONS = Gauge("active_connections", "현재 활성 연결 수")

DB_QUERY_DURATION = Histogram(
    "db_query_duration_seconds",
    "DB 쿼리 실행 시간",
    labelnames=["operation", "table"],
)

CACHE_HITS = Counter("cache_hits_total", "캐시 히트 수", labelnames=["cache_name"])
CACHE_MISSES = Counter("cache_misses_total", "캐시 미스 수", labelnames=["cache_name"])
# ── FastAPI middleware ─────────────────────────────────────
app = FastAPI()


@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Record request count, latency, and the in-flight connection gauge."""
    path = request.url.path
    verb = request.method
    ACTIVE_CONNECTIONS.inc()
    t0 = time.perf_counter()
    try:
        response: Response = await call_next(request)
        elapsed = time.perf_counter() - t0
        REQUEST_COUNT.labels(
            method=verb, endpoint=path, status_code=response.status_code
        ).inc()
        REQUEST_DURATION.labels(method=verb, endpoint=path).observe(elapsed)
        return response
    finally:
        # Runs on success and when call_next raises, so the gauge never drifts.
        ACTIVE_CONNECTIONS.dec()
# Expose the /metrics endpoint for Prometheus to scrape.
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)
# ── Business metrics ───────────────────────────────────────
ORDER_CREATED = Counter("orders_created_total", "생성된 주문 수", labelnames=["type"])
ORDER_REVENUE = Counter("order_revenue_total", "총 매출액")
INVENTORY_LEVEL = Gauge("inventory_level", "재고 수량", labelnames=["product_id"])
@app.post("/orders")
async def create_order(order_type: str, amount: float, product_id: str):
    """Create an order and update the related business metrics."""
    # ... order processing
    ORDER_CREATED.labels(type=order_type).inc()
    ORDER_REVENUE.inc(amount)
    INVENTORY_LEVEL.labels(product_id=product_id).dec()
    return {"status": "created"}
prometheus.yml — 스크래핑 설정
# prometheus.yml
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often alerting/recording rules are evaluated

scrape_configs:
  - job_name: "fastapi-app"
    metrics_path: /metrics
    static_configs:
      - targets: ["app:8000"]

  - job_name: "postgres"
    static_configs:
      - targets: ["postgres-exporter:9187"]

  - job_name: "redis"
    static_configs:
      - targets: ["redis-exporter:9121"]

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

rule_files:
  - "alerts.yml"
Prometheus 알람 규칙 (alerts.yml — prometheus.yml의 rule_files에서 로드)
# alerts.yml — Prometheus alerting rules
#
# NOTE: the original HighErrorRate expression divided
#   rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m])
# without aggregation. PromQL vector matching pairs series with identical
# label sets, so each 5xx series was divided by ITSELF (ratio always 1).
# Both sides must be sum()-aggregated to get a real error ratio.
groups:
  - name: app_alerts
    rules:
      - alert: HighErrorRate
        # Fraction of all requests answered with 5xx over the last 5 minutes.
        expr: |
          sum(rate(http_requests_total{status_code=~"5.."}[5m])) /
          sum(rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "높은 에러율 감지"
          description: "5xx 에러율이 5%를 초과했습니다 (현재: {{ $value | humanizePercentage }})"

      - alert: SlowResponseTime
        # P95 latency estimated from histogram buckets; aggregate by "le"
        # so the quantile is computed across all methods/endpoints.
        expr: |
          histogram_quantile(0.95,
            sum by (le) (rate(http_request_duration_seconds_bucket[5m]))
          ) > 1.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "응답 시간 지연"
          description: "P95 응답 시간이 1초를 초과했습니다"
OpenTelemetry — 분산 추적
from opentelemetry import trace
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
def setup_tracing(service_name: str, jaeger_host: str = "localhost"):
    """Configure OpenTelemetry to export spans to a Jaeger agent.

    Fix: the original accepted ``service_name`` but never used it, so all
    spans were reported under the SDK's default service name. It is now
    attached to the TracerProvider via a Resource.

    Args:
        service_name: Logical service name shown in the Jaeger UI.
        jaeger_host: Hostname of the Jaeger agent (UDP thrift, port 6831).
    """
    # Local import so this edit stands alone; also provided by the
    # file-level import block.
    from opentelemetry.sdk.resources import Resource

    provider = TracerProvider(
        resource=Resource.create({"service.name": service_name})
    )
    jaeger_exporter = JaegerExporter(
        agent_host_name=jaeger_host,
        agent_port=6831,
    )
    # Batch spans in a background thread instead of exporting one at a time.
    provider.add_span_processor(BatchSpanProcessor(jaeger_exporter))
    trace.set_tracer_provider(provider)
# Automatic instrumentation: spans for every FastAPI request and SQLAlchemy query.
FastAPIInstrumentor.instrument_app(app)
SQLAlchemyInstrumentor().instrument()
tracer = trace.get_tracer(__name__)
@app.get("/users/{user_id}")
async def get_user(user_id: int):
    """Fetch one user, emitting nested spans around the lookup."""
    with tracer.start_as_current_span("fetch-user") as span:
        span.set_attribute("user.id", user_id)
        # Child span around the data access; SQLAlchemy auto-instrumentation
        # adds its own spans underneath this one.
        with tracer.start_as_current_span("db-query"):
            user = {"id": user_id, "name": "Alice"}
        return user
정리
| 도구 | 관찰 가능성 영역 | 역할 |
|---|---|---|
| Sentry | 에러 추적 | 예외 캡처, 스택 트레이스, 알람 |
| Prometheus | 메트릭 | 숫자 기반 시계열 데이터 |
| Grafana | 시각화 + 알람 | 대시보드, 임계값 알람 |
| OpenTelemetry | 분산 추적 | 마이크로서비스 요청 흐름 |
관찰 가능성의 3요소: 로그(Logs) + 메트릭(Metrics) + 트레이스(Traces)