# guardia-itsm/routers/observability.py
#
# 351 lines
# 12 KiB
# Python

from __future__ import annotations
from datetime import datetime, timedelta
from typing import Any, Optional, List
import httpx
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import select, delete
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from database import get_db
from models import User, OtelTraceExt as OtelTrace, SLODefinition, SignalLink
# Router for the observability endpoints defined in this module; every route
# registered on it is served under /api/observability and tagged
# "Observability".
router = APIRouter(prefix="/api/observability", tags=["Observability"])
def _tenant(user: User) -> str:
    """Return the tenant key used to scope rows to *user*.

    The user's ``inst_code`` is returned when it is truthy; otherwise the
    stringified ``id`` is used instead.
    """
    inst_code = user.inst_code
    if inst_code:
        return inst_code
    return str(user.id)
async def _ollama(prompt: str) -> str:
    """Ask the local Ollama server to complete *prompt*.

    Sends a non-streaming request for the ``llama3`` model to
    ``/api/generate`` and returns the ``response`` field of the JSON reply,
    or "분석 결과 없음" when a successful reply lacks that field.

    The helper is best-effort and never raises. Any failure (connection
    error, timeout, non-2xx status, unparsable body) yields a sentinel string
    beginning with "AI 분석 불가", which callers check for to detect failure.
    """
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(
                "http://localhost:11434/api/generate",
                json={"model": "llama3", "prompt": prompt, "stream": False},
            )
            # An error reply (for example an unknown model) has no "response"
            # field. Without this check it would be reported as a successful
            # analysis with placeholder text instead of as a failure.
            resp.raise_for_status()
            return resp.json().get("response", "분석 결과 없음")
    except Exception:
        # Deliberate catch-all: analysis is optional and must not break the
        # request that asked for it.
        return "AI 분석 불가 (Ollama 연결 실패)"
# ── Pydantic 스키마 ────────────────────────────────────────────────────────────
class TraceIn(BaseModel):
    """Request body for ingesting one trace."""

    # Identifier of the trace; stored as-is and used later for lookups.
    trace_id: str
    # Optional service name stored with the trace.
    service_name: Optional[str] = None
    # Free-form span dicts. The endpoints in this file read the keys
    # span_id, parent_span_id, service_name, name, status (a dict with
    # "code"), start_time_unix_nano and end_time_unix_nano.
    spans: list[dict[str, Any]] = []
    # Overall trace status; defaults to "OK" when omitted.
    status: Optional[str] = "OK"
class TraceOut(BaseModel):
    """Summary view of a stored trace, as returned by the trace listing.

    The span payload itself is not part of this view.
    """

    # Allows the model to be validated from objects by attribute access
    # (the listing endpoint passes ORM rows to model_validate).
    model_config = {"from_attributes": True}
    # Row id of the stored trace.
    id: int
    trace_id: str
    service_name: Optional[str]
    # Duration in milliseconds as stored at ingest time; may be null.
    duration_ms: Optional[float]
    status: Optional[str]
    # Timestamp the listing endpoint filters and orders by.
    created_at: datetime
class SLOIn(BaseModel):
    """Request body for registering an SLO definition."""

    # Service the objective applies to.
    service_name: str
    metric_type: str  # availability | latency | error_rate
    # Target expressed as a percentage; defaults to 99.9.
    target_pct: float = 99.9
    # Window length in days; defaults to 30.
    window_days: int = 30
class SLOOut(BaseModel):
    """Response view of a stored SLO definition."""

    # Allows the model to be validated from objects by attribute access
    # (the SLO endpoints pass ORM rows to model_validate).
    model_config = {"from_attributes": True}
    # Row id of the stored definition.
    id: int
    service_name: Optional[str]
    metric_type: Optional[str]
    target_pct: Optional[float]
    window_days: Optional[int]
    # Latest measured value; no writer for this field is visible in this
    # file, so it is presumably populated elsewhere (TODO confirm).
    current_value: Optional[float]
    # Remaining error budget in percent, as last written by the budget
    # endpoint.
    error_budget_pct: Optional[float]
    created_at: datetime
class SignalLinkIn(BaseModel):
    """Request body for linking a trace, a metric and a log reference.

    Every field is optional and defaults to ``None``.
    """

    # Trace identifier to link.
    trace_id: Optional[str] = None
    # Reference to a metric; format is not constrained here.
    metric_ref: Optional[str] = None
    # Reference to a log entry; format is not constrained here.
    log_ref: Optional[str] = None
    service_name: Optional[str] = None
    # Free-form correlation details stored with the link.
    correlation: Optional[dict] = None
# ── 엔드포인트 ─────────────────────────────────────────────────────────────────
def _span_time_ns(value: Any) -> Optional[int]:
    """Coerce a span timestamp to positive integer nanoseconds.

    Returns ``None`` for anything unusable: a missing value, a boolean, a
    value ``int()`` cannot convert, or a non-positive number (0 is what an
    unset timestamp looks like).
    """
    if value is None or isinstance(value, bool):
        return None
    try:
        nanos = int(value)
    except (TypeError, ValueError, OverflowError):
        return None
    return nanos if nanos > 0 else None


def _trace_duration_ms(spans: list[dict[str, Any]]) -> Optional[float]:
    """Return the time covered by *spans* in milliseconds, or ``None``.

    The duration runs from the earliest usable ``start_time_unix_nano`` to
    the latest usable ``end_time_unix_nano``. Unusable timestamps are
    ignored so a single incomplete span cannot drag the start back to 0.
    ``None`` is returned when no usable start or end exists, or when the
    result would be negative.
    """
    starts = []
    ends = []
    for span in spans:
        start = _span_time_ns(span.get("start_time_unix_nano"))
        if start is not None:
            starts.append(start)
        end = _span_time_ns(span.get("end_time_unix_nano"))
        if end is not None:
            ends.append(end)
    if not starts or not ends:
        return None
    elapsed_ns = max(ends) - min(starts)
    if elapsed_ns < 0:
        return None
    return elapsed_ns / 1_000_000


@router.post("/traces", summary="분산 트레이스 수집")
async def ingest_trace(
    body: TraceIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Store one trace for the caller's tenant.

    The trace duration is derived from the span timestamps and is left null
    when the spans carry no usable timestamps.

    Returns a dict with the new row ``id``, the ``trace_id`` and a
    confirmation ``message``.
    """
    trace = OtelTrace(
        tenant_id=_tenant(current_user),
        trace_id=body.trace_id,
        service_name=body.service_name,
        spans=body.spans,
        duration_ms=_trace_duration_ms(body.spans),
        status=body.status,
    )
    db.add(trace)
    await db.commit()
    # Reload so the generated id is available for the response.
    await db.refresh(trace)
    return {"id": trace.id, "trace_id": trace.trace_id, "message": "수집 완료"}
@router.get("/traces", summary="트레이스 목록 조회")
async def list_traces(
    service_name: Optional[str] = None,
    status: Optional[str] = None,
    hours: int = Query(24, ge=1, le=720),
    limit: int = Query(50, ge=1, le=500),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List the caller's recent traces, newest first.

    Only traces created within the last ``hours`` hours are returned, at
    most ``limit`` of them. ``service_name`` and ``status`` are exact-match
    filters applied only when a non-empty value is given.
    """
    tenant = _tenant(current_user)
    cutoff = datetime.utcnow() - timedelta(hours=hours)

    conditions = [
        OtelTrace.tenant_id == tenant,
        OtelTrace.created_at >= cutoff,
    ]
    if service_name:
        conditions.append(OtelTrace.service_name == service_name)
    if status:
        conditions.append(OtelTrace.status == status)

    stmt = (
        select(OtelTrace)
        .where(*conditions)
        .order_by(OtelTrace.created_at.desc())
        .limit(limit)
    )
    result = await db.execute(stmt)
    return [TraceOut.model_validate(record) for record in result.scalars().all()]
def _build_span_tree(spans: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Nest flat span dicts into a forest using ``parent_span_id``.

    Each span that has a ``span_id`` is copied and given a ``children`` list.
    A span becomes a root when it has no parent id, when its parent is not
    in *spans*, or when it names itself as parent. Treating the last two
    cases as roots keeps orphaned spans visible and avoids building a
    self-referencing structure.
    """
    nodes: dict[Any, dict[str, Any]] = {}
    for span in spans:
        span_id = span.get("span_id")
        if span_id:
            nodes[span_id] = {**span, "children": []}

    roots: list[dict[str, Any]] = []
    for span in spans:
        # Spans without an id were not copied; they appear as-is.
        node = nodes.get(span.get("span_id"), span)
        parent_id = span.get("parent_span_id")
        parent = nodes.get(parent_id) if parent_id else None
        if parent is not None and parent is not node:
            parent["children"].append(node)
        else:
            roots.append(node)
    return roots


@router.get("/traces/{trace_id}", summary="트레이스 상세 (스팬 트리)")
async def get_trace(
    trace_id: str,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return one of the caller's traces with its spans nested as a tree.

    Raises:
        HTTPException: 404 when the tenant has no trace with this id.
    """
    tid = _tenant(current_user)
    # NOTE(review): scalar_one_or_none() raises when several rows match, so
    # this assumes (tenant_id, trace_id) is unique. Confirm against the model.
    row = (await db.execute(
        select(OtelTrace).where(OtelTrace.tenant_id == tid, OtelTrace.trace_id == trace_id)
    )).scalar_one_or_none()
    if not row:
        raise HTTPException(404, "트레이스를 찾을 수 없습니다")
    return {
        "trace_id": row.trace_id,
        "service_name": row.service_name,
        "duration_ms": row.duration_ms,
        "status": row.status,
        "span_tree": _build_span_tree(row.spans or []),
    }
@router.get("/service-map", summary="서비스 의존성 자동 맵핑")
async def service_map(
    hours: int = Query(24, ge=1, le=720),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Derive a service dependency graph from the caller's recent traces.

    Every span counts as one call on its service node, and as one error when
    its status code is 2. An edge is counted for each parent/child span pair
    within the same trace whose services differ.

    Returns a dict with ``nodes``, ``edges`` and the ``period_hours`` used.
    """
    tid = _tenant(current_user)
    since = datetime.utcnow() - timedelta(hours=hours)
    rows = (await db.execute(
        select(OtelTrace).where(
            OtelTrace.tenant_id == tid,
            OtelTrace.created_at >= since,
        )
    )).scalars().all()

    nodes: dict[str, dict] = {}
    edges: dict[tuple, dict] = {}
    for trace in rows:
        spans = trace.spans or []
        # span_id -> service name, scoped to this trace only.
        span_svc: dict[str, str] = {}
        for s in spans:
            sid = s.get("span_id")
            # Fall back to the trace-level service, then to "unknown".
            svc = s.get("service_name") or trace.service_name or "unknown"
            if sid:
                span_svc[sid] = svc
            node = nodes.setdefault(
                svc, {"name": svc, "call_count": 0, "error_count": 0}
            )
            node["call_count"] += 1
            # Spans are free-form input: "status" may be null or not a dict,
            # so check the type before reading the code.
            status = s.get("status")
            if isinstance(status, dict) and status.get("code") == 2:
                node["error_count"] += 1
        for s in spans:
            pid = s.get("parent_span_id")
            sid = s.get("span_id")
            if not (pid and sid):
                continue
            src = span_svc.get(pid)
            dst = span_svc.get(sid)
            # Only cross-service calls become edges.
            if src and dst and src != dst:
                edge = edges.setdefault(
                    (src, dst), {"source": src, "target": dst, "call_count": 0}
                )
                edge["call_count"] += 1
    return {
        "nodes": list(nodes.values()),
        "edges": list(edges.values()),
        "period_hours": hours,
    }
@router.post("/slo", summary="SLO 정의 등록")
async def create_slo(
    body: SLOIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Register an SLO definition for the caller's tenant.

    Raises:
        HTTPException: 400 when ``metric_type`` is not one of the supported
            values, or when ``target_pct`` lies outside 0..100.
    """
    valid_types = ("availability", "latency", "error_rate")
    if body.metric_type not in valid_types:
        # Sorted and joined so the message is stable; a set repr would list
        # the values in an order that can vary between processes.
        allowed = ", ".join(sorted(valid_types))
        raise HTTPException(400, f"metric_type은 {allowed} 중 하나여야 합니다")
    # The budget calculation uses 100 - target_pct, so a target outside
    # 0..100 can never yield a meaningful budget. The chained comparison also
    # rejects NaN.
    if not 0 <= body.target_pct <= 100:
        raise HTTPException(400, "target_pct는 0 이상 100 이하여야 합니다")

    slo = SLODefinition(
        tenant_id=_tenant(current_user),
        service_name=body.service_name,
        metric_type=body.metric_type,
        target_pct=body.target_pct,
        window_days=body.window_days,
    )
    db.add(slo)
    await db.commit()
    # Reload so generated columns are available for the response.
    await db.refresh(slo)
    return SLOOut.model_validate(slo)
@router.get("/slo", summary="SLO 현황 목록")
async def list_slos(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return every SLO definition that belongs to the caller's tenant."""
    tenant = _tenant(current_user)
    stmt = select(SLODefinition).where(SLODefinition.tenant_id == tenant)
    result = await db.execute(stmt)
    definitions = result.scalars().all()
    return [SLOOut.model_validate(definition) for definition in definitions]
@router.get("/slo/{slo_id}/budget", summary="에러 예산 소진율 조회")
async def slo_budget(
    slo_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Report how much of an SLO's error budget is consumed and remaining.

    Side effect: the remaining budget is written to the SLO's
    ``error_budget_pct`` and committed.

    NOTE(review): the formula treats ``current_value`` as a percentage where
    higher is better (availability-style). Confirm this is intended for
    latency and error_rate SLOs.

    Raises:
        HTTPException: 404 when the tenant has no SLO with this id.
    """
    tid = _tenant(current_user)
    slo = (await db.execute(
        select(SLODefinition).where(SLODefinition.tenant_id == tid, SLODefinition.id == slo_id)
    )).scalar_one_or_none()
    if not slo:
        raise HTTPException(404, "SLO를 찾을 수 없습니다")

    # Only a missing measurement falls back to the target. A measured 0.0 is
    # a real value (total outage) and must not be read as "on target".
    current = slo.target_pct if slo.current_value is None else slo.current_value
    allowed_error = 100.0 - slo.target_pct
    actual_error = max(0.0, 100.0 - current)
    if allowed_error > 0:
        consumed_pct = actual_error / allowed_error * 100
    else:
        # No budget exists at a 100% target, so any error at all exhausts it.
        consumed_pct = 100.0 if actual_error > 0 else 0.0
    remaining_pct = max(0.0, 100.0 - consumed_pct)

    slo.error_budget_pct = remaining_pct
    await db.commit()

    if remaining_pct > 25:
        status = "HEALTHY"
    elif remaining_pct > 0:
        status = "WARNING"
    else:
        status = "EXHAUSTED"
    return {
        "slo_id": slo_id,
        "service_name": slo.service_name,
        "target_pct": slo.target_pct,
        "current_value": current,
        "error_budget_total_pct": allowed_error,
        "error_budget_consumed_pct": round(consumed_pct, 2),
        "error_budget_remaining_pct": round(remaining_pct, 2),
        "status": status,
    }
@router.post("/metrics-logs-link", summary="메트릭-로그-트레이스 상관관계 링크")
async def create_signal_link(
    body: SignalLinkIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Persist a link between a trace, a metric and a log reference.

    The link is stored under the caller's tenant with the request fields
    copied over unchanged. Returns the new row ``id`` and a confirmation
    ``message``.
    """
    copied_fields = ("trace_id", "metric_ref", "log_ref", "service_name", "correlation")
    payload = {field: getattr(body, field) for field in copied_fields}
    record = SignalLink(tenant_id=_tenant(current_user), **payload)
    db.add(record)
    await db.commit()
    # Reload so the generated id is available for the response.
    await db.refresh(record)
    return {"id": record.id, "message": "신호 링크 생성 완료"}
@router.get("/root-cause/{trace_id}", summary="AI 기반 루트코즈 분석")
async def root_cause_analysis(
    trace_id: str,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return a root-cause analysis for one of the caller's traces.

    A previously stored analysis is returned with ``cached: True``. Otherwise
    the trace is summarised into a prompt for the Ollama helper. When that
    helper reports failure, a heuristic message based on the first error span
    or the slowest span is returned instead.

    Only a successful AI analysis is stored on the trace. The heuristic
    fallback is not persisted, so a temporary AI outage does not become the
    permanent cached answer.

    Raises:
        HTTPException: 404 when the tenant has no trace with this id.
    """
    tid = _tenant(current_user)
    trace = (await db.execute(
        select(OtelTrace).where(OtelTrace.tenant_id == tid, OtelTrace.trace_id == trace_id)
    )).scalar_one_or_none()
    if not trace:
        raise HTTPException(404, "트레이스를 찾을 수 없습니다")
    if trace.root_cause:
        return {"trace_id": trace_id, "root_cause": trace.root_cause, "cached": True}

    spans = trace.spans or []
    # Spans are free-form: "status" may be null or not a dict, so check the
    # type before reading the code.
    error_spans = [
        s for s in spans
        if isinstance(s.get("status"), dict) and s["status"].get("code") == 2
    ]
    # An empty trace yields {} so the name lookups below fall back to
    # 'unknown'.
    slowest = max(
        spans,
        key=lambda s: s.get("end_time_unix_nano", 0) - s.get("start_time_unix_nano", 0),
        default={},
    )
    prompt = (
        f"OpenTelemetry 트레이스 분석:\n"
        f"서비스: {trace.service_name}\n"
        f"상태: {trace.status}\n"
        f"총 스팬: {len(spans)}\n"
        f"오류 스팬: {len(error_spans)}\n"
        f"가장 느린 스팬: {slowest.get('name', 'unknown')}\n"
        f"루트코즈와 해결 방법을 간단히 설명하시오."
    )
    analysis = await _ollama(prompt)

    # The helper signals failure through a sentinel message, not an exception.
    if "AI 분석 불가" in analysis:
        if error_spans:
            analysis = f"오류 스팬 감지: {error_spans[0].get('name', 'unknown')} — 해당 서비스의 오류 로그를 확인하세요."
        else:
            analysis = f"가장 느린 스팬 '{slowest.get('name', 'unknown')}' 성능 병목 가능성이 있습니다."
    else:
        trace.root_cause = analysis
        await db.commit()
    return {"trace_id": trace_id, "root_cause": analysis, "cached": False}