351 lines
12 KiB
Python
351 lines
12 KiB
Python
from __future__ import annotations
|
|
|
|
from datetime import datetime, timedelta
|
|
from typing import Any, Optional, List
|
|
|
|
import httpx
|
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import select, delete
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from core.auth import get_current_user
|
|
from database import get_db
|
|
from models import User, OtelTrace, SLODefinition, SignalLink
|
|
|
|
# Router that collects every endpoint in this module; all routes are served
# under the /api/observability prefix and tagged "Observability".
router = APIRouter(prefix="/api/observability", tags=["Observability"])
|
|
|
|
|
|
def _tenant(user: User) -> str:
|
|
return user.inst_code or str(user.id)
|
|
|
|
|
|
async def _ollama(prompt: str) -> str:
    """Ask the local Ollama server to complete ``prompt`` and return its text.

    Sends a single non-streaming ``/api/generate`` request for the ``llama3``
    model with a 30 second timeout.

    Args:
        prompt: The full prompt text to send.

    Returns:
        The ``response`` field of the JSON reply, ``"분석 결과 없음"`` when a
        successful reply has no such field, or a message starting with
        ``"AI 분석 불가"`` when the request fails for any ordinary reason
        (connection error, timeout, non-2xx status, malformed JSON).
    """
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            reply = await client.post(
                "http://localhost:11434/api/generate",
                json={"model": "llama3", "prompt": prompt, "stream": False},
            )
            # A 4xx/5xx reply carries an error body without a "response" key;
            # without this check it would be reported as a successful but
            # empty analysis instead of a failure.
            reply.raise_for_status()
            return reply.json().get("response", "분석 결과 없음")
    except Exception:
        # Deliberately broad: this helper is a best-effort boundary and
        # signals failure through its return value rather than by raising.
        return "AI 분석 불가 (Ollama 연결 실패)"
|
|
|
|
|
|
# ── Pydantic schemas ───────────────────────────────────────────────────────────
|
|
|
|
# Request body for trace ingestion (POST /traces).
class TraceIn(BaseModel):
    # Identifier of the reported trace; the only required field.
    trace_id: str
    # Name of the service that reported the trace, if known.
    service_name: Optional[str] = None
    # Raw span dictionaries; the endpoints in this module read keys such as
    # span_id, parent_span_id, start_time_unix_nano, end_time_unix_nano,
    # service_name, name and status from them.
    spans: list[dict[str, Any]] = []
    # Overall trace status; defaults to "OK".
    status: Optional[str] = "OK"
|
|
|
|
# Summary view of a stored trace, as returned by the trace list endpoint.
class TraceOut(BaseModel):
    # Allows validation directly from ORM objects via attribute access.
    model_config = {"from_attributes": True}

    # Primary key of the stored row.
    id: int
    # Identifier of the trace as reported at ingestion.
    trace_id: str
    service_name: Optional[str]
    # Duration in milliseconds computed at ingestion; may be absent.
    duration_ms: Optional[float]
    status: Optional[str]
    # Creation timestamp of the stored row.
    created_at: datetime
|
|
|
|
# Request body for registering an SLO definition (POST /slo).
class SLOIn(BaseModel):
    # Service the objective applies to.
    service_name: str
    # One of: availability | latency | error_rate
    metric_type: str
    # Target expressed as a percentage; defaults to 99.9.
    target_pct: float = 99.9
    # Evaluation window length in days; defaults to 30.
    window_days: int = 30
|
|
|
|
# Response shape for a stored SLO definition (used by the create and list
# SLO endpoints).
class SLOOut(BaseModel):
    # Allows validation directly from ORM objects via attribute access.
    model_config = {"from_attributes": True}

    # Primary key of the stored row.
    id: int
    service_name: Optional[str]
    metric_type: Optional[str]
    target_pct: Optional[float]
    window_days: Optional[int]
    # Most recent measured value for the objective, when one has been stored.
    current_value: Optional[float]
    # Remaining error budget in percent, as last written by the budget endpoint.
    error_budget_pct: Optional[float]
    # Creation timestamp of the stored row.
    created_at: datetime
|
|
|
|
# Request body for linking a trace, a metric and a log entry
# (POST /metrics-logs-link). Every field is optional.
class SignalLinkIn(BaseModel):
    # Identifier of the trace taking part in the link.
    trace_id: Optional[str] = None
    # Caller-defined reference to a metric.
    metric_ref: Optional[str] = None
    # Caller-defined reference to a log entry.
    log_ref: Optional[str] = None
    service_name: Optional[str] = None
    # Free-form correlation details, stored as given.
    correlation: Optional[dict] = None
|
|
|
|
|
|
# ── Endpoints ──────────────────────────────────────────────────────────────────
|
|
|
|
def _trace_duration_ms(spans: list[dict[str, Any]]) -> Optional[float]:
    """Return the time covered by ``spans`` in milliseconds, or ``None``.

    The duration is the latest ``end_time_unix_nano`` minus the earliest
    ``start_time_unix_nano`` across all spans. Timestamps that are missing or
    cannot be converted to an integer are ignored rather than treated as 0,
    which would otherwise yield a duration measured from the Unix epoch.
    Numeric strings are accepted.

    Returns ``None`` when no usable start or no usable end timestamp exists.
    """
    starts: list[int] = []
    ends: list[int] = []
    for span in spans:
        for key, bucket in (
            ("start_time_unix_nano", starts),
            ("end_time_unix_nano", ends),
        ):
            raw = span.get(key)
            if raw is None:
                continue
            try:
                bucket.append(int(raw))
            except (TypeError, ValueError):
                # Unusable timestamp: skip it instead of failing the ingest.
                continue
    if not starts or not ends:
        return None
    return (max(ends) - min(starts)) / 1_000_000


# Store one reported trace for the caller's tenant.
#
# The duration is derived from the span timestamps (see _trace_duration_ms)
# and is left empty when it cannot be determined. Responds with the new row
# id, the trace id and a confirmation message.
@router.post("/traces", summary="분산 트레이스 수집")
async def ingest_trace(
    body: TraceIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    trace = OtelTrace(
        tenant_id=_tenant(current_user),
        trace_id=body.trace_id,
        service_name=body.service_name,
        spans=body.spans,
        duration_ms=_trace_duration_ms(body.spans),
        status=body.status,
    )
    db.add(trace)
    await db.commit()
    # Reload so database-generated values (such as the id) are populated.
    await db.refresh(trace)
    return {"id": trace.id, "trace_id": trace.trace_id, "message": "수집 완료"}
|
|
|
|
|
|
# List the caller's traces created within the last `hours` hours, newest
# first, optionally narrowed by exact service name and/or status, and capped
# at `limit` rows. Each row is returned in the TraceOut shape.
@router.get("/traces", summary="트레이스 목록 조회")
async def list_traces(
    service_name: Optional[str] = None,
    status: Optional[str] = None,
    hours: int = Query(24, ge=1, le=720),
    limit: int = Query(50, ge=1, le=500),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tenant = _tenant(current_user)
    cutoff = datetime.utcnow() - timedelta(hours=hours)

    # Mandatory filters first; optional ones are added only when supplied.
    conditions = [
        OtelTrace.tenant_id == tenant,
        OtelTrace.created_at >= cutoff,
    ]
    if service_name:
        conditions.append(OtelTrace.service_name == service_name)
    if status:
        conditions.append(OtelTrace.status == status)

    stmt = (
        select(OtelTrace)
        .where(*conditions)
        .order_by(OtelTrace.created_at.desc())
        .limit(limit)
    )
    result = await db.execute(stmt)
    return [TraceOut.model_validate(row) for row in result.scalars().all()]
|
|
|
|
|
|
def _build_span_tree(spans: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Arrange flat span dictionaries into a list of root nodes.

    Every span that has a ``span_id`` is copied and given a ``children`` list;
    spans without one are passed through unchanged. A span is attached to its
    parent when ``parent_span_id`` names another span in the same list.
    Otherwise it becomes a root. This includes spans whose parent is not part
    of the list, so they stay visible instead of being dropped, and spans that
    name themselves as parent, which would otherwise create a cycle.
    """
    nodes: dict[Any, dict[str, Any]] = {}
    for span in spans:
        span_id = span.get("span_id")
        if span_id:
            nodes[span_id] = {**span, "children": []}

    roots: list[dict[str, Any]] = []
    for span in spans:
        node = nodes.get(span.get("span_id"), span)
        parent = nodes.get(span.get("parent_span_id"))
        if parent is not None and parent is not node:
            parent["children"].append(node)
        else:
            roots.append(node)
    return roots


# Return one of the caller's traces together with its spans arranged as a
# parent/child tree. Responds with 404 when the tenant has no such trace.
@router.get("/traces/{trace_id}", summary="트레이스 상세 (스팬 트리)")
async def get_trace(
    trace_id: str,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    # The ingestion endpoint inserts a new row on every call, so the same
    # trace_id may exist more than once; take the newest row instead of
    # failing on multiple matches.
    stmt = (
        select(OtelTrace)
        .where(OtelTrace.tenant_id == tid, OtelTrace.trace_id == trace_id)
        .order_by(OtelTrace.created_at.desc())
        .limit(1)
    )
    row = (await db.execute(stmt)).scalars().first()
    if not row:
        raise HTTPException(404, "트레이스를 찾을 수 없습니다")

    return {
        "trace_id": row.trace_id,
        "service_name": row.service_name,
        "duration_ms": row.duration_ms,
        "status": row.status,
        "span_tree": _build_span_tree(row.spans or []),
    }
|
|
|
|
|
|
# Build a service dependency graph from the caller's traces created within
# the last `hours` hours.
#
# Nodes: one per service name, with the number of spans seen ("call_count")
# and how many of them carried status code 2 ("error_count"). A span's
# service is its own service_name, else the trace's, else "unknown".
# Edges: one per (parent service, child service) pair where a span's parent
# belongs to a different service within the same trace.
@router.get("/service-map", summary="서비스 의존성 자동 맵핑")
async def service_map(
    hours: int = Query(24, ge=1, le=720),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    since = datetime.utcnow() - timedelta(hours=hours)
    rows = (await db.execute(
        select(OtelTrace).where(
            OtelTrace.tenant_id == tid,
            OtelTrace.created_at >= since,
        )
    )).scalars().all()

    nodes: dict[str, dict] = {}
    edges: dict[tuple, dict] = {}

    for trace in rows:
        spans = trace.spans or []

        # First pass: per-service counters and a span_id -> service lookup
        # that is local to this trace.
        span_svc: dict[str, str] = {}
        for span in spans:
            svc = span.get("service_name") or trace.service_name or "unknown"
            span_id = span.get("span_id")
            if span_id:
                span_svc[span_id] = svc

            node = nodes.setdefault(
                svc, {"name": svc, "call_count": 0, "error_count": 0}
            )
            node["call_count"] += 1

            # "status" may be absent, null or not a mapping; only a mapping
            # with code 2 counts as an error. Chaining .get() on the raw
            # value would raise for a null or string status.
            status = span.get("status")
            if isinstance(status, dict) and status.get("code") == 2:
                node["error_count"] += 1

        # Second pass: cross-service parent -> child calls. Lookups miss
        # (return None) when either id is absent or unknown in this trace.
        for span in spans:
            src = span_svc.get(span.get("parent_span_id"))
            dst = span_svc.get(span.get("span_id"))
            if src and dst and src != dst:
                edge = edges.setdefault(
                    (src, dst), {"source": src, "target": dst, "call_count": 0}
                )
                edge["call_count"] += 1

    return {
        "nodes": list(nodes.values()),
        "edges": list(edges.values()),
        "period_hours": hours,
    }
|
|
|
|
|
|
# Register an SLO definition for the caller's tenant and return it in the
# SLOOut shape.
#
# Responds with 400 when metric_type is not one of the supported values, when
# target_pct is outside (0, 100], or when window_days is below 1. Values
# outside those ranges cannot produce a meaningful error budget.
@router.post("/slo", summary="SLO 정의 등록")
async def create_slo(
    body: SLOIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # A tuple keeps the order in the error message stable; interpolating a
    # set printed the names in an arbitrary order.
    valid_types = ("availability", "latency", "error_rate")
    if body.metric_type not in valid_types:
        raise HTTPException(
            400, f"metric_type은 {', '.join(valid_types)} 중 하나여야 합니다"
        )
    # Written as a negated chained comparison so NaN is rejected as well.
    if not 0 < body.target_pct <= 100:
        raise HTTPException(400, "target_pct는 0 초과 100 이하여야 합니다")
    if body.window_days < 1:
        raise HTTPException(400, "window_days는 1 이상이어야 합니다")

    slo = SLODefinition(
        tenant_id=_tenant(current_user),
        service_name=body.service_name,
        metric_type=body.metric_type,
        target_pct=body.target_pct,
        window_days=body.window_days,
    )
    db.add(slo)
    await db.commit()
    # Reload so database-generated values (id, created_at) are populated.
    await db.refresh(slo)
    return SLOOut.model_validate(slo)
|
|
|
|
|
|
# Return every SLO definition that belongs to the caller's tenant, each in
# the SLOOut shape.
@router.get("/slo", summary="SLO 현황 목록")
async def list_slos(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tenant = _tenant(current_user)
    stmt = select(SLODefinition).where(SLODefinition.tenant_id == tenant)
    result = await db.execute(stmt)
    return [SLOOut.model_validate(slo) for slo in result.scalars().all()]
|
|
|
|
|
|
def _error_budget(
    target_pct: float, current_value: Optional[float]
) -> tuple[float, float, float, float]:
    """Compute error-budget figures for one SLO.

    Args:
        target_pct: The objective, in percent.
        current_value: The measured value, in percent, or ``None`` when no
            measurement is stored. In that case the target itself is used,
            meaning no budget has been consumed.

    Returns:
        ``(current, allowed_error, consumed_pct, remaining_pct)`` where
        ``allowed_error`` is ``100 - target_pct``, ``consumed_pct`` is the
        share of that allowance used by ``100 - current``, and
        ``remaining_pct`` is what is left, never below 0.
    """
    # Compare against None explicitly: a measured value of 0.0 (total outage)
    # is real data and must not fall back to the target.
    current = target_pct if current_value is None else current_value
    allowed_error = 100.0 - target_pct
    actual_error = max(0.0, 100.0 - current)
    if allowed_error > 0:
        consumed_pct = actual_error / allowed_error * 100
    else:
        # No allowance at all (target of 100%): any error exhausts the budget.
        consumed_pct = 100.0 if actual_error > 0 else 0.0
    remaining_pct = max(0.0, 100.0 - consumed_pct)
    return current, allowed_error, consumed_pct, remaining_pct


# Report how much of an SLO's error budget has been consumed.
#
# Responds with 404 when the tenant has no SLO with this id. As a side effect
# the remaining budget is written back to the SLO row and committed. The
# status is HEALTHY above 25% remaining, WARNING above 0%, otherwise
# EXHAUSTED.
@router.get("/slo/{slo_id}/budget", summary="에러 예산 소진율 조회")
async def slo_budget(
    slo_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    slo = (await db.execute(
        select(SLODefinition).where(
            SLODefinition.tenant_id == tid, SLODefinition.id == slo_id
        )
    )).scalar_one_or_none()
    if not slo:
        raise HTTPException(404, "SLO를 찾을 수 없습니다")

    current, allowed_error, consumed_pct, remaining_pct = _error_budget(
        slo.target_pct, slo.current_value
    )

    slo.error_budget_pct = remaining_pct
    await db.commit()

    if remaining_pct > 25:
        status = "HEALTHY"
    elif remaining_pct > 0:
        status = "WARNING"
    else:
        status = "EXHAUSTED"

    return {
        "slo_id": slo_id,
        "service_name": slo.service_name,
        "target_pct": slo.target_pct,
        "current_value": current,
        "error_budget_total_pct": allowed_error,
        "error_budget_consumed_pct": round(consumed_pct, 2),
        "error_budget_remaining_pct": round(remaining_pct, 2),
        "status": status,
    }
|
|
|
|
|
|
# Store a link between a trace, a metric reference and a log reference for
# the caller's tenant. Responds with the new row id and a confirmation
# message.
@router.post("/metrics-logs-link", summary="메트릭-로그-트레이스 상관관계 링크")
async def create_signal_link(
    body: SignalLinkIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tenant = _tenant(current_user)
    # Values copied one-to-one from the request body.
    payload = {
        "trace_id": body.trace_id,
        "metric_ref": body.metric_ref,
        "log_ref": body.log_ref,
        "service_name": body.service_name,
        "correlation": body.correlation,
    }
    record = SignalLink(tenant_id=tenant, **payload)
    db.add(record)
    await db.commit()
    # Reload so the database-generated id is populated.
    await db.refresh(record)
    return {"id": record.id, "message": "신호 링크 생성 완료"}
|
|
|
|
|
|
def _span_elapsed_ns(span: dict[str, Any]) -> int:
    """Return ``end_time_unix_nano - start_time_unix_nano`` for one span.

    Numeric strings are accepted. When either timestamp is missing or cannot
    be converted to an integer the span is given an elapsed time of 0, so it
    is never picked as the slowest span on the strength of bad data.
    """
    try:
        return int(span["end_time_unix_nano"]) - int(span["start_time_unix_nano"])
    except (KeyError, TypeError, ValueError):
        return 0


# Produce a root-cause explanation for one of the caller's traces.
#
# A previously stored explanation is returned as-is ("cached": True).
# Otherwise a summary of the trace is sent to the local Ollama model. When
# the model is unavailable a simple heuristic message is returned instead,
# based on the first error span or the slowest span. Responds with 404 when
# the tenant has no such trace.
@router.get("/root-cause/{trace_id}", summary="AI 기반 루트코즈 분석")
async def root_cause_analysis(
    trace_id: str,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    # The ingestion endpoint inserts a new row on every call, so the same
    # trace_id may exist more than once; take the newest row instead of
    # failing on multiple matches.
    stmt = (
        select(OtelTrace)
        .where(OtelTrace.tenant_id == tid, OtelTrace.trace_id == trace_id)
        .order_by(OtelTrace.created_at.desc())
        .limit(1)
    )
    trace = (await db.execute(stmt)).scalars().first()
    if not trace:
        raise HTTPException(404, "트레이스를 찾을 수 없습니다")

    if trace.root_cause:
        return {"trace_id": trace_id, "root_cause": trace.root_cause, "cached": True}

    spans = trace.spans or []
    # "status" may be absent, null or not a mapping; only a mapping with
    # code 2 counts as an error.
    error_spans = [
        s for s in spans
        if isinstance(s.get("status"), dict) and s["status"].get("code") == 2
    ]
    slowest = max(spans, key=_span_elapsed_ns, default={})

    prompt = (
        "OpenTelemetry 트레이스 분석:\n"
        f"서비스: {trace.service_name}\n"
        f"상태: {trace.status}\n"
        f"총 스팬: {len(spans)}개\n"
        f"오류 스팬: {len(error_spans)}개\n"
        f"가장 느린 스팬: {slowest.get('name', 'unknown')}\n"
        "루트코즈와 해결 방법을 간단히 설명하시오."
    )

    analysis = await _ollama(prompt)
    if "AI 분석 불가" not in analysis:
        # Only a genuine model answer is stored. Persisting the heuristic
        # fallback would make every later request return it as the cached
        # result and the model would never be asked again.
        trace.root_cause = analysis
        await db.commit()
    elif error_spans:
        analysis = f"오류 스팬 감지: {error_spans[0].get('name', 'unknown')} — 해당 서비스의 오류 로그를 확인하세요."
    else:
        analysis = f"가장 느린 스팬 '{slowest.get('name', 'unknown')}' 성능 병목 가능성이 있습니다."

    return {"trace_id": trace_id, "root_cause": analysis, "cached": False}
|