"""Observability API: trace ingestion, service map, SLO budgets and root-cause analysis.

All endpoints are scoped to the caller's tenant (see ``_tenant``).
"""
from __future__ import annotations

import logging
from datetime import datetime, timedelta
from typing import Any, Optional, List

import httpx
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import select, delete
from sqlalchemy.ext.asyncio import AsyncSession

from core.auth import get_current_user
from database import get_db
from models import User, OtelTraceExt as OtelTrace, SLODefinition, SignalLink

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/observability", tags=["Observability"])

_OLLAMA_URL = "http://localhost:11434/api/generate"
_OLLAMA_MODEL = "llama3"
# Sentinel returned by ``_ollama`` when the model could not be reached.
_OLLAMA_FAILURE = "AI 분석 불가 (Ollama 연결 실패)"


def _tenant(user: User) -> str:
    """Return the tenant key for *user*: ``inst_code`` if set, else the user id."""
    return user.inst_code or str(user.id)


async def _ollama(prompt: str) -> str:
    """Send *prompt* to the local Ollama server and return its ``response`` text.

    Best-effort: any failure (connection error, non-2xx status, malformed
    body) is logged and reported by returning ``_OLLAMA_FAILURE`` instead of
    raising, so callers can fall back to a heuristic answer.
    """
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(
                _OLLAMA_URL,
                json={"model": _OLLAMA_MODEL, "prompt": prompt, "stream": False},
            )
            # Without this, an error payload such as {"error": "..."} would be
            # reported as a successful "no result" analysis.
            resp.raise_for_status()
            return resp.json().get("response", "분석 결과 없음")
    except Exception:  # deliberate catch-all: this helper must never raise
        logger.warning("Ollama request failed", exc_info=True)
        return _OLLAMA_FAILURE


def _as_int(value: Any) -> Optional[int]:
    """Coerce a span timestamp to ``int``; return ``None`` if absent or malformed.

    Accepts ints as well as numeric strings, since JSON producers may encode
    64-bit nanosecond timestamps as strings.
    """
    if value is None or isinstance(value, bool):
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _span_duration_ns(span: dict[str, Any]) -> int:
    """Return ``end - start`` in nanoseconds, or 0 when either bound is unusable."""
    start = _as_int(span.get("start_time_unix_nano"))
    end = _as_int(span.get("end_time_unix_nano"))
    if start is None or end is None:
        return 0
    return end - start


def _trace_duration_ms(spans: list[dict[str, Any]]) -> Optional[float]:
    """Return the wall-clock extent of *spans* in milliseconds.

    Only timestamps that are actually present and parseable are considered;
    ``None`` is returned when no start or no end timestamp is available.
    """
    starts: list[int] = []
    ends: list[int] = []
    for span in spans:
        start = _as_int(span.get("start_time_unix_nano"))
        end = _as_int(span.get("end_time_unix_nano"))
        if start is not None:
            starts.append(start)
        if end is not None:
            ends.append(end)
    if not starts or not ends:
        return None
    return (max(ends) - min(starts)) / 1_000_000


def _is_error_span(span: dict[str, Any]) -> bool:
    """Return True when the span's ``status.code`` equals 2.

    Tolerates a ``status`` value that is missing or not a dict.
    """
    status = span.get("status")
    return isinstance(status, dict) and status.get("code") == 2


def _build_span_tree(spans: list[dict[str, Any]]) -> list[Any]:
    """Arrange *spans* into a parent/child forest and return the root nodes.

    Each span that has a ``span_id`` is copied with an added ``children``
    list. A span is a root when it has no ``parent_span_id`` or when its
    parent is not part of this trace, so orphans stay visible instead of
    being dropped.
    """
    nodes: dict[str, Any] = {}
    for span in spans:
        span_id = span.get("span_id")
        if span_id:
            nodes[span_id] = {**span, "children": []}

    roots: list[Any] = []
    for span in spans:
        # Spans without a span_id have no node; fall back to the raw dict.
        node = nodes.get(span.get("span_id", ""), span)
        parent_id = span.get("parent_span_id")
        if parent_id and parent_id in nodes:
            nodes[parent_id]["children"].append(node)
        else:
            roots.append(node)
    return roots


async def _find_trace(
    db: AsyncSession, tenant_id: str, trace_id: str
) -> Optional[OtelTrace]:
    """Return the most recently created trace row for the tenant, or ``None``.

    Uses ``first()`` on a newest-first query rather than
    ``scalar_one_or_none()`` so that a ``trace_id`` ingested more than once
    does not raise ``MultipleResultsFound``.
    """
    stmt = (
        select(OtelTrace)
        .where(OtelTrace.tenant_id == tenant_id, OtelTrace.trace_id == trace_id)
        .order_by(OtelTrace.created_at.desc())
        .limit(1)
    )
    return (await db.execute(stmt)).scalars().first()


# ── Pydantic schemas ──────────────────────────────────────────────────────────

class TraceIn(BaseModel):
    trace_id: str
    service_name: Optional[str] = None
    spans: list[dict[str, Any]] = []
    status: Optional[str] = "OK"


class TraceOut(BaseModel):
    model_config = {"from_attributes": True}
    id: int
    trace_id: str
    service_name: Optional[str]
    duration_ms: Optional[float]
    status: Optional[str]
    created_at: datetime


class SLOIn(BaseModel):
    service_name: str
    metric_type: str  # availability | latency | error_rate
    target_pct: float = 99.9
    window_days: int = 30


class SLOOut(BaseModel):
    model_config = {"from_attributes": True}
    id: int
    service_name: Optional[str]
    metric_type: Optional[str]
    target_pct: Optional[float]
    window_days: Optional[int]
    current_value: Optional[float]
    error_budget_pct: Optional[float]
    created_at: datetime


class SignalLinkIn(BaseModel):
    trace_id: Optional[str] = None
    metric_ref: Optional[str] = None
    log_ref: Optional[str] = None
    service_name: Optional[str] = None
    correlation: Optional[dict] = None


# ── Endpoints ─────────────────────────────────────────────────────────────────

@router.post("/traces", summary="분산 트레이스 수집")
async def ingest_trace(
    body: TraceIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Store one trace for the caller's tenant and return its database id.

    ``duration_ms`` is derived from the earliest start and latest end among
    the submitted spans; it is left ``None`` when no usable timestamps exist.
    """
    tid = _tenant(current_user)
    trace = OtelTrace(
        tenant_id=tid,
        trace_id=body.trace_id,
        service_name=body.service_name,
        spans=body.spans,
        duration_ms=_trace_duration_ms(body.spans),
        status=body.status,
    )
    db.add(trace)
    await db.commit()
    await db.refresh(trace)
    return {"id": trace.id, "trace_id": trace.trace_id, "message": "수집 완료"}


@router.get("/traces", summary="트레이스 목록 조회")
async def list_traces(
    service_name: Optional[str] = None,
    status: Optional[str] = None,
    hours: int = Query(24, ge=1, le=720),
    limit: int = Query(50, ge=1, le=500),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List the tenant's traces from the last *hours*, newest first.

    Optional ``service_name`` and ``status`` filters are exact matches; at
    most *limit* rows are returned.
    """
    tid = _tenant(current_user)
    # NOTE(review): naive UTC timestamp; presumably ``created_at`` is stored
    # as naive UTC as well — confirm against the model.
    since = datetime.utcnow() - timedelta(hours=hours)
    q = select(OtelTrace).where(
        OtelTrace.tenant_id == tid,
        OtelTrace.created_at >= since,
    )
    if service_name:
        q = q.where(OtelTrace.service_name == service_name)
    if status:
        q = q.where(OtelTrace.status == status)
    q = q.order_by(OtelTrace.created_at.desc()).limit(limit)
    rows = (await db.execute(q)).scalars().all()
    return [TraceOut.model_validate(r) for r in rows]


@router.get("/traces/{trace_id}", summary="트레이스 상세 (스팬 트리)")
async def get_trace(
    trace_id: str,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return one trace with its spans arranged as a tree.

    Raises:
        HTTPException: 404 when the tenant has no trace with this id.
    """
    tid = _tenant(current_user)
    row = await _find_trace(db, tid, trace_id)
    if not row:
        raise HTTPException(404, "트레이스를 찾을 수 없습니다")
    return {
        "trace_id": row.trace_id,
        "service_name": row.service_name,
        "duration_ms": row.duration_ms,
        "status": row.status,
        "span_tree": _build_span_tree(row.spans or []),
    }


@router.get("/service-map", summary="서비스 의존성 자동 맵핑")
async def service_map(
    hours: int = Query(24, ge=1, le=720),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Derive a service dependency graph from the tenant's recent traces.

    Nodes carry per-service span and error counts. An edge is counted for
    every parent/child span pair whose services differ.
    """
    tid = _tenant(current_user)
    since = datetime.utcnow() - timedelta(hours=hours)
    rows = (await db.execute(
        select(OtelTrace).where(
            OtelTrace.tenant_id == tid,
            OtelTrace.created_at >= since,
        )
    )).scalars().all()

    nodes: dict[str, dict] = {}
    edges: dict[tuple, dict] = {}
    for trace in rows:
        spans = trace.spans or []
        # span_id -> service name, used to resolve both ends of each edge.
        span_svc: dict[str, str] = {}
        for s in spans:
            sid = s.get("span_id")
            svc = s.get("service_name") or trace.service_name or "unknown"
            if sid:
                span_svc[sid] = svc
            node = nodes.setdefault(
                svc, {"name": svc, "call_count": 0, "error_count": 0}
            )
            node["call_count"] += 1
            if _is_error_span(s):
                node["error_count"] += 1

        for s in spans:
            pid = s.get("parent_span_id")
            sid = s.get("span_id")
            if not (pid and sid):
                continue
            src = span_svc.get(pid)
            dst = span_svc.get(sid)
            if src and dst and src != dst:
                edge = edges.setdefault(
                    (src, dst), {"source": src, "target": dst, "call_count": 0}
                )
                edge["call_count"] += 1

    return {
        "nodes": list(nodes.values()),
        "edges": list(edges.values()),
        "period_hours": hours,
    }


@router.post("/slo", summary="SLO 정의 등록")
async def create_slo(
    body: SLOIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Register an SLO definition for the caller's tenant.

    Raises:
        HTTPException: 400 when ``metric_type`` is not one of the supported
            values.
    """
    valid_types = {"availability", "latency", "error_rate"}
    if body.metric_type not in valid_types:
        raise HTTPException(400, f"metric_type은 {valid_types} 중 하나여야 합니다")
    tid = _tenant(current_user)
    slo = SLODefinition(
        tenant_id=tid,
        service_name=body.service_name,
        metric_type=body.metric_type,
        target_pct=body.target_pct,
        window_days=body.window_days,
    )
    db.add(slo)
    await db.commit()
    await db.refresh(slo)
    return SLOOut.model_validate(slo)


@router.get("/slo", summary="SLO 현황 목록")
async def list_slos(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return every SLO definition owned by the caller's tenant."""
    tid = _tenant(current_user)
    rows = (await db.execute(
        select(SLODefinition).where(SLODefinition.tenant_id == tid)
    )).scalars().all()
    return [SLOOut.model_validate(r) for r in rows]


@router.get("/slo/{slo_id}/budget", summary="에러 예산 소진율 조회")
async def slo_budget(
    slo_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Compute how much of the SLO's error budget remains.

    The remaining percentage is also written back to ``error_budget_pct``
    and committed.

    Raises:
        HTTPException: 404 when the tenant has no SLO with this id.
    """
    tid = _tenant(current_user)
    slo = (await db.execute(
        select(SLODefinition).where(
            SLODefinition.tenant_id == tid, SLODefinition.id == slo_id
        )
    )).scalar_one_or_none()
    if not slo:
        raise HTTPException(404, "SLO를 찾을 수 없습니다")

    # Only fall back to the target when nothing has been measured yet; a
    # measured 0.0 is a real value and must not be treated as "missing".
    current = slo.current_value if slo.current_value is not None else slo.target_pct
    allowed_error = 100.0 - slo.target_pct
    actual_error = max(0.0, 100.0 - current)
    # A 100% target leaves no budget to consume; avoid dividing by zero.
    consumed_pct = (actual_error / allowed_error * 100) if allowed_error > 0 else 0.0
    remaining_pct = max(0.0, 100.0 - consumed_pct)

    slo.error_budget_pct = remaining_pct
    await db.commit()

    if remaining_pct > 25:
        status = "HEALTHY"
    elif remaining_pct > 0:
        status = "WARNING"
    else:
        status = "EXHAUSTED"

    return {
        "slo_id": slo_id,
        "service_name": slo.service_name,
        "target_pct": slo.target_pct,
        "current_value": current,
        "error_budget_total_pct": allowed_error,
        "error_budget_consumed_pct": round(consumed_pct, 2),
        "error_budget_remaining_pct": round(remaining_pct, 2),
        "status": status,
    }


@router.post("/metrics-logs-link", summary="메트릭-로그-트레이스 상관관계 링크")
async def create_signal_link(
    body: SignalLinkIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Persist a link between a trace, a metric reference and a log reference."""
    tid = _tenant(current_user)
    link = SignalLink(
        tenant_id=tid,
        trace_id=body.trace_id,
        metric_ref=body.metric_ref,
        log_ref=body.log_ref,
        service_name=body.service_name,
        correlation=body.correlation,
    )
    db.add(link)
    await db.commit()
    await db.refresh(link)
    return {"id": link.id, "message": "신호 링크 생성 완료"}


@router.get("/root-cause/{trace_id}", summary="AI 기반 루트코즈 분석")
async def root_cause_analysis(
    trace_id: str,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return a root-cause analysis for a trace, using the stored one if present.

    When the model cannot be reached, a heuristic message based on the first
    error span (or the slowest span) is returned. That fallback is not
    persisted, so a later request can still obtain and cache a real analysis.

    Raises:
        HTTPException: 404 when the tenant has no trace with this id.
    """
    tid = _tenant(current_user)
    trace = await _find_trace(db, tid, trace_id)
    if not trace:
        raise HTTPException(404, "트레이스를 찾을 수 없습니다")

    if trace.root_cause:
        return {"trace_id": trace_id, "root_cause": trace.root_cause, "cached": True}

    spans = trace.spans or []
    error_spans = [s for s in spans if _is_error_span(s)]
    slowest = max(spans, key=_span_duration_ns, default={})

    prompt = (
        f"OpenTelemetry 트레이스 분석:\n"
        f"서비스: {trace.service_name}\n"
        f"상태: {trace.status}\n"
        f"총 스팬: {len(spans)}개\n"
        f"오류 스팬: {len(error_spans)}개\n"
        f"가장 느린 스팬: {slowest.get('name', 'unknown')}\n"
        f"루트코즈와 해결 방법을 간단히 설명하시오."
    )
    analysis = await _ollama(prompt)

    if analysis == _OLLAMA_FAILURE:
        if error_spans:
            analysis = f"오류 스팬 감지: {error_spans[0].get('name', 'unknown')} — 해당 서비스의 오류 로그를 확인하세요."
        else:
            analysis = f"가장 느린 스팬 '{slowest.get('name', 'unknown')}' 성능 병목 가능성이 있습니다."
        # Do not cache the heuristic: it would permanently mask the real
        # analysis once the model is reachable again.
        return {"trace_id": trace_id, "root_cause": analysis, "cached": False}

    trace.root_cause = analysis
    await db.commit()
    return {"trace_id": trace_id, "root_cause": analysis, "cached": False}