"""
E-4: Grafana integration (Prometheus metrics).

Features:
  1. Expose metrics in the Prometheus text format (/api/metrics/prometheus)
  2. SR / incident / deployment / audit counters
  3. SLA compliance-rate gauge
  4. Server and user status gauges
  5. Grafana JSON datasource configuration guide
  6. Health-check endpoint

Endpoints:
  GET  /api/metrics/prometheus      - Prometheus text format
  GET  /api/metrics/summary         - JSON summary (for Grafana Simple JSON)
  GET  /api/metrics/health          - health check (UP/DOWN)
  GET  /api/metrics/grafana-config  - example Grafana datasource settings
  GET  /api/metrics/labels          - label list (Grafana Simple JSON /labels)
  POST /api/metrics/query           - Grafana Simple JSON /query
"""
from __future__ import annotations

import logging
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

from fastapi import APIRouter, Depends, Request
from fastapi.responses import PlainTextResponse, JSONResponse
from pydantic import BaseModel
from sqlalchemy import select, func, and_
from sqlalchemy.ext.asyncio import AsyncSession

from database import get_db
from models import User

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/metrics", tags=["metrics"])

# ── In-memory counters (reset whenever the process restarts) ────────────────
# NOTE(review): nothing in this module increments these; presumably a
# middleware elsewhere does - confirm.
_counters: Dict[str, float] = {
    "guardia_api_requests_total": 0,
    "guardia_api_errors_total": 0,
}
_start_time: float = time.time()

# ── Static lookup tables ────────────────────────────────────────────────────
# Characters that must be escaped inside a Prometheus label value.
_LABEL_VALUE_ESCAPES = str.maketrans({"\\": "\\\\", '"': '\\"', "\n": "\\n"})

# SR statuses that count as "not yet resolved" in the summary endpoint.
_OPEN_SR_STATUSES = ("OPEN", "IN_PROGRESS")

# "# HELP" text per base metric name.
_METRIC_HELP: Dict[str, str] = {
    "guardia_sr_total": "Total number of service requests",
    "guardia_sr_last_24h": "Service requests created in last 24 hours",
    "guardia_incidents_total": "Total number of incidents",
    "guardia_incidents_open": "Currently open incidents",
    "guardia_audit_events_total": "Total audit log events",
    "guardia_audit_critical_total": "Critical severity audit events",
    "guardia_users_total": "Total registered users",
    "guardia_users_active": "Currently active users",
    "guardia_capacity_plans_total": "Total capacity plans",
    "guardia_capacity_critical": "Capacity plans in critical/overload state",
    "guardia_servers_total": "Total managed servers",
    "guardia_process_uptime_seconds": "Process uptime in seconds",
    "guardia_api_requests_total": "Total API requests handled",
    "guardia_api_errors_total": "Total API errors",
}

# "# TYPE" per base metric name; names missing here are emitted as "gauge".
# NOTE(review): several "counter" entries are table row counts, which can
# decrease if rows are deleted - confirm "counter" is intended.
_METRIC_TYPE: Dict[str, str] = {
    "guardia_sr_total": "counter",
    "guardia_sr_last_24h": "gauge",
    "guardia_incidents_total": "counter",
    "guardia_incidents_open": "gauge",
    "guardia_audit_events_total": "counter",
    "guardia_audit_critical_total": "counter",
    "guardia_users_total": "gauge",
    "guardia_users_active": "gauge",
    "guardia_capacity_plans_total": "gauge",
    "guardia_capacity_critical": "gauge",
    "guardia_servers_total": "gauge",
    "guardia_process_uptime_seconds": "gauge",
    "guardia_api_requests_total": "counter",
    "guardia_api_errors_total": "counter",
}

# Grafana Simple JSON target name -> internal metric key.  Single source of
# truth for both /labels (the keys, in this order) and /query (the mapping).
_GRAFANA_TARGETS: Dict[str, str] = {
    "sr_total": "guardia_sr_total",
    "sr_last_24h": "guardia_sr_last_24h",
    "incidents_open": "guardia_incidents_open",
    "audit_critical": "guardia_audit_critical_total",
    "capacity_critical": "guardia_capacity_critical",
    "users_active": "guardia_users_active",
    "servers_total": "guardia_servers_total",
}


# ── Pydantic schemas ────────────────────────────────────────────────────────
class GrafanaQueryIn(BaseModel):
    """Request body accepted by POST /api/metrics/query."""

    range: Optional[Dict[str, Any]] = None
    targets: List[Dict[str, Any]] = []
    maxDataPoints: int = 100


# ── Metric collection helpers ───────────────────────────────────────────────
def _labelled_key(metric: str, label: str, value: Any, fallback: str) -> str:
    """Build a ``metric{label="value"}`` sample key.

    A falsy ``value`` is replaced by ``fallback``.  Backslash, double quote
    and newline are escaped so the key stays a valid Prometheus sample name
    whatever the database column contains.
    """
    rendered = f"{value or fallback}".translate(_LABEL_VALUE_ESCAPES)
    return f'{metric}{{{label}="{rendered}"}}'


async def _count(db: AsyncSession, statement: Any) -> int:
    """Execute ``statement`` and return its scalar result, or 0 if falsy."""
    return (await db.execute(statement)).scalar() or 0


async def _count_by(db: AsyncSession, column: Any) -> Any:
    """Return ``(column value, row count)`` rows grouped by ``column``."""
    grouped = select(column, func.count()).group_by(column)
    return (await db.execute(grouped)).all()


async def _collect_metrics(db: AsyncSession) -> Dict[str, Any]:
    """Aggregate metrics from the database and return them as a dictionary.

    Keys are Prometheus sample names, optionally with a ``{label="value"}``
    suffix.  Every section is best-effort: when a section raises, the error
    is logged at debug level and that section's headline metrics are
    reported as 0 so the endpoint keeps responding.
    """
    # NOTE(review): the session is not rolled back after a failed section; on
    # some backends a failed statement may poison later queries in the same
    # transaction - confirm against get_db's transaction handling.
    metrics: Dict[str, Any] = {}

    # ── SR metrics ──────────────────────────────────────────────────────
    try:
        from models import SRRequest

        metrics["guardia_sr_total"] = await _count(
            db, select(func.count()).select_from(SRRequest)
        )

        for status, cnt in await _count_by(db, SRRequest.status):
            key = _labelled_key("guardia_sr_by_status", "status", status, "UNKNOWN")
            metrics[key] = cnt

        for priority, cnt in await _count_by(db, SRRequest.priority):
            key = _labelled_key(
                "guardia_sr_by_priority", "priority", priority, "MEDIUM"
            )
            metrics[key] = cnt

        # SRs created within the last 24 hours.
        since_24h = datetime.utcnow() - timedelta(hours=24)
        metrics["guardia_sr_last_24h"] = await _count(
            db,
            select(func.count())
            .select_from(SRRequest)
            .where(SRRequest.created_at >= since_24h),
        )
    except Exception as e:
        logger.debug("SR 메트릭 수집 오류: %s", e)
        metrics["guardia_sr_total"] = 0

    # ── Incident metrics ────────────────────────────────────────────────
    try:
        from models import Incident

        metrics["guardia_incidents_total"] = await _count(
            db, select(func.count()).select_from(Incident)
        )
        metrics["guardia_incidents_open"] = await _count(
            db,
            select(func.count())
            .select_from(Incident)
            .where(Incident.status.in_(["OPEN", "IN_PROGRESS"])),
        )
    except Exception as e:
        logger.debug("인시던트 메트릭 수집 오류: %s", e)
        metrics["guardia_incidents_total"] = 0
        metrics["guardia_incidents_open"] = 0

    # ── Audit-log metrics ───────────────────────────────────────────────
    try:
        from models import AuditLog

        metrics["guardia_audit_events_total"] = await _count(
            db, select(func.count()).select_from(AuditLog)
        )
        metrics["guardia_audit_critical_total"] = await _count(
            db,
            select(func.count())
            .select_from(AuditLog)
            .where(AuditLog.severity == "CRITICAL"),
        )

        for sev, cnt in await _count_by(db, AuditLog.severity):
            key = _labelled_key("guardia_audit_by_severity", "severity", sev, "INFO")
            metrics[key] = cnt
    except Exception as e:
        logger.debug("감사 메트릭 수집 오류: %s", e)
        metrics["guardia_audit_events_total"] = 0
        metrics["guardia_audit_critical_total"] = 0

    # ── User metrics ────────────────────────────────────────────────────
    try:
        metrics["guardia_users_total"] = await _count(
            db, select(func.count()).select_from(User)
        )
        # ``== True`` is deliberate: it builds a SQL expression, not a bool.
        metrics["guardia_users_active"] = await _count(
            db,
            select(func.count())
            .select_from(User)
            .where(User.is_active == True),  # noqa: E712
        )
    except Exception as e:
        logger.debug("사용자 메트릭 수집 오류: %s", e)
        metrics["guardia_users_total"] = 0
        metrics["guardia_users_active"] = 0

    # ── Capacity-management metrics ─────────────────────────────────────
    try:
        from models import CapacityPlan

        metrics["guardia_capacity_plans_total"] = await _count(
            db, select(func.count()).select_from(CapacityPlan)
        )
        metrics["guardia_capacity_critical"] = await _count(
            db,
            select(func.count())
            .select_from(CapacityPlan)
            .where(CapacityPlan.status.in_(["CRITICAL", "OVERLOAD"])),
        )
    except Exception as e:
        logger.debug("용량 메트릭 수집 오류: %s", e)
        metrics["guardia_capacity_plans_total"] = 0
        metrics["guardia_capacity_critical"] = 0

    # ── Server / CMDB metrics ───────────────────────────────────────────
    try:
        from models import Server

        metrics["guardia_servers_total"] = await _count(
            db, select(func.count()).select_from(Server)
        )
    except Exception as e:
        # Logged like every other section instead of being dropped silently.
        logger.debug("서버 메트릭 수집 오류: %s", e)
        metrics["guardia_servers_total"] = 0

    # ── Process metrics ─────────────────────────────────────────────────
    metrics["guardia_process_uptime_seconds"] = round(time.time() - _start_time, 1)
    metrics["guardia_api_requests_total"] = _counters["guardia_api_requests_total"]
    metrics["guardia_api_errors_total"] = _counters["guardia_api_errors_total"]

    return metrics


def _to_prometheus_text(metrics: Dict[str, Any]) -> str:
    """Render a metrics dictionary in the Prometheus text exposition format.

    Samples are emitted in dictionary order.  The first time a base metric
    name (the key with any ``{...}`` label part removed) is seen, its
    ``# HELP`` line (when one is defined) and ``# TYPE`` line (defaulting to
    ``gauge``) are written before the sample.  Every sample carries the same
    millisecond timestamp taken at render time.

    Returns:
        The exposition text, always terminated by a newline.
    """
    lines: List[str] = []
    emitted_headers: set = set()
    ts_ms = int(time.time() * 1000)

    for key, value in metrics.items():
        # Base metric name: everything before the label block.
        base_name = key.split("{")[0]
        if base_name not in emitted_headers:
            if base_name in _METRIC_HELP:
                lines.append(f"# HELP {base_name} {_METRIC_HELP[base_name]}")
            lines.append(f"# TYPE {base_name} {_METRIC_TYPE.get(base_name, 'gauge')}")
            emitted_headers.add(base_name)
        lines.append(f"{key} {value} {ts_ms}")

    return "\n".join(lines) + "\n"


# ── Endpoints ───────────────────────────────────────────────────────────────
@router.get("/prometheus", response_class=PlainTextResponse)
async def prometheus_metrics(
    db: AsyncSession = Depends(get_db),
):
    """
    Prometheus scrape endpoint.

    Example Prometheus configuration:
      scrape_configs:
        - job_name: guardia_itsm
          static_configs:
            - targets: ['localhost:8000']
          metrics_path: /api/metrics/prometheus
    """
    metrics = await _collect_metrics(db)
    text = _to_prometheus_text(metrics)
    return PlainTextResponse(
        content=text,
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )


@router.get("/summary")
async def metrics_summary(
    db: AsyncSession = Depends(get_db),
):
    """Return a JSON metric summary (for the Grafana Simple JSON datasource)."""
    metrics = await _collect_metrics(db)

    # Keep only the plain metrics, i.e. keys without a label block.
    simple = {k: v for k, v in metrics.items() if "{" not in k}

    # Simplified SLA rate: share of SRs that are not OPEN / IN_PROGRESS.
    sr_total = simple.get("guardia_sr_total", 0)
    sr_open = sum(
        metrics.get(_labelled_key("guardia_sr_by_status", "status", status, ""), 0)
        for status in _OPEN_SR_STATUSES
    )
    # Clamp: if the SR total fell back to 0 after per-status counts were
    # collected, the difference would otherwise be negative.
    sr_resolved = max(sr_total - sr_open, 0)
    sla_rate = round(sr_resolved / sr_total * 100, 1) if sr_total > 0 else 100.0

    return {
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "uptime_seconds": simple.get("guardia_process_uptime_seconds", 0),
        "sr": {
            "total": simple.get("guardia_sr_total", 0),
            "last_24h": simple.get("guardia_sr_last_24h", 0),
            "open": sr_open,
            "resolved": sr_resolved,
            "sla_rate_pct": sla_rate,
        },
        "incidents": {
            "total": simple.get("guardia_incidents_total", 0),
            "open": simple.get("guardia_incidents_open", 0),
        },
        "security": {
            "audit_total": simple.get("guardia_audit_events_total", 0),
            "audit_critical": simple.get("guardia_audit_critical_total", 0),
        },
        "capacity": {
            "plans_total": simple.get("guardia_capacity_plans_total", 0),
            "critical": simple.get("guardia_capacity_critical", 0),
        },
        "infra": {
            "servers_total": simple.get("guardia_servers_total", 0),
            "users_active": simple.get("guardia_users_active", 0),
        },
        "api": {
            "requests_total": simple.get("guardia_api_requests_total", 0),
            "errors_total": simple.get("guardia_api_errors_total", 0),
        },
    }


@router.get("/health")
async def health_check(
    db: AsyncSession = Depends(get_db),
):
    """Health-check endpoint (for Grafana alerts / load-balancer probes).

    Responds 200 with status ``UP`` when a trivial query succeeds, otherwise
    503 with status ``DEGRADED``.
    """
    try:
        # Verify database connectivity.
        await db.execute(select(func.now()))
        db_ok = True
    except Exception as e:
        # Leave a trace of the reason; the probe response only says DOWN.
        logger.warning("헬스체크 DB 접속 오류: %s", e)
        db_ok = False

    status = "UP" if db_ok else "DEGRADED"
    code = 200 if db_ok else 503

    return JSONResponse(
        status_code=code,
        content={
            "status": status,
            "db": "UP" if db_ok else "DOWN",
            "uptime_s": round(time.time() - _start_time, 1),
            "checked_at": datetime.utcnow().isoformat() + "Z",
        },
    )


@router.get("/grafana-config")
async def grafana_config():
    """
    Example settings for wiring up Grafana datasources and dashboards.

    Apply the settings below in the Grafana UI.
    """
    return {
        "note": "아래 설정을 Grafana에 적용하세요.",
        "datasources": [
            {
                "name": "GUARDiA Prometheus",
                "type": "prometheus",
                "url": "http://localhost:9090",
                "access": "proxy",
                "note": "Prometheus가 /api/metrics/prometheus를 scrape하도록 설정 필요",
            },
            {
                "name": "GUARDiA JSON",
                "type": "simplejson",
                "url": "http://localhost:8000/api/metrics",
                "access": "proxy",
                "note": "Grafana Simple JSON datasource 플러그인 설치 필요",
            },
        ],
        "prometheus_scrape_config": {
            "scrape_configs": [
                {
                    "job_name": "guardia_itsm",
                    "metrics_path": "/api/metrics/prometheus",
                    "static_configs": [
                        {"targets": ["localhost:8000"]}
                    ],
                    "scrape_interval": "15s",
                }
            ]
        },
        "recommended_panels": [
            {"title": "SR 총 건수", "query": "guardia_sr_total"},
            {"title": "미처리 SR", "query": 'guardia_sr_by_status{status="OPEN"}'},
            {"title": "인시던트 오픈", "query": "guardia_incidents_open"},
            {"title": "CRITICAL 보안 이벤트", "query": "guardia_audit_critical_total"},
            {"title": "용량 위험 시스템", "query": "guardia_capacity_critical"},
            {"title": "활성 사용자", "query": "guardia_users_active"},
            {"title": "API 오류율", "query": "rate(guardia_api_errors_total[5m])"},
            {"title": "프로세스 업타임", "query": "guardia_process_uptime_seconds"},
        ],
    }


@router.get("/labels")
async def grafana_labels():
    """Grafana Simple JSON datasource /labels response."""
    # Derived from the /query mapping so the two cannot drift apart.
    return list(_GRAFANA_TARGETS)


@router.post("/query")
async def grafana_query(
    body: GrafanaQueryIn,
    db: AsyncSession = Depends(get_db),
):
    """
    Grafana Simple JSON datasource /query response.

    timeserie format: [ { target, datapoints: [[value, ts_ms], ...] } ]

    Each requested target gets a single datapoint holding the current value.
    A target name that is not in the mapping is looked up as a raw metric
    key, and a missing metric is reported as 0.
    """
    metrics = await _collect_metrics(db)
    now_ms = int(time.time() * 1000)

    result = []
    for target in body.targets:
        t_name = target.get("target", "")
        m_key = _GRAFANA_TARGETS.get(t_name, t_name)
        value = metrics.get(m_key, 0)
        result.append({
            "target": t_name,
            "datapoints": [[value, now_ms]],
        })

    return result