zioinfo-mail/workspace/guardia-itsm/routers/metrics.py
DESKTOP-TKLFCPR\ython cfe2901a55 refactor(structure): consolidate all projects under workspace/
- itsm/    -> workspace/guardia-itsm/
- manager/ -> workspace/guardia-manager/
- app/     -> workspace/guardia-messenger/
- manual/  -> workspace/guardia-docs/

workspace/zioinfo-web/ unchanged.
git mv preserves full commit history.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-31 23:50:56 +09:00

443 lines
17 KiB
Python

"""
E-4: Grafana integration (Prometheus metrics)
Features:
1. Expose metrics in Prometheus text format (/api/metrics/prometheus)
2. SR / incident / deployment / audit counters
3. SLA compliance-rate gauge
4. Server and user status gauges
5. Guidance for configuring Grafana JSON data sources
6. Health-check endpoint
Endpoints:
GET /api/metrics/prometheus — Prometheus text format
GET /api/metrics/summary — JSON summary (for Grafana Simple JSON)
GET /api/metrics/health — health check (status UP/DEGRADED, db UP/DOWN)
GET /api/metrics/grafana-config — example Grafana data source configuration
GET /api/metrics/labels — label list (Grafana Simple JSON /labels)
POST /api/metrics/query — Grafana Simple JSON /query
"""
from __future__ import annotations
import logging
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, Depends, Request
from fastapi.responses import PlainTextResponse, JSONResponse
from pydantic import BaseModel
from sqlalchemy import select, func, and_
from sqlalchemy.ext.asyncio import AsyncSession
from database import get_db
from models import User
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Every route declared on this router is served under the /api/metrics prefix.
router = APIRouter(prefix="/api/metrics", tags=["metrics"])
# ── In-memory counters (reset when the process restarts) ─────────────────────
# Nothing in the visible part of this file increments these values; presumably
# a middleware or another module updates them — TODO confirm.
_counters: Dict[str, float] = {
    "guardia_api_requests_total": 0,
    "guardia_api_errors_total": 0,
}
# Captured once at import time; uptime is reported relative to this instant.
_start_time: float = time.time()
# ── Pydantic schemas ─────────────────────────────────────────────────────────
class GrafanaQueryIn(BaseModel):
    """Request body accepted by POST /api/metrics/query.

    The query handler in this file reads only ``targets``; ``range`` and
    ``maxDataPoints`` are accepted but not consulted there.
    """

    # Time range sent by the client; optional free-form mapping.
    range: Optional[Dict[str, Any]] = None
    # One mapping per requested series; the handler reads each entry's "target" key.
    targets: List[Dict[str, Any]] = []
    # Maximum number of data points requested by the client (defaults to 100).
    maxDataPoints: int = 100
# ── Metric collection helpers ────────────────────────────────────────────────
def _escape_label_value(value: Any) -> str:
    """Format *value* as a Prometheus label value.

    Backslash, double quote and line feed are escaped as required by the
    text exposition format so that an unusual value cannot break the output.
    """
    text = f"{value}"
    return text.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")


async def _count_rows(db: AsyncSession, model: Any, *conditions: Any) -> int:
    """Return COUNT(*) over *model*, optionally restricted by *conditions*."""
    stmt = select(func.count()).select_from(model)
    if conditions:
        stmt = stmt.where(*conditions)
    return (await db.execute(stmt)).scalar() or 0


async def _add_grouped_counts(
    db: AsyncSession,
    metrics: Dict[str, Any],
    column: Any,
    metric_name: str,
    label_name: str,
    null_label: str,
) -> None:
    """Add one labelled sample per distinct value of *column* to *metrics*.

    Falsy column values (NULL, empty string) are reported under *null_label*.
    Counts that end up under the same label are summed, so a NULL group can
    no longer overwrite (or be overwritten by) the real group that shares
    the fallback label.
    """
    rows = (
        await db.execute(select(column, func.count()).group_by(column))
    ).all()
    for raw_value, count in rows:
        label = _escape_label_value(raw_value or null_label)
        key = f'{metric_name}{{{label_name}="{label}"}}'
        metrics[key] = metrics.get(key, 0) + count


async def _collect_metrics(db: AsyncSession) -> Dict[str, Any]:
    """Aggregate metrics from the database and return them as a dictionary.

    Keys are Prometheus sample names, optionally carrying a ``{label="value"}``
    suffix; values are numbers. Every section is best-effort: if its model
    cannot be imported or a query fails, the error is logged at DEBUG level
    and the section's plain metrics are reported as 0.

    Args:
        db: Async database session used for all aggregate queries.

    Returns:
        Mapping of sample name to value, in emission order.
    """
    # NOTE(review): on some backends a failed statement may leave the session's
    # transaction unusable, in which case the later sections would also fall
    # back to 0 — confirm whether a rollback is wanted in the handlers below.
    metrics: Dict[str, Any] = {}

    # ── Service request (SR) metrics ───────────────────────────────────────
    try:
        from models import SRRequest

        metrics["guardia_sr_total"] = await _count_rows(db, SRRequest)
        await _add_grouped_counts(
            db, metrics, SRRequest.status,
            "guardia_sr_by_status", "status", "UNKNOWN",
        )
        await _add_grouped_counts(
            db, metrics, SRRequest.priority,
            "guardia_sr_by_priority", "priority", "MEDIUM",
        )
        # SRs created during the last 24 hours. The cutoff is a naive UTC
        # datetime; presumably created_at is stored the same way — TODO confirm.
        since_24h = datetime.utcnow() - timedelta(hours=24)
        metrics["guardia_sr_last_24h"] = await _count_rows(
            db, SRRequest, SRRequest.created_at >= since_24h
        )
    except Exception as e:
        logger.debug("SR 메트릭 수집 오류: %s", e)
        metrics["guardia_sr_total"] = 0
        # Zero the sibling gauge too, as the other sections do on failure.
        metrics["guardia_sr_last_24h"] = 0

    # ── Incident metrics ───────────────────────────────────────────────────
    try:
        from models import Incident

        metrics["guardia_incidents_total"] = await _count_rows(db, Incident)
        metrics["guardia_incidents_open"] = await _count_rows(
            db, Incident, Incident.status.in_(["OPEN", "IN_PROGRESS"])
        )
    except Exception as e:
        logger.debug("인시던트 메트릭 수집 오류: %s", e)
        metrics["guardia_incidents_total"] = 0
        metrics["guardia_incidents_open"] = 0

    # ── Audit log metrics ──────────────────────────────────────────────────
    try:
        from models import AuditLog

        metrics["guardia_audit_events_total"] = await _count_rows(db, AuditLog)
        metrics["guardia_audit_critical_total"] = await _count_rows(
            db, AuditLog, AuditLog.severity == "CRITICAL"
        )
        await _add_grouped_counts(
            db, metrics, AuditLog.severity,
            "guardia_audit_by_severity", "severity", "INFO",
        )
    except Exception as e:
        logger.debug("감사 메트릭 수집 오류: %s", e)
        metrics["guardia_audit_events_total"] = 0
        metrics["guardia_audit_critical_total"] = 0

    # ── User metrics ───────────────────────────────────────────────────────
    try:
        metrics["guardia_users_total"] = await _count_rows(db, User)
        # "== True" is intentional: it builds a SQL expression, not a Python bool.
        metrics["guardia_users_active"] = await _count_rows(
            db, User, User.is_active == True  # noqa: E712
        )
    except Exception as e:
        logger.debug("사용자 메트릭 수집 오류: %s", e)
        metrics["guardia_users_total"] = 0
        metrics["guardia_users_active"] = 0

    # ── Capacity management metrics ────────────────────────────────────────
    try:
        from models import CapacityPlan

        metrics["guardia_capacity_plans_total"] = await _count_rows(db, CapacityPlan)
        metrics["guardia_capacity_critical"] = await _count_rows(
            db, CapacityPlan, CapacityPlan.status.in_(["CRITICAL", "OVERLOAD"])
        )
    except Exception as e:
        logger.debug("용량 메트릭 수집 오류: %s", e)
        metrics["guardia_capacity_plans_total"] = 0
        metrics["guardia_capacity_critical"] = 0

    # ── Server / CMDB metrics ──────────────────────────────────────────────
    try:
        from models import Server

        metrics["guardia_servers_total"] = await _count_rows(db, Server)
    except Exception as e:
        # Logged like every other section instead of being swallowed silently.
        logger.debug("서버 메트릭 수집 오류: %s", e)
        metrics["guardia_servers_total"] = 0

    # ── Process metrics ────────────────────────────────────────────────────
    metrics["guardia_process_uptime_seconds"] = round(time.time() - _start_time, 1)
    metrics["guardia_api_requests_total"] = _counters["guardia_api_requests_total"]
    metrics["guardia_api_errors_total"] = _counters["guardia_api_errors_total"]
    return metrics
def _to_prometheus_text(metrics: Dict[str, Any]) -> str:
"""메트릭 딕셔너리를 Prometheus text 포맷으로 변환."""
HELP = {
"guardia_sr_total": "Total number of service requests",
"guardia_sr_last_24h": "Service requests created in last 24 hours",
"guardia_incidents_total": "Total number of incidents",
"guardia_incidents_open": "Currently open incidents",
"guardia_audit_events_total": "Total audit log events",
"guardia_audit_critical_total": "Critical severity audit events",
"guardia_users_total": "Total registered users",
"guardia_users_active": "Currently active users",
"guardia_capacity_plans_total": "Total capacity plans",
"guardia_capacity_critical": "Capacity plans in critical/overload state",
"guardia_servers_total": "Total managed servers",
"guardia_process_uptime_seconds": "Process uptime in seconds",
"guardia_api_requests_total": "Total API requests handled",
"guardia_api_errors_total": "Total API errors",
}
TYPE = {
"guardia_sr_total": "counter",
"guardia_sr_last_24h": "gauge",
"guardia_incidents_total": "counter",
"guardia_incidents_open": "gauge",
"guardia_audit_events_total": "counter",
"guardia_audit_critical_total": "counter",
"guardia_users_total": "gauge",
"guardia_users_active": "gauge",
"guardia_capacity_plans_total": "gauge",
"guardia_capacity_critical": "gauge",
"guardia_servers_total": "gauge",
"guardia_process_uptime_seconds": "gauge",
"guardia_api_requests_total": "counter",
"guardia_api_errors_total": "counter",
}
lines: List[str] = []
emitted_headers: set = set()
ts_ms = int(time.time() * 1000)
for key, value in metrics.items():
# 기본 메트릭명 추출 (레이블 제거)
base_name = key.split("{")[0]
if base_name not in emitted_headers:
if base_name in HELP:
lines.append(f"# HELP {base_name} {HELP[base_name]}")
lines.append(f"# TYPE {base_name} {TYPE.get(base_name, 'gauge')}")
emitted_headers.add(base_name)
lines.append(f"{key} {value} {ts_ms}")
return "\n".join(lines) + "\n"
# ── Endpoints ────────────────────────────────────────────────────────────────
@router.get("/prometheus", response_class=PlainTextResponse)
async def prometheus_metrics(
    db: AsyncSession = Depends(get_db),
):
    """
    Prometheus scrape endpoint.

    Example Prometheus configuration:
        scrape_configs:
          - job_name: guardia_itsm
            static_configs:
              - targets: ['localhost:8000']
            metrics_path: /api/metrics/prometheus
    """
    # Collect from the database, then render straight into exposition text.
    exposition = _to_prometheus_text(await _collect_metrics(db))
    return PlainTextResponse(
        exposition,
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )
@router.get("/summary")
async def metrics_summary(
    db: AsyncSession = Depends(get_db),
):
    """Return a JSON summary of the metrics (for the Grafana Simple JSON datasource).

    The SLA rate is a simplified figure: every SR that is not OPEN or
    IN_PROGRESS counts as resolved, and the rate is 100.0 when there are
    no SRs at all.
    """
    metrics = await _collect_metrics(db)

    # Keep only plain metrics, i.e. those without a {label="value"} suffix.
    simple = {k: v for k, v in metrics.items() if "{" not in k}

    # Simplified SLA compliance rate.
    sr_total = simple.get("guardia_sr_total", 0)
    sr_open = sum(
        metrics.get(f'guardia_sr_by_status{{status="{status}"}}', 0)
        for status in ("OPEN", "IN_PROGRESS")
    )
    # The collector resets the total to 0 when SR collection fails part-way
    # while per-status samples may already be present; never report a
    # negative resolved count in that case.
    sr_resolved = max(sr_total - sr_open, 0)
    sla_rate = round(sr_resolved / sr_total * 100, 1) if sr_total > 0 else 100.0

    return {
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "uptime_seconds": simple.get("guardia_process_uptime_seconds", 0),
        "sr": {
            "total": sr_total,
            "last_24h": simple.get("guardia_sr_last_24h", 0),
            "open": sr_open,
            "resolved": sr_resolved,
            "sla_rate_pct": sla_rate,
        },
        "incidents": {
            "total": simple.get("guardia_incidents_total", 0),
            "open": simple.get("guardia_incidents_open", 0),
        },
        "security": {
            "audit_total": simple.get("guardia_audit_events_total", 0),
            "audit_critical": simple.get("guardia_audit_critical_total", 0),
        },
        "capacity": {
            "plans_total": simple.get("guardia_capacity_plans_total", 0),
            "critical": simple.get("guardia_capacity_critical", 0),
        },
        "infra": {
            "servers_total": simple.get("guardia_servers_total", 0),
            "users_active": simple.get("guardia_users_active", 0),
        },
        "api": {
            "requests_total": simple.get("guardia_api_requests_total", 0),
            "errors_total": simple.get("guardia_api_errors_total", 0),
        },
    }
@router.get("/health")
async def health_check(
    db: AsyncSession = Depends(get_db),
):
    """Health-check endpoint (for Grafana alerts / load-balancer probes).

    Responds with HTTP 200 and status "UP" when a trivial database query
    succeeds, otherwise with HTTP 503 and status "DEGRADED".
    """
    try:
        # Verify database connectivity with a trivial query.
        await db.execute(select(func.now()))
    except Exception as e:
        # Record why the probe failed; without this a 503 leaves no trace.
        logger.warning("헬스체크 DB 접속 실패: %s", e)
        db_ok = False
    else:
        db_ok = True

    status = "UP" if db_ok else "DEGRADED"
    code = 200 if db_ok else 503
    return JSONResponse(
        status_code=code,
        content={
            "status": status,
            "db": "UP" if db_ok else "DOWN",
            "uptime_s": round(time.time() - _start_time, 1),
            "checked_at": datetime.utcnow().isoformat() + "Z",
        },
    )
@router.get("/grafana-config")
async def grafana_config():
    """
    Example settings for wiring Grafana data sources and dashboards.

    Apply the values returned here in the Grafana UI.
    """
    # Data source that reads from a Prometheus server scraping this service.
    prometheus_source = {
        "name": "GUARDiA Prometheus",
        "type": "prometheus",
        "url": "http://localhost:9090",
        "access": "proxy",
        "note": "Prometheus가 /api/metrics/prometheus를 scrape하도록 설정 필요",
    }
    # Data source that talks to this router's JSON endpoints directly.
    json_source = {
        "name": "GUARDiA JSON",
        "type": "simplejson",
        "url": "http://localhost:8000/api/metrics",
        "access": "proxy",
        "note": "Grafana Simple JSON datasource 플러그인 설치 필요",
    }
    scrape_job = {
        "job_name": "guardia_itsm",
        "metrics_path": "/api/metrics/prometheus",
        "static_configs": [
            {"targets": ["localhost:8000"]},
        ],
        "scrape_interval": "15s",
    }
    # (panel title, query) pairs, in the order they are returned.
    panel_queries = (
        ("SR 총 건수", "guardia_sr_total"),
        ("미처리 SR", 'guardia_sr_by_status{status="OPEN"}'),
        ("인시던트 오픈", "guardia_incidents_open"),
        ("CRITICAL 보안 이벤트", "guardia_audit_critical_total"),
        ("용량 위험 시스템", "guardia_capacity_critical"),
        ("활성 사용자", "guardia_users_active"),
        ("API 오류율", "rate(guardia_api_errors_total[5m])"),
        ("프로세스 업타임", "guardia_process_uptime_seconds"),
    )
    return {
        "note": "아래 설정을 Grafana에 적용하세요.",
        "datasources": [prometheus_source, json_source],
        "prometheus_scrape_config": {"scrape_configs": [scrape_job]},
        "recommended_panels": [
            {"title": title, "query": query} for title, query in panel_queries
        ],
    }
@router.get("/labels")
async def grafana_labels():
    """Response for the Grafana Simple JSON datasource /labels request."""
    # NOTE(review): Simple JSON style plugins typically discover targets via
    # /search — confirm that /labels is what the installed plugin calls.
    label_names = (
        "sr_total",
        "sr_last_24h",
        "incidents_open",
        "audit_critical",
        "capacity_critical",
        "users_active",
        "servers_total",
    )
    return list(label_names)
@router.post("/query")
async def grafana_query(
    body: GrafanaQueryIn,
    db: AsyncSession = Depends(get_db),
):
    """
    Response for the Grafana Simple JSON datasource /query request.

    Timeserie format: [ { target, datapoints: [[value, ts_ms], ...] } ]

    Each requested target yields exactly one data point holding the current
    value; a target that maps to no collected metric is reported as 0.
    """
    # Nothing was asked for: skip the aggregate database queries entirely.
    if not body.targets:
        return []

    metrics = await _collect_metrics(db)
    now_ms = int(time.time() * 1000)

    # Short target names offered to Grafana -> collected metric names.
    METRIC_MAP = {
        "sr_total": "guardia_sr_total",
        "sr_last_24h": "guardia_sr_last_24h",
        "incidents_open": "guardia_incidents_open",
        "audit_critical": "guardia_audit_critical_total",
        "capacity_critical": "guardia_capacity_critical",
        "users_active": "guardia_users_active",
        "servers_total": "guardia_servers_total",
    }

    result = []
    for target in body.targets:
        t_name = target.get("target", "")
        # Unmapped names are looked up as raw metric names.
        m_key = METRIC_MAP.get(t_name, t_name)
        result.append({
            "target": t_name,
            "datapoints": [[metrics.get(m_key, 0), now_ms]],
        })
    return result