G-1: 메신저 Webhook Relay + _send_to_room 실제 httpx 호출 구현 G-2: POST /api/tasks/bulk SR 대량작업 엔드포인트 (최대 100건) G-3: 라이선스 만료 알림 스케줄러 (매일 09:00 KST) G-4: 체험판 upgrade_banner 필드 + license.py 배너 로직 G-5: core/auto_rca.py + incidents/problem auto-rca 엔드포인트 G-6: core/deploy_impact.py + vibe impact-analysis 엔드포인트 G-7: core/ticket_classifier.py + SR 생성 시 AI 분류 + ai-suggestion API G-8: VulnPatchRecord 모델 + vuln_scan 패치추적 4개 엔드포인트 G-9: core/jira_sync.py + gateway Jira/Confluence 연동 엔드포인트 G-10: core/push_notify.py + routers/push.py + PushSubscription 모델 G-11: approvals 다중승인 (위임/서명/기한초과/마감연장) G-12: alembic.ini + migrations/ + cicd/migrate_to_postgres.sh 하네스: guardia-orchestrator 확장기능 Phase 반영 봇명령어: /sr /status /license /bulk 슬래시 명령어 추가 설치스크립트: setup/ (Ubuntu, CentOS, RHEL, Windows) --test 옵션 포함 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
443 lines
17 KiB
Python
443 lines
17 KiB
Python
"""
|
|
E-4: Grafana 연동 (Prometheus 메트릭)
|
|
|
|
기능:
|
|
1. Prometheus text-format 메트릭 노출 (/api/metrics/prometheus)
|
|
2. SR / 인시던트 / 배포 / 감사 카운터
|
|
3. SLA 준수율 게이지
|
|
4. 서버·사용자 현황 게이지
|
|
5. Grafana JSON 데이터소스 설정 안내
|
|
6. 헬스체크 엔드포인트
|
|
|
|
엔드포인트:
|
|
GET /api/metrics/prometheus — Prometheus text 포맷
|
|
GET /api/metrics/summary — JSON 요약 (Grafana Simple JSON 용)
|
|
GET /api/metrics/health — 헬스체크 (UP/DOWN)
|
|
GET /api/metrics/grafana-config — Grafana 데이터소스 설정 예시
|
|
GET /api/metrics/labels — 레이블 목록 (Grafana Simple JSON /labels)
|
|
POST /api/metrics/query — Grafana Simple JSON /query
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from datetime import datetime, timedelta
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from fastapi import APIRouter, Depends, Request
|
|
from fastapi.responses import PlainTextResponse, JSONResponse
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import select, func, and_
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from database import get_db
|
|
from models import User
|
|
|
|
# Module-level logger and the router that exposes every endpoint in this
# file under the /api/metrics prefix.
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/metrics", tags=["metrics"])

# ── In-memory counters (reset whenever the process restarts) ────────────────
# NOTE(review): nothing in the visible part of this file increments these;
# presumably request middleware elsewhere does — confirm.
_counters: Dict[str, float] = {
    "guardia_api_requests_total": 0,
    "guardia_api_errors_total": 0,
}
# Wall-clock time captured at import; uptime is reported relative to it.
_start_time: float = time.time()
|
|
|
|
|
|
# ── Pydantic 스키마 ──────────────────────────────────────────────────────────
|
|
|
|
# Request body accepted by POST /api/metrics/query (Grafana Simple JSON
# datasource "/query" call).
class GrafanaQueryIn(BaseModel):
    # Time range sent by the client; the query handler in this file does not
    # read it.
    range: Optional[Dict[str, Any]] = None
    # Requested series; the handler reads each entry's "target" key.
    targets: List[Dict[str, Any]] = []
    # Accepted for client compatibility; the handler in this file does not
    # read it.
    maxDataPoints: int = 100
|
|
|
|
|
|
# ── 메트릭 수집 헬퍼 ─────────────────────────────────────────────────────────
|
|
|
|
def _escape_label_value(value: Any) -> str:
    """Return *value* formatted and escaped for a Prometheus label value.

    The text exposition format requires backslash, double quote and newline
    to be escaped inside label values.
    """
    return (
        f"{value}"
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def _add_labeled_count(
    metrics: Dict[str, Any],
    name: str,
    label: str,
    value: Any,
    fallback: str,
    count: int,
) -> None:
    """Add *count* to the sample ``name{label="value"}`` in *metrics*.

    A falsy *value* (e.g. NULL in the database) is reported as *fallback*.
    Counts are accumulated rather than assigned so that a NULL group and a
    real group that share the fallback text do not overwrite each other.
    """
    key = f'{name}{{{label}="{_escape_label_value(value or fallback)}"}}'
    metrics[key] = metrics.get(key, 0) + count


async def _scalar_count(db: AsyncSession, stmt: Any) -> int:
    """Execute a COUNT statement and return its scalar result (0 if empty)."""
    return (await db.execute(stmt)).scalar() or 0


async def _collect_metrics(db: AsyncSession) -> Dict[str, Any]:
    """Aggregate metrics from the database into a flat dictionary.

    Keys are Prometheus sample names, optionally carrying one label
    (``name{label="value"}``); values are numeric.  Each section is collected
    on a best-effort basis: if a section raises (including a failed model
    import), the error is logged at debug level and that section's headline
    metrics are reported as 0.

    Args:
        db: Async SQLAlchemy session used for all queries.

    Returns:
        Mapping of sample name to value, in collection order.
    """
    metrics: Dict[str, Any] = {}

    # ── Service request (SR) metrics ────────────────────────────────────────
    try:
        from models import SRRequest

        metrics["guardia_sr_total"] = await _scalar_count(
            db, select(func.count()).select_from(SRRequest)
        )

        by_status_rows = (await db.execute(
            select(SRRequest.status, func.count())
            .group_by(SRRequest.status)
        )).all()
        for status, cnt in by_status_rows:
            _add_labeled_count(
                metrics, "guardia_sr_by_status", "status", status, "UNKNOWN", cnt
            )

        by_priority_rows = (await db.execute(
            select(SRRequest.priority, func.count())
            .group_by(SRRequest.priority)
        )).all()
        for priority, cnt in by_priority_rows:
            _add_labeled_count(
                metrics, "guardia_sr_by_priority", "priority", priority, "MEDIUM", cnt
            )

        # SRs created in the last 24 hours
        since_24h = datetime.utcnow() - timedelta(hours=24)
        metrics["guardia_sr_last_24h"] = await _scalar_count(
            db,
            select(func.count()).select_from(SRRequest)
            .where(SRRequest.created_at >= since_24h),
        )
    except Exception as e:
        logger.debug("SR 메트릭 수집 오류: %s", e)
        metrics["guardia_sr_total"] = 0

    # ── Incident metrics ────────────────────────────────────────────────────
    try:
        from models import Incident

        metrics["guardia_incidents_total"] = await _scalar_count(
            db, select(func.count()).select_from(Incident)
        )
        metrics["guardia_incidents_open"] = await _scalar_count(
            db,
            select(func.count()).select_from(Incident)
            .where(Incident.status.in_(["OPEN", "IN_PROGRESS"])),
        )
    except Exception as e:
        logger.debug("인시던트 메트릭 수집 오류: %s", e)
        metrics["guardia_incidents_total"] = 0
        metrics["guardia_incidents_open"] = 0

    # ── Audit log metrics ───────────────────────────────────────────────────
    try:
        from models import AuditLog

        metrics["guardia_audit_events_total"] = await _scalar_count(
            db, select(func.count()).select_from(AuditLog)
        )
        metrics["guardia_audit_critical_total"] = await _scalar_count(
            db,
            select(func.count()).select_from(AuditLog)
            .where(AuditLog.severity == "CRITICAL"),
        )

        by_sev_rows = (await db.execute(
            select(AuditLog.severity, func.count())
            .group_by(AuditLog.severity)
        )).all()
        for sev, cnt in by_sev_rows:
            _add_labeled_count(
                metrics, "guardia_audit_by_severity", "severity", sev, "INFO", cnt
            )
    except Exception as e:
        logger.debug("감사 메트릭 수집 오류: %s", e)
        metrics["guardia_audit_events_total"] = 0
        metrics["guardia_audit_critical_total"] = 0

    # ── User metrics ────────────────────────────────────────────────────────
    try:
        metrics["guardia_users_total"] = await _scalar_count(
            db, select(func.count()).select_from(User)
        )
        metrics["guardia_users_active"] = await _scalar_count(
            db,
            select(func.count()).select_from(User)
            .where(User.is_active == True),
        )
    except Exception as e:
        logger.debug("사용자 메트릭 수집 오류: %s", e)
        metrics["guardia_users_total"] = 0
        metrics["guardia_users_active"] = 0

    # ── Capacity management metrics ─────────────────────────────────────────
    try:
        from models import CapacityPlan

        metrics["guardia_capacity_plans_total"] = await _scalar_count(
            db, select(func.count()).select_from(CapacityPlan)
        )
        metrics["guardia_capacity_critical"] = await _scalar_count(
            db,
            select(func.count()).select_from(CapacityPlan)
            .where(CapacityPlan.status.in_(["CRITICAL", "OVERLOAD"])),
        )
    except Exception as e:
        logger.debug("용량 메트릭 수집 오류: %s", e)
        metrics["guardia_capacity_plans_total"] = 0
        metrics["guardia_capacity_critical"] = 0

    # ── Server / CMDB metrics ───────────────────────────────────────────────
    try:
        from models import Server

        metrics["guardia_servers_total"] = await _scalar_count(
            db, select(func.count()).select_from(Server)
        )
    except Exception as e:
        # Logged like every other section instead of being dropped silently.
        logger.debug("서버 메트릭 수집 오류: %s", e)
        metrics["guardia_servers_total"] = 0

    # NOTE(review): on some databases a failed statement leaves the session's
    # transaction unusable until rollback, which would make every later
    # section fail too — confirm whether a rollback is needed in the handlers.

    # ── Process metrics ─────────────────────────────────────────────────────
    metrics["guardia_process_uptime_seconds"] = round(time.time() - _start_time, 1)
    metrics["guardia_api_requests_total"] = _counters["guardia_api_requests_total"]
    metrics["guardia_api_errors_total"] = _counters["guardia_api_errors_total"]

    return metrics
|
|
|
|
|
|
def _to_prometheus_text(metrics: Dict[str, Any]) -> str:
    """Render a metrics mapping in the Prometheus text exposition format.

    Each key is a sample name, optionally followed by a ``{label="..."}``
    suffix.  The first time a base name is seen, a ``# HELP`` line (only for
    names with a known description) and a ``# TYPE`` line (``gauge`` when the
    name is unknown) are written before the sample.  Every sample line carries
    the same millisecond timestamp taken at the start of rendering.
    """
    # base metric name -> (help text, metric type)
    catalog = {
        "guardia_sr_total": ("Total number of service requests", "counter"),
        "guardia_sr_last_24h": ("Service requests created in last 24 hours", "gauge"),
        "guardia_incidents_total": ("Total number of incidents", "counter"),
        "guardia_incidents_open": ("Currently open incidents", "gauge"),
        "guardia_audit_events_total": ("Total audit log events", "counter"),
        "guardia_audit_critical_total": ("Critical severity audit events", "counter"),
        "guardia_users_total": ("Total registered users", "gauge"),
        "guardia_users_active": ("Currently active users", "gauge"),
        "guardia_capacity_plans_total": ("Total capacity plans", "gauge"),
        "guardia_capacity_critical": ("Capacity plans in critical/overload state", "gauge"),
        "guardia_servers_total": ("Total managed servers", "gauge"),
        "guardia_process_uptime_seconds": ("Process uptime in seconds", "gauge"),
        "guardia_api_requests_total": ("Total API requests handled", "counter"),
        "guardia_api_errors_total": ("Total API errors", "counter"),
    }

    stamp = int(time.time() * 1000)
    announced = set()
    out = []

    for sample, reading in metrics.items():
        # Strip the label part to get the base metric name
        family = sample.partition("{")[0]
        if family not in announced:
            announced.add(family)
            known = catalog.get(family)
            if known is None:
                out.append(f"# TYPE {family} gauge")
            else:
                description, kind = known
                out.append(f"# HELP {family} {description}")
                out.append(f"# TYPE {family} {kind}")
        out.append(f"{sample} {reading} {stamp}")

    return "\n".join(out) + "\n"
|
|
|
|
|
|
# ── 엔드포인트 ───────────────────────────────────────────────────────────────
|
|
|
|
@router.get("/prometheus", response_class=PlainTextResponse)
async def prometheus_metrics(
    db: AsyncSession = Depends(get_db),
):
    """
    Prometheus scrape endpoint.

    Example Prometheus configuration:
      scrape_configs:
        - job_name: guardia_itsm
          static_configs:
            - targets: ['localhost:8000']
          metrics_path: /api/metrics/prometheus
    """
    # Collect once, render, and send with the exposition-format content type.
    payload = _to_prometheus_text(await _collect_metrics(db))
    return PlainTextResponse(
        payload,
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )
|
|
|
|
|
|
@router.get("/summary")
async def metrics_summary(
    db: AsyncSession = Depends(get_db),
):
    """Return a JSON metrics summary (for the Grafana Simple JSON datasource)."""
    collected = await _collect_metrics(db)

    # Only metrics without labels are exposed directly
    flat = {name: val for name, val in collected.items() if "{" not in name}

    def pick(name):
        return flat.get(name, 0)

    # Simplified SLA compliance: share of SRs that are neither OPEN nor IN_PROGRESS
    total = pick("guardia_sr_total")
    pending = sum(
        val
        for name, val in collected.items()
        if "guardia_sr_by_status" in name
        and ('"OPEN"' in name or '"IN_PROGRESS"' in name)
    )
    closed = total - pending
    if total > 0:
        sla_pct = round(closed / total * 100, 1)
    else:
        sla_pct = 100.0

    sr_section = {
        "total": total,
        "last_24h": pick("guardia_sr_last_24h"),
        "open": pending,
        "resolved": closed,
        "sla_rate_pct": sla_pct,
    }
    incidents_section = {
        "total": pick("guardia_incidents_total"),
        "open": pick("guardia_incidents_open"),
    }
    security_section = {
        "audit_total": pick("guardia_audit_events_total"),
        "audit_critical": pick("guardia_audit_critical_total"),
    }
    capacity_section = {
        "plans_total": pick("guardia_capacity_plans_total"),
        "critical": pick("guardia_capacity_critical"),
    }
    infra_section = {
        "servers_total": pick("guardia_servers_total"),
        "users_active": pick("guardia_users_active"),
    }
    api_section = {
        "requests_total": pick("guardia_api_requests_total"),
        "errors_total": pick("guardia_api_errors_total"),
    }

    return {
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "uptime_seconds": pick("guardia_process_uptime_seconds"),
        "sr": sr_section,
        "incidents": incidents_section,
        "security": security_section,
        "capacity": capacity_section,
        "infra": infra_section,
        "api": api_section,
    }
|
|
|
|
|
|
@router.get("/health")
async def health_check(
    db: AsyncSession = Depends(get_db),
):
    """Health check endpoint (for Grafana alerts / load-balancer probes).

    Runs a trivial query against the database.  Responds 200 with status
    "UP" when it succeeds, or 503 with status "DEGRADED" (db "DOWN") when it
    raises.  The failure is logged so that a failing probe can be diagnosed.
    """
    try:
        # Verify database connectivity
        await db.execute(select(func.now()))
    except Exception as exc:
        # Previously swallowed silently; keep the best-effort response but
        # leave a trace of why the probe failed.
        logger.warning("헬스체크 DB 접속 오류: %s", exc)
        db_ok = False
    else:
        db_ok = True

    status = "UP" if db_ok else "DEGRADED"
    code = 200 if db_ok else 503

    return JSONResponse(
        status_code=code,
        content={
            "status": status,
            "db": "UP" if db_ok else "DOWN",
            "uptime_s": round(time.time() - _start_time, 1),
            "checked_at": datetime.utcnow().isoformat() + "Z",
        },
    )
|
|
|
|
|
|
@router.get("/grafana-config")
async def grafana_config():
    """
    Example Grafana datasource and dashboard integration settings.

    Apply the settings below in the Grafana UI to connect.
    """
    prometheus_source = {
        "name": "GUARDiA Prometheus",
        "type": "prometheus",
        "url": "http://localhost:9090",
        "access": "proxy",
        "note": "Prometheus가 /api/metrics/prometheus를 scrape하도록 설정 필요",
    }
    json_source = {
        "name": "GUARDiA JSON",
        "type": "simplejson",
        "url": "http://localhost:8000/api/metrics",
        "access": "proxy",
        "note": "Grafana Simple JSON datasource 플러그인 설치 필요",
    }

    scrape_job = {
        "job_name": "guardia_itsm",
        "metrics_path": "/api/metrics/prometheus",
        "static_configs": [
            {"targets": ["localhost:8000"]}
        ],
        "scrape_interval": "15s",
    }

    # (panel title, PromQL query) pairs, expanded into dicts below
    panel_specs = (
        ("SR 총 건수", "guardia_sr_total"),
        ("미처리 SR", 'guardia_sr_by_status{status="OPEN"}'),
        ("인시던트 오픈", "guardia_incidents_open"),
        ("CRITICAL 보안 이벤트", "guardia_audit_critical_total"),
        ("용량 위험 시스템", "guardia_capacity_critical"),
        ("활성 사용자", "guardia_users_active"),
        ("API 오류율", "rate(guardia_api_errors_total[5m])"),
        ("프로세스 업타임", "guardia_process_uptime_seconds"),
    )

    return {
        "note": "아래 설정을 Grafana에 적용하세요.",
        "datasources": [prometheus_source, json_source],
        "prometheus_scrape_config": {"scrape_configs": [scrape_job]},
        "recommended_panels": [
            {"title": title, "query": query} for title, query in panel_specs
        ],
    }
|
|
|
|
|
|
@router.get("/labels")
async def grafana_labels():
    """Return the target names for the Grafana Simple JSON datasource /labels call."""
    available = [
        "sr_total",
        "sr_last_24h",
        "incidents_open",
        "audit_critical",
        "capacity_critical",
        "users_active",
        "servers_total",
    ]
    return available
|
|
|
|
|
|
@router.post("/query")
async def grafana_query(
    body: GrafanaQueryIn,
    db: AsyncSession = Depends(get_db),
):
    """
    Answer the Grafana Simple JSON datasource /query call.

    timeserie format: [ { target, datapoints: [[value, ts_ms], ...] } ]
    Each requested target gets a single datapoint holding the current value.
    """
    snapshot = await _collect_metrics(db)
    stamp_ms = int(time.time() * 1000)

    # Short target name -> collected metric key; unknown names are looked up as-is
    aliases = {
        "sr_total": "guardia_sr_total",
        "sr_last_24h": "guardia_sr_last_24h",
        "incidents_open": "guardia_incidents_open",
        "audit_critical": "guardia_audit_critical_total",
        "capacity_critical": "guardia_capacity_critical",
        "users_active": "guardia_users_active",
        "servers_total": "guardia_servers_total",
    }

    requested = [entry.get("target", "") for entry in body.targets]
    return [
        {
            "target": wanted,
            "datapoints": [[snapshot.get(aliases.get(wanted, wanted), 0), stamp_ms]],
        }
        for wanted in requested
    ]
|