zioinfo-mail/workspace/guardia-itsm/routers/anomaly.py
DESKTOP-TKLFCPR\ython cfe2901a55 refactor(structure): consolidate all projects under workspace/
- itsm/    -> workspace/guardia-itsm/
- manager/ -> workspace/guardia-manager/
- app/     -> workspace/guardia-messenger/
- manual/  -> workspace/guardia-docs/

workspace/zioinfo-web/ unchanged.
git mv preserves full commit history.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-31 23:50:56 +09:00

525 lines
18 KiB
Python

"""
B-1: AI 이상 탐지 API 라우터
엔드포인트:
POST /api/anomaly/metrics — 메트릭 수집 (단건)
POST /api/anomaly/metrics/batch — 메트릭 일괄 수집
GET /api/anomaly/metrics/{source} — 메트릭 히스토리
POST /api/anomaly/detect — 단순 탐지 실행 (DB 저장 없이)
GET /api/anomaly/events — 이상 이벤트 목록
GET /api/anomaly/events/{id} — 이상 이벤트 상세
PATCH /api/anomaly/events/{id}/acknowledge — 인지 처리
PATCH /api/anomaly/events/{id}/resolve — 해결 처리
POST /api/anomaly/rules — 탐지 룰 생성
GET /api/anomaly/rules — 탐지 룰 목록
DELETE /api/anomaly/rules/{id} — 룰 삭제
GET /api/anomaly/summary — 요약 통계
POST /api/anomaly/simulate — 시뮬레이션 (테스트)
"""
from __future__ import annotations
import json
import logging
from datetime import datetime, timedelta
from typing import List, Optional
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query
from sqlalchemy import func, select, and_, desc
from sqlalchemy.ext.asyncio import AsyncSession
from database import get_db
from models import (
AnomalyEvent, AnomalyEventOut,
AnomalyRule, AnomalyRuleCreate, AnomalyRuleOut,
MetricSnapshot, MetricSnapshotIn, MetricSnapshotOut,
SimulateMetricIn,
)
from core.anomaly import (
run_rules_on_metric,
run_detection,
generate_simulation_data,
fetch_recent_values,
METRIC_UNITS,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/anomaly", tags=["anomaly"])
# ── 헬퍼 ─────────────────────────────────────────────────────────────────────
def _get_current_user(token: Optional[str] = None) -> str:
"""간소화된 사용자 추출 (실제 환경에서는 JWT 검증)."""
return "system"
# ── 메트릭 수집 ───────────────────────────────────────────────────────────────
@router.post("/metrics", status_code=202)
async def ingest_metric(
body: MetricSnapshotIn,
background_tasks: BackgroundTasks,
db: AsyncSession = Depends(get_db),
):
"""
메트릭 단건 수집 + 백그라운드에서 이상 탐지 실행.
외부 모니터링 에이전트, cron, 배치에서 주기적으로 호출.
"""
# 1. 스냅샷 저장
snap = MetricSnapshot(
server_id = body.server_id,
source = body.source,
metric_type = body.metric_type,
value = body.value,
unit = body.unit,
tags = json.dumps(body.tags, ensure_ascii=False) if body.tags else None,
)
db.add(snap)
await db.commit()
await db.refresh(snap)
# 2. 백그라운드 이상 탐지
background_tasks.add_task(
_detect_background,
snap.id,
body.source,
body.metric_type,
body.value,
body.unit,
body.server_id,
)
return {"snapshot_id": snap.id, "status": "accepted", "detection": "queued"}
@router.post("/metrics/batch", status_code=202)
async def ingest_metrics_batch(
body: List[MetricSnapshotIn],
background_tasks: BackgroundTasks,
db: AsyncSession = Depends(get_db),
):
"""일괄 메트릭 수집 (최대 100건)."""
if len(body) > 100:
raise HTTPException(400, "최대 100건까지 허용됩니다.")
ids = []
for m in body:
snap = MetricSnapshot(
server_id = m.server_id,
source = m.source,
metric_type = m.metric_type,
value = m.value,
unit = m.unit,
tags = json.dumps(m.tags, ensure_ascii=False) if m.tags else None,
)
db.add(snap)
await db.flush()
ids.append(snap.id)
await db.commit()
# 메트릭별 그룹화하여 탐지
for m in body:
background_tasks.add_task(
_detect_background,
None, # snap_id
m.source,
m.metric_type,
m.value,
m.unit,
m.server_id,
)
return {"saved": len(ids), "detection": "queued"}
async def _detect_background(
snap_id: Optional[int],
source: str,
metric_type: str,
current_value: float,
unit: Optional[str],
server_id: Optional[int],
):
"""백그라운드 이상 탐지 실행 + 알림."""
from database import SessionLocal
async with SessionLocal() as db:
try:
events = await run_rules_on_metric(
db=db,
source=source,
metric_type=metric_type,
current_value=current_value,
unit=unit,
server_id=server_id,
include_llm=False, # 실시간 수집 시 LLM 분석 스킵 (성능)
)
if events:
logger.warning(
"[B-1 이상 탐지] %s 개 이벤트 생성: %s",
len(events),
[e["title"] for e in events],
)
# 알림 발송 시도 (실패해도 무시)
try:
from core.notify import broadcast_sse
for evt in events:
await broadcast_sse("anomaly_detected", evt)
except Exception as e:
logger.debug("알림 발송 실패: %s", e)
except Exception as e:
logger.error("[B-1] 탐지 백그라운드 오류: %s", e)
# ── 메트릭 히스토리 ───────────────────────────────────────────────────────────
@router.get("/metrics/{source}", response_model=List[MetricSnapshotOut])
async def get_metric_history(
source: str,
metric_type: Optional[str] = Query(None),
hours: int = Query(24, ge=1, le=168),
limit: int = Query(200, ge=1, le=1000),
db: AsyncSession = Depends(get_db),
):
"""메트릭 히스토리 조회 (기본 24시간)."""
since = datetime.utcnow() - timedelta(hours=hours)
q = (
select(MetricSnapshot)
.where(
and_(
MetricSnapshot.source == source,
MetricSnapshot.recorded_at >= since,
)
)
.order_by(desc(MetricSnapshot.recorded_at))
.limit(limit)
)
if metric_type:
q = q.where(MetricSnapshot.metric_type == metric_type)
rows = (await db.execute(q)).scalars().all()
return rows
# ── 단순 탐지 (DB 저장 없이) ─────────────────────────────────────────────────
@router.post("/detect")
async def detect_anomaly(
source: str = Query(...),
metric_type: str = Query(...),
current_value: float = Query(...),
method: str = Query("ZSCORE"),
z_threshold: float = Query(3.0),
iqr_factor: float = Query(1.5),
threshold: Optional[float] = Query(None),
db: AsyncSession = Depends(get_db),
):
"""
히스토리 조회 + 이상 탐지 실행 (이벤트 생성 없이 결과만 반환).
UI에서 실시간 분석 미리보기 용도.
"""
history = await fetch_recent_values(db, source, metric_type)
result = run_detection(
metric_type=metric_type,
current_value=current_value,
historical_values=history,
method=method,
z_threshold=z_threshold,
iqr_factor=iqr_factor,
threshold=threshold,
min_samples=5,
)
unit = METRIC_UNITS.get(metric_type, "")
return {
"source": source,
"metric_type": metric_type,
"current_value": current_value,
"unit": unit,
"history_count": len(history),
**result,
}
# ── 이상 이벤트 조회 ──────────────────────────────────────────────────────────
@router.get("/events", response_model=List[AnomalyEventOut])
async def list_anomaly_events(
status: Optional[str] = Query(None), # OPEN|ACKNOWLEDGED|RESOLVED
severity: Optional[str] = Query(None), # INFO|WARNING|CRITICAL
source: Optional[str] = Query(None),
metric_type: Optional[str] = Query(None),
hours: int = Query(72, ge=1, le=720),
limit: int = Query(50, ge=1, le=200),
offset: int = Query(0, ge=0),
db: AsyncSession = Depends(get_db),
):
"""이상 이벤트 목록 조회."""
since = datetime.utcnow() - timedelta(hours=hours)
conditions = [AnomalyEvent.detected_at >= since]
if status:
conditions.append(AnomalyEvent.status == status.upper())
if severity:
conditions.append(AnomalyEvent.severity == severity.upper())
if source:
conditions.append(AnomalyEvent.source.ilike(f"%{source}%"))
if metric_type:
conditions.append(AnomalyEvent.metric_type == metric_type.upper())
q = (
select(AnomalyEvent)
.where(and_(*conditions))
.order_by(desc(AnomalyEvent.detected_at))
.limit(limit)
.offset(offset)
)
rows = (await db.execute(q)).scalars().all()
return rows
@router.get("/events/{event_id}", response_model=AnomalyEventOut)
async def get_anomaly_event(
event_id: int,
db: AsyncSession = Depends(get_db),
):
evt = (await db.execute(select(AnomalyEvent).where(AnomalyEvent.id == event_id))).scalars().first()
if not evt:
raise HTTPException(404, f"이벤트 {event_id}를 찾을 수 없습니다.")
return evt
# ── 이상 이벤트 상태 변경 ─────────────────────────────────────────────────────
@router.patch("/events/{event_id}/acknowledge")
async def acknowledge_event(
event_id: int,
comment: Optional[str] = Query(None),
db: AsyncSession = Depends(get_db),
):
"""이상 이벤트 인지(Acknowledge) 처리."""
evt = (await db.execute(select(AnomalyEvent).where(AnomalyEvent.id == event_id))).scalars().first()
if not evt:
raise HTTPException(404, f"이벤트 {event_id}를 찾을 수 없습니다.")
if evt.status != "OPEN":
raise HTTPException(400, f"OPEN 상태가 아닙니다. 현재: {evt.status}")
evt.status = "ACKNOWLEDGED"
evt.acknowledged_at = datetime.utcnow()
evt.acknowledged_by = "system" # 실제 환경: JWT에서 추출
if comment:
evt.description = (evt.description or "") + f"\n[인지 메모] {comment}"
await db.commit()
return {"event_id": event_id, "status": "ACKNOWLEDGED"}
@router.patch("/events/{event_id}/resolve")
async def resolve_event(
event_id: int,
comment: Optional[str] = Query(None),
db: AsyncSession = Depends(get_db),
):
"""이상 이벤트 해결(Resolve) 처리."""
evt = (await db.execute(select(AnomalyEvent).where(AnomalyEvent.id == event_id))).scalars().first()
if not evt:
raise HTTPException(404, f"이벤트 {event_id}를 찾을 수 없습니다.")
if evt.status == "RESOLVED":
raise HTTPException(400, "이미 해결된 이벤트입니다.")
evt.status = "RESOLVED"
evt.resolved_at = datetime.utcnow()
if comment:
evt.description = (evt.description or "") + f"\n[해결 메모] {comment}"
await db.commit()
return {"event_id": event_id, "status": "RESOLVED"}
@router.patch("/events/{event_id}/false-positive")
async def mark_false_positive(
event_id: int,
db: AsyncSession = Depends(get_db),
):
"""오탐(False Positive) 처리."""
evt = (await db.execute(select(AnomalyEvent).where(AnomalyEvent.id == event_id))).scalars().first()
if not evt:
raise HTTPException(404, f"이벤트 {event_id}를 찾을 수 없습니다.")
evt.status = "FALSE_POSITIVE"
evt.resolved_at = datetime.utcnow()
await db.commit()
return {"event_id": event_id, "status": "FALSE_POSITIVE"}
# ── 탐지 룰 관리 ─────────────────────────────────────────────────────────────
@router.get("/rules", response_model=List[AnomalyRuleOut])
async def list_rules(
db: AsyncSession = Depends(get_db),
):
"""등록된 이상 탐지 룰 목록."""
rows = (await db.execute(select(AnomalyRule).order_by(AnomalyRule.id))).scalars().all()
return rows
@router.post("/rules", response_model=AnomalyRuleOut, status_code=201)
async def create_rule(
body: AnomalyRuleCreate,
db: AsyncSession = Depends(get_db),
):
"""새 이상 탐지 룰 생성."""
existing = (await db.execute(
select(AnomalyRule).where(AnomalyRule.name == body.name)
)).scalars().first()
if existing:
raise HTTPException(409, f"이미 존재하는 룰명: {body.name}")
rule = AnomalyRule(**body.model_dump())
db.add(rule)
await db.commit()
await db.refresh(rule)
return rule
@router.patch("/rules/{rule_id}/toggle")
async def toggle_rule(
rule_id: int,
db: AsyncSession = Depends(get_db),
):
"""룰 활성/비활성 토글."""
rule = (await db.execute(select(AnomalyRule).where(AnomalyRule.id == rule_id))).scalars().first()
if not rule:
raise HTTPException(404, f"{rule_id}를 찾을 수 없습니다.")
rule.is_active = not rule.is_active
await db.commit()
return {"rule_id": rule_id, "is_active": rule.is_active}
@router.delete("/rules/{rule_id}", status_code=204)
async def delete_rule(
rule_id: int,
db: AsyncSession = Depends(get_db),
):
"""룰 삭제."""
rule = (await db.execute(select(AnomalyRule).where(AnomalyRule.id == rule_id))).scalars().first()
if not rule:
raise HTTPException(404, f"{rule_id}를 찾을 수 없습니다.")
await db.delete(rule)
await db.commit()
# ── 요약 통계 ─────────────────────────────────────────────────────────────────
@router.get("/summary")
async def get_anomaly_summary(
hours: int = Query(24, ge=1, le=720),
db: AsyncSession = Depends(get_db),
):
"""이상 탐지 요약 통계."""
since = datetime.utcnow() - timedelta(hours=hours)
# 전체 이벤트 조회
all_events = (await db.execute(
select(AnomalyEvent).where(AnomalyEvent.detected_at >= since)
)).scalars().all()
total = len(all_events)
by_status = {}
by_severity = {}
by_metric = {}
for evt in all_events:
by_status[evt.status] = by_status.get(evt.status, 0) + 1
by_severity[evt.severity] = by_severity.get(evt.severity, 0) + 1
by_metric[evt.metric_type] = by_metric.get(evt.metric_type, 0) + 1
# 최근 7일 일별 추이
daily = {}
for evt in all_events:
day = evt.detected_at.strftime("%Y-%m-%d")
daily[day] = daily.get(day, 0) + 1
# 메트릭 스냅샷 수
snap_count = (await db.execute(
select(func.count()).where(MetricSnapshot.recorded_at >= since)
)).scalar()
return {
"period_hours": hours,
"total_events": total,
"by_status": by_status,
"by_severity": by_severity,
"by_metric_type": by_metric,
"daily_trend": daily,
"metric_snapshots": snap_count or 0,
"open_critical": by_severity.get("CRITICAL", 0) if "OPEN" in by_status else 0,
}
# ── 시뮬레이션 (테스트) ───────────────────────────────────────────────────────
@router.post("/simulate")
async def simulate_anomaly(
body: SimulateMetricIn,
db: AsyncSession = Depends(get_db),
):
"""
이상 탐지 시뮬레이션.
1. 정상 분포 데이터 DB에 저장
2. 이상 값 탐지 실행
3. 탐지 결과 + 이벤트 반환
"""
# 1. 정상 데이터 삽입
normal_values, anomaly_val = generate_simulation_data(
normal_count = body.normal_count,
baseline_mean = body.baseline_mean,
baseline_std = body.baseline_std,
anomaly_value = body.anomaly_value,
)
unit = METRIC_UNITS.get(body.metric_type, "")
for v in normal_values:
db.add(MetricSnapshot(
source = body.source,
metric_type = body.metric_type,
value = v,
unit = unit,
tags = '{"sim": true}',
))
await db.commit()
# 2. 이상 탐지
events = await run_rules_on_metric(
db = db,
source = body.source,
metric_type = body.metric_type,
current_value = anomaly_val,
unit = unit,
include_llm = body.include_llm,
)
# 3. Z-score 미리보기
detect_result = run_detection(
metric_type = body.metric_type,
current_value = anomaly_val,
historical_values= normal_values,
method = "ZSCORE",
min_samples = 5,
)
return {
"simulation": {
"source": body.source,
"metric_type": body.metric_type,
"normal_points": body.normal_count,
"baseline_mean": body.baseline_mean,
"baseline_std": body.baseline_std,
"anomaly_value": anomaly_val,
},
"detection": detect_result,
"events_created": len(events),
"events": events,
}