guardia-itsm/core/anomaly.py
DESKTOP-TKLFCPRython 64c27c3509 feat(itsm): G-1~G-12 확장 기능 + 하네스/봇/설치스크립트 구현
G-1: 메신저 Webhook Relay + _send_to_room 실제 httpx 호출 구현
G-2: POST /api/tasks/bulk SR 대량작업 엔드포인트 (최대 100건)
G-3: 라이선스 만료 알림 스케줄러 (매일 09:00 KST)
G-4: 체험판 upgrade_banner 필드 + license.py 배너 로직
G-5: core/auto_rca.py + incidents/problem auto-rca 엔드포인트
G-6: core/deploy_impact.py + vibe impact-analysis 엔드포인트
G-7: core/ticket_classifier.py + SR 생성 시 AI 분류 + ai-suggestion API
G-8: VulnPatchRecord 모델 + vuln_scan 패치추적 4개 엔드포인트
G-9: core/jira_sync.py + gateway Jira/Confluence 연동 엔드포인트
G-10: core/push_notify.py + routers/push.py + PushSubscription 모델
G-11: approvals 다중승인 (위임/서명/기한초과/마감연장)
G-12: alembic.ini + migrations/ + cicd/migrate_to_postgres.sh

하네스: guardia-orchestrator 확장기능 Phase 반영
봇명령어: /sr /status /license /bulk 슬래시 명령어 추가
설치스크립트: setup/ (Ubuntu, CentOS, RHEL, Windows) --test 옵션 포함

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 18:18:52 +09:00

536 lines
18 KiB
Python

"""
B-1: AI 이상 탐지 (Anomaly Detection) 엔진
탐지 방법:
1. ZSCORE — Z-score 기반 통계적 이상 탐지
2. IQR — 사분위수 범위(IQR) 기반 이상치 탐지
3. THRESHOLD — 정적 임계값 초과
4. TREND — 연속 상승/하강 추세 이탈
Ollama LLM은 탐지 후 자연어 분석 생성에만 사용 (탐지 자체는 순수 통계).
"""
from __future__ import annotations
import json
import logging
import math
import statistics
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import httpx
from sqlalchemy import select, and_
from sqlalchemy.ext.asyncio import AsyncSession
logger = logging.getLogger(__name__)
# ── 설정 ────────────────────────────────────────────────────────────────────
OLLAMA_URL = "http://localhost:11434/api/generate"
DEFAULT_MODEL = "llama3"
# 기본 정적 임계값 (THRESHOLD 방식 폴백)
DEFAULT_THRESHOLDS: Dict[str, Tuple[float, str]] = {
"CPU_USAGE": (90.0, "WARNING"),
"MEMORY_USAGE": (85.0, "WARNING"),
"DISK_USAGE": (90.0, "CRITICAL"),
"RESPONSE_TIME": (3000.0,"WARNING"), # 3초
"ERROR_RATE": (50.0, "WARNING"), # 분당 50건
"HEAP_USAGE": (85.0, "WARNING"),
"ACTIVE_CONNECTIONS": (500.0, "WARNING"),
}
# 단위 맵
METRIC_UNITS: Dict[str, str] = {
"CPU_USAGE": "%",
"MEMORY_USAGE": "%",
"DISK_USAGE": "%",
"RESPONSE_TIME": "ms",
"ERROR_RATE": "건/분",
"NETWORK_RX": "KB/s",
"NETWORK_TX": "KB/s",
"ACTIVE_CONNECTIONS": "",
"HEAP_USAGE": "%",
"THROUGHPUT": "req/s",
}
# ── 핵심 탐지 함수 ────────────────────────────────────────────────────────────
def detect_zscore(
values: List[float],
current: float,
z_threshold: float = 3.0,
min_samples: int = 10,
) -> Tuple[bool, float, float, float]:
"""
Z-score 기반 이상 탐지.
Returns: (is_anomaly, mean, std, z_score)
"""
if len(values) < min_samples:
return False, 0.0, 0.0, 0.0
mean = statistics.mean(values)
try:
std = statistics.stdev(values)
except statistics.StatisticsError:
std = 0.0
if std < 1e-6:
# 표준편차가 0에 가까우면 분산 없음 → 임계값 방식으로 폴백
return False, mean, std, 0.0
z = abs(current - mean) / std
return z > z_threshold, mean, std, z
def detect_iqr(
values: List[float],
current: float,
iqr_factor: float = 1.5,
min_samples: int = 10,
) -> Tuple[bool, float, float]:
"""
IQR 기반 이상 탐지.
Returns: (is_anomaly, lower_fence, upper_fence)
"""
if len(values) < min_samples:
return False, 0.0, 0.0
sorted_vals = sorted(values)
n = len(sorted_vals)
q1_idx = n // 4
q3_idx = 3 * n // 4
q1 = sorted_vals[q1_idx]
q3 = sorted_vals[q3_idx]
iqr = q3 - q1
lower = q1 - iqr_factor * iqr
upper = q3 + iqr_factor * iqr
return current > upper or current < lower, lower, upper
def detect_threshold(
current: float,
threshold: float,
operator: str = "gt",
) -> bool:
"""정적 임계값 비교. operator: gt(>), lt(<), gte(>=), lte(<=)."""
if operator == "gt":
return current > threshold
elif operator == "lt":
return current < threshold
elif operator == "gte":
return current >= threshold
elif operator == "lte":
return current <= threshold
return current > threshold
def detect_trend(
values: List[float],
window: int = 5,
deviation_pct: float = 20.0,
) -> Tuple[bool, str]:
"""
연속 추세 탐지: 최근 window 포인트가 모두 상승 or 하강.
Returns: (is_anomaly, direction) direction: "RISING" | "FALLING" | "STABLE"
"""
if len(values) < window:
return False, "STABLE"
recent = values[-window:]
diffs = [recent[i + 1] - recent[i] for i in range(len(recent) - 1)]
if all(d > 0 for d in diffs):
# 연속 상승 — 변화폭 확인
total_change = abs(recent[-1] - recent[0])
if recent[0] > 0 and (total_change / recent[0]) * 100 >= deviation_pct:
return True, "RISING"
elif all(d < 0 for d in diffs):
# 연속 하강 — throughput / 응답시간 하락 경보
total_change = abs(recent[-1] - recent[0])
if recent[0] > 0 and (total_change / recent[0]) * 100 >= deviation_pct:
return True, "FALLING"
return False, "STABLE"
# ── 룰 기반 탐지 실행 ─────────────────────────────────────────────────────────
def run_detection(
metric_type: str,
current_value: float,
historical_values: List[float],
method: str = "ZSCORE",
z_threshold: float = 3.0,
iqr_factor: float = 1.5,
threshold: Optional[float] = None,
min_samples: int = 10,
) -> Dict:
"""
통합 이상 탐지 실행.
Returns: {
is_anomaly: bool,
method: str,
baseline_mean: float,
baseline_std: float,
z_score: float,
threshold_used: float,
detail: str,
}
"""
result = {
"is_anomaly": False,
"method": method,
"baseline_mean": None,
"baseline_std": None,
"z_score": None,
"threshold_used": None,
"detail": "",
}
if method == "ZSCORE":
is_anom, mean, std, z = detect_zscore(
historical_values, current_value, z_threshold, min_samples
)
result.update({
"is_anomaly": is_anom,
"baseline_mean": round(mean, 4),
"baseline_std": round(std, 4),
"z_score": round(z, 4),
"detail": f"Z-score={z:.2f} (임계={z_threshold}), 평균={mean:.2f}, 표준편차={std:.2f}",
})
elif method == "IQR":
is_anom, lower, upper = detect_iqr(
historical_values, current_value, iqr_factor, min_samples
)
result.update({
"is_anomaly": is_anom,
"threshold_used": upper,
"detail": f"IQR 범위=[{lower:.2f}, {upper:.2f}], 현재={current_value:.2f}",
})
if historical_values and len(historical_values) >= min_samples:
result["baseline_mean"] = round(statistics.mean(historical_values), 4)
try:
result["baseline_std"] = round(statistics.stdev(historical_values), 4)
except Exception:
pass
elif method == "THRESHOLD":
thr = threshold
if thr is None:
# 기본 임계값 참조
thr_info = DEFAULT_THRESHOLDS.get(metric_type)
thr = thr_info[0] if thr_info else 90.0
is_anom = detect_threshold(current_value, thr)
result.update({
"is_anomaly": is_anom,
"threshold_used": thr,
"detail": f"임계값={thr}, 현재={current_value}",
})
elif method == "TREND":
is_anom, direction = detect_trend(historical_values)
result.update({
"is_anomaly": is_anom,
"detail": f"추세={direction}",
})
if historical_values:
result["baseline_mean"] = round(statistics.mean(historical_values), 4)
return result
# ── 이상 이벤트 제목/설명 자동 생성 ──────────────────────────────────────────
_METRIC_KO = {
"CPU_USAGE": "CPU 사용률",
"MEMORY_USAGE": "메모리 사용률",
"DISK_USAGE": "디스크 사용률",
"RESPONSE_TIME": "응답 시간",
"ERROR_RATE": "에러 발생률",
"NETWORK_RX": "네트워크 수신량",
"NETWORK_TX": "네트워크 송신량",
"ACTIVE_CONNECTIONS": "활성 연결 수",
"HEAP_USAGE": "JVM 힙 사용률",
"THROUGHPUT": "처리량",
"CUSTOM": "사용자 정의 메트릭",
}
def build_event_title(source: str, metric_type: str, current_value: float,
unit: str = "") -> str:
metric_ko = _METRIC_KO.get(metric_type, metric_type)
return f"[이상 탐지] {source}{metric_ko} {current_value:.1f}{unit}"
def build_event_description(metric_type: str, detect_result: Dict,
current_value: float, unit: str = "") -> str:
metric_ko = _METRIC_KO.get(metric_type, metric_type)
lines = [
f"메트릭: {metric_ko}",
f"현재값: {current_value:.2f} {unit}",
f"탐지 방법: {detect_result['method']}",
detect_result.get("detail", ""),
]
if detect_result.get("baseline_mean") is not None:
lines.append(f"기준 평균: {detect_result['baseline_mean']:.2f} {unit}")
return "\n".join(l for l in lines if l)
# ── Ollama LLM 분석 (선택적) ──────────────────────────────────────────────────
async def _call_ollama_analysis(
source: str,
metric_type: str,
current_value: float,
baseline_mean: Optional[float],
z_score: Optional[float],
detect_detail: str,
model: str = DEFAULT_MODEL,
timeout: int = 60,
) -> str:
"""Ollama를 통해 이상 탐지 결과에 대한 자연어 분석 생성."""
metric_ko = _METRIC_KO.get(metric_type, metric_type)
unit = METRIC_UNITS.get(metric_type, "")
mean_str = f"{baseline_mean:.2f}{unit}" if baseline_mean is not None else "N/A"
z_str = f"{z_score:.2f}" if z_score is not None else "N/A"
prompt = f"""시스템 이상 탐지 분석 보고서를 작성하라. JSON 형식으로만 응답하라.
서버/소스: {source}
메트릭: {metric_ko} ({metric_type})
현재값: {current_value:.2f}{unit}
기준 평균: {mean_str}
Z-Score: {z_str}
탐지 상세: {detect_detail}
다음 JSON을 반환하라:
{{
"cause": "예상 원인 (1-2 문장)",
"impact": "예상 영향 (1-2 문장)",
"actions": ["즉시 조치 1", "조치 2", "조치 3"],
"urgency": "HIGH|MEDIUM|LOW"
}}"""
try:
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(
OLLAMA_URL,
json={"model": model, "prompt": prompt, "stream": False},
)
if resp.status_code == 200:
raw = resp.json().get("response", "")
# JSON 추출
start = raw.find("{")
end = raw.rfind("}") + 1
if start >= 0 and end > start:
return raw[start:end]
return raw[:500]
except (httpx.ConnectError, httpx.TimeoutException) as e:
logger.debug("Ollama 연결 실패 (이상 탐지 분석): %s", e)
return ""
# ── DB 기반 탐지 실행 ─────────────────────────────────────────────────────────
async def fetch_recent_values(
db: AsyncSession,
source: str,
metric_type: str,
window_size: int = 60,
before_dt: Optional[datetime] = None,
) -> List[float]:
"""최근 window_size 개 스냅샷 값 조회."""
from models import MetricSnapshot
q = (
select(MetricSnapshot.value)
.where(
and_(
MetricSnapshot.source == source,
MetricSnapshot.metric_type == metric_type,
)
)
.order_by(MetricSnapshot.recorded_at.desc())
.limit(window_size)
)
if before_dt:
q = q.where(MetricSnapshot.recorded_at < before_dt)
rows = (await db.execute(q)).scalars().all()
return list(reversed(rows)) # 오래된 순 → 최근 순
async def run_rules_on_metric(
db: AsyncSession,
source: str,
metric_type: str,
current_value: float,
unit: Optional[str] = None,
server_id: Optional[int] = None,
include_llm: bool = False,
) -> List[Dict]:
"""
주어진 메트릭에 대해 활성 룰 전체를 실행하고
이상 탐지 시 AnomalyEvent를 생성해 반환.
"""
from models import AnomalyRule, AnomalyEvent
# 적용 가능한 룰 조회
q = select(AnomalyRule).where(
and_(
AnomalyRule.metric_type == metric_type,
AnomalyRule.is_active == True,
)
)
rules = (await db.execute(q)).scalars().all()
# 룰이 없으면 기본 THRESHOLD 룰 적용
if not rules:
rules = _get_default_rules(metric_type)
events_created = []
for rule in rules:
# source 패턴 필터
if hasattr(rule, "source_pattern") and rule.source_pattern:
import fnmatch
if not fnmatch.fnmatch(source, rule.source_pattern):
continue
# 히스토리 조회
window = getattr(rule, "window_size", 60)
min_s = getattr(rule, "min_samples", 10)
history = await fetch_recent_values(db, source, metric_type, window)
method = getattr(rule, "method", "ZSCORE")
z_thr = getattr(rule, "z_threshold", 3.0)
iqr_factor = getattr(rule, "iqr_factor", 1.5)
threshold = getattr(rule, "threshold", None)
severity = getattr(rule, "severity", "WARNING")
rule_name = getattr(rule, "name", f"DEFAULT_{metric_type}")
detect_result = run_detection(
metric_type=metric_type,
current_value=current_value,
historical_values=history,
method=method,
z_threshold=z_thr,
iqr_factor=iqr_factor,
threshold=threshold,
min_samples=min_s,
)
if not detect_result["is_anomaly"]:
continue
# 중복 방지: 같은 소스+메트릭에 OPEN 이벤트가 있으면 스킵
existing_q = select(AnomalyEvent).where(
and_(
AnomalyEvent.source == source,
AnomalyEvent.metric_type == metric_type,
AnomalyEvent.status == "OPEN",
)
)
existing = (await db.execute(existing_q)).scalars().first()
if existing:
continue
# LLM 분석 (선택적)
llm_text = ""
if include_llm:
try:
llm_text = await _call_ollama_analysis(
source=source,
metric_type=metric_type,
current_value=current_value,
baseline_mean=detect_result.get("baseline_mean"),
z_score=detect_result.get("z_score"),
detect_detail=detect_result.get("detail", ""),
)
except Exception as e:
logger.debug("LLM 분석 오류: %s", e)
unit_str = unit or METRIC_UNITS.get(metric_type, "")
event = AnomalyEvent(
server_id = server_id,
source = source,
metric_type = metric_type,
severity = severity,
status = "OPEN",
title = build_event_title(source, metric_type, current_value, unit_str),
description = build_event_description(metric_type, detect_result, current_value, unit_str),
current_value = current_value,
baseline_mean = detect_result.get("baseline_mean"),
baseline_std = detect_result.get("baseline_std"),
z_score = detect_result.get("z_score"),
threshold = detect_result.get("threshold_used"),
detect_method = method,
llm_analysis = llm_text or None,
rule_name = rule_name,
detected_at = datetime.utcnow(),
)
db.add(event)
await db.flush()
events_created.append({
"id": event.id,
"source": source,
"metric_type": metric_type,
"severity": severity,
"title": event.title,
"current_value": current_value,
"z_score": detect_result.get("z_score"),
"rule_name": rule_name,
})
await db.commit()
return events_created
class _MockRule:
"""DB 룰이 없을 때 사용하는 기본 룰 객체."""
def __init__(self, metric_type: str):
info = DEFAULT_THRESHOLDS.get(metric_type, (90.0, "WARNING"))
self.name = f"DEFAULT_{metric_type}"
self.source_pattern = None
self.metric_type = metric_type
self.method = "THRESHOLD"
self.threshold = info[0]
self.z_threshold = 3.0
self.iqr_factor = 1.5
self.window_size = 60
self.min_samples = 10
self.severity = info[1]
self.is_active = True
def _get_default_rules(metric_type: str) -> List[_MockRule]:
if metric_type in DEFAULT_THRESHOLDS:
return [_MockRule(metric_type)]
return []
# ── 시뮬레이션 데이터 생성 (테스트용) ────────────────────────────────────────
def generate_simulation_data(
normal_count: int = 50,
baseline_mean: float = 40.0,
baseline_std: float = 10.0,
anomaly_value: float = 95.0,
) -> Tuple[List[float], float]:
"""
정상 분포 데이터 + 이상 값 반환.
Returns: (normal_values, anomaly_value)
"""
import random
normal = [
max(0.0, min(100.0, random.gauss(baseline_mean, baseline_std)))
for _ in range(normal_count)
]
return normal, anomaly_value