guardia-itsm/core/anomaly.py

"""
B-1: AI 이상 탐지 (Anomaly Detection) 엔진

탐지 방법:
  1. ZSCORE   — Z-score 기반 통계적 이상 탐지
  2. IQR      — 사분위수 범위(IQR) 기반 이상치 탐지
  3. THRESHOLD — 정적 임계값 초과
  4. TREND    — 연속 상승/하강 추세 이탈

Ollama LLM은 탐지 후 자연어 분석 생성에만 사용 (탐지 자체는 순수 통계).
"""
from __future__ import annotations

import json
import logging
import math
import statistics
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import httpx
from sqlalchemy import select, and_
from sqlalchemy.ext.asyncio import AsyncSession

logger = logging.getLogger(__name__)

# ── 설정 ────────────────────────────────────────────────────────────────────

OLLAMA_URL   = "http://localhost:11434/api/generate"
DEFAULT_MODEL = "llama3"

# 기본 정적 임계값 (THRESHOLD 방식 폴백)
DEFAULT_THRESHOLDS: Dict[str, Tuple[float, str]] = {
    "CPU_USAGE":          (90.0,  "WARNING"),
    "MEMORY_USAGE":       (85.0,  "WARNING"),
    "DISK_USAGE":         (90.0,  "CRITICAL"),
    "RESPONSE_TIME":      (3000.0,"WARNING"),    # 3초
    "ERROR_RATE":         (50.0,  "WARNING"),    # 분당 50건
    "HEAP_USAGE":         (85.0,  "WARNING"),
    "ACTIVE_CONNECTIONS": (500.0, "WARNING"),
}

# 단위 맵
METRIC_UNITS: Dict[str, str] = {
    "CPU_USAGE":          "%",
    "MEMORY_USAGE":       "%",
    "DISK_USAGE":         "%",
    "RESPONSE_TIME":      "ms",
    "ERROR_RATE":         "건/분",
    "NETWORK_RX":         "KB/s",
    "NETWORK_TX":         "KB/s",
    "ACTIVE_CONNECTIONS": "개",
    "HEAP_USAGE":         "%",
    "THROUGHPUT":         "req/s",
}


# ── 핵심 탐지 함수 ────────────────────────────────────────────────────────────

def detect_zscore(
    values: List[float],
    current: float,
    z_threshold: float = 3.0,
    min_samples: int = 10,
) -> Tuple[bool, float, float, float]:
    """
    Z-score 기반 이상 탐지.
    Returns: (is_anomaly, mean, std, z_score)
    """
    if len(values) < min_samples:
        return False, 0.0, 0.0, 0.0

    mean = statistics.mean(values)
    try:
        std = statistics.stdev(values)
    except statistics.StatisticsError:
        std = 0.0

    if std < 1e-6:
        # 표준편차가 0에 가까우면 분산 없음 → 임계값 방식으로 폴백
        return False, mean, std, 0.0

    z = abs(current - mean) / std
    return z > z_threshold, mean, std, z


def detect_iqr(
    values: List[float],
    current: float,
    iqr_factor: float = 1.5,
    min_samples: int = 10,
) -> Tuple[bool, float, float]:
    """
    IQR 기반 이상 탐지.
    Returns: (is_anomaly, lower_fence, upper_fence)
    """
    if len(values) < min_samples:
        return False, 0.0, 0.0

    sorted_vals = sorted(values)
    n = len(sorted_vals)
    q1_idx = n // 4
    q3_idx = 3 * n // 4
    q1 = sorted_vals[q1_idx]
    q3 = sorted_vals[q3_idx]
    iqr = q3 - q1

    lower = q1 - iqr_factor * iqr
    upper = q3 + iqr_factor * iqr
    return current > upper or current < lower, lower, upper


def detect_threshold(
    current: float,
    threshold: float,
    operator: str = "gt",
) -> bool:
    """정적 임계값 비교. operator: gt(>), lt(<), gte(>=), lte(<=)."""
    if operator == "gt":
        return current > threshold
    elif operator == "lt":
        return current < threshold
    elif operator == "gte":
        return current >= threshold
    elif operator == "lte":
        return current <= threshold
    return current > threshold


def detect_trend(
    values: List[float],
    window: int = 5,
    deviation_pct: float = 20.0,
) -> Tuple[bool, str]:
    """
    연속 추세 탐지: 최근 window 포인트가 모두 상승 or 하강.
    Returns: (is_anomaly, direction)  direction: "RISING" | "FALLING" | "STABLE"
    """
    if len(values) < window:
        return False, "STABLE"

    recent = values[-window:]
    diffs = [recent[i + 1] - recent[i] for i in range(len(recent) - 1)]

    if all(d > 0 for d in diffs):
        # 연속 상승 — 변화폭 확인
        total_change = abs(recent[-1] - recent[0])
        if recent[0] > 0 and (total_change / recent[0]) * 100 >= deviation_pct:
            return True, "RISING"
    elif all(d < 0 for d in diffs):
        # 연속 하강 — throughput / 응답시간 하락 경보
        total_change = abs(recent[-1] - recent[0])
        if recent[0] > 0 and (total_change / recent[0]) * 100 >= deviation_pct:
            return True, "FALLING"

    return False, "STABLE"


# ── 룰 기반 탐지 실행 ─────────────────────────────────────────────────────────

def run_detection(
    metric_type: str,
    current_value: float,
    historical_values: List[float],
    method: str = "ZSCORE",
    z_threshold: float = 3.0,
    iqr_factor: float = 1.5,
    threshold: Optional[float] = None,
    min_samples: int = 10,
) -> Dict:
    """
    통합 이상 탐지 실행.
    Returns: {
        is_anomaly: bool,
        method: str,
        baseline_mean: float,
        baseline_std: float,
        z_score: float,
        threshold_used: float,
        detail: str,
    }
    """
    result = {
        "is_anomaly": False,
        "method": method,
        "baseline_mean": None,
        "baseline_std": None,
        "z_score": None,
        "threshold_used": None,
        "detail": "",
    }

    if method == "ZSCORE":
        is_anom, mean, std, z = detect_zscore(
            historical_values, current_value, z_threshold, min_samples
        )
        result.update({
            "is_anomaly": is_anom,
            "baseline_mean": round(mean, 4),
            "baseline_std": round(std, 4),
            "z_score": round(z, 4),
            "detail": f"Z-score={z:.2f} (임계={z_threshold}), 평균={mean:.2f}, 표준편차={std:.2f}",
        })

    elif method == "IQR":
        is_anom, lower, upper = detect_iqr(
            historical_values, current_value, iqr_factor, min_samples
        )
        result.update({
            "is_anomaly": is_anom,
            "threshold_used": upper,
            "detail": f"IQR 범위=[{lower:.2f}, {upper:.2f}], 현재={current_value:.2f}",
        })
        if historical_values and len(historical_values) >= min_samples:
            result["baseline_mean"] = round(statistics.mean(historical_values), 4)
            try:
                result["baseline_std"] = round(statistics.stdev(historical_values), 4)
            except Exception:
                pass

    elif method == "THRESHOLD":
        thr = threshold
        if thr is None:
            # 기본 임계값 참조
            thr_info = DEFAULT_THRESHOLDS.get(metric_type)
            thr = thr_info[0] if thr_info else 90.0
        is_anom = detect_threshold(current_value, thr)
        result.update({
            "is_anomaly": is_anom,
            "threshold_used": thr,
            "detail": f"임계값={thr}, 현재={current_value}",
        })

    elif method == "TREND":
        is_anom, direction = detect_trend(historical_values)
        result.update({
            "is_anomaly": is_anom,
            "detail": f"추세={direction}",
        })
        if historical_values:
            result["baseline_mean"] = round(statistics.mean(historical_values), 4)

    return result


# ── 이상 이벤트 제목/설명 자동 생성 ──────────────────────────────────────────

_METRIC_KO = {
    "CPU_USAGE":          "CPU 사용률",
    "MEMORY_USAGE":       "메모리 사용률",
    "DISK_USAGE":         "디스크 사용률",
    "RESPONSE_TIME":      "응답 시간",
    "ERROR_RATE":         "에러 발생률",
    "NETWORK_RX":         "네트워크 수신량",
    "NETWORK_TX":         "네트워크 송신량",
    "ACTIVE_CONNECTIONS": "활성 연결 수",
    "HEAP_USAGE":         "JVM 힙 사용률",
    "THROUGHPUT":         "처리량",
    "CUSTOM":             "사용자 정의 메트릭",
}


def build_event_title(source: str, metric_type: str, current_value: float,
                      unit: str = "") -> str:
    metric_ko = _METRIC_KO.get(metric_type, metric_type)
    return f"[이상 탐지] {source} — {metric_ko} {current_value:.1f}{unit}"


def build_event_description(metric_type: str, detect_result: Dict,
                             current_value: float, unit: str = "") -> str:
    metric_ko = _METRIC_KO.get(metric_type, metric_type)
    lines = [
        f"메트릭: {metric_ko}",
        f"현재값: {current_value:.2f} {unit}",
        f"탐지 방법: {detect_result['method']}",
        detect_result.get("detail", ""),
    ]
    if detect_result.get("baseline_mean") is not None:
        lines.append(f"기준 평균: {detect_result['baseline_mean']:.2f} {unit}")
    return "\n".join(l for l in lines if l)


# ── Ollama LLM 분석 (선택적) ──────────────────────────────────────────────────

async def _call_ollama_analysis(
    source: str,
    metric_type: str,
    current_value: float,
    baseline_mean: Optional[float],
    z_score: Optional[float],
    detect_detail: str,
    model: str = DEFAULT_MODEL,
    timeout: int = 60,
) -> str:
    """Ollama를 통해 이상 탐지 결과에 대한 자연어 분석 생성."""
    metric_ko = _METRIC_KO.get(metric_type, metric_type)
    unit = METRIC_UNITS.get(metric_type, "")
    mean_str = f"{baseline_mean:.2f}{unit}" if baseline_mean is not None else "N/A"
    z_str = f"{z_score:.2f}" if z_score is not None else "N/A"

    prompt = f"""시스템 이상 탐지 분석 보고서를 작성하라. JSON 형식으로만 응답하라.

서버/소스: {source}
메트릭: {metric_ko} ({metric_type})
현재값: {current_value:.2f}{unit}
기준 평균: {mean_str}
Z-Score: {z_str}
탐지 상세: {detect_detail}

다음 JSON을 반환하라:
{{
  "cause": "예상 원인 (1-2 문장)",
  "impact": "예상 영향 (1-2 문장)",
  "actions": ["즉시 조치 1", "조치 2", "조치 3"],
  "urgency": "HIGH|MEDIUM|LOW"
}}"""

    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(
                OLLAMA_URL,
                json={"model": model, "prompt": prompt, "stream": False},
            )
        if resp.status_code == 200:
            raw = resp.json().get("response", "")
            # JSON 추출
            start = raw.find("{")
            end = raw.rfind("}") + 1
            if start >= 0 and end > start:
                return raw[start:end]
            return raw[:500]
    except (httpx.ConnectError, httpx.TimeoutException) as e:
        logger.debug("Ollama 연결 실패 (이상 탐지 분석): %s", e)

    return ""


# ── DB 기반 탐지 실행 ─────────────────────────────────────────────────────────

async def fetch_recent_values(
    db: AsyncSession,
    source: str,
    metric_type: str,
    window_size: int = 60,
    before_dt: Optional[datetime] = None,
) -> List[float]:
    """최근 window_size 개 스냅샷 값 조회."""
    from models import MetricSnapshot

    q = (
        select(MetricSnapshot.value)
        .where(
            and_(
                MetricSnapshot.source == source,
                MetricSnapshot.metric_type == metric_type,
            )
        )
        .order_by(MetricSnapshot.recorded_at.desc())
        .limit(window_size)
    )
    if before_dt:
        q = q.where(MetricSnapshot.recorded_at < before_dt)

    rows = (await db.execute(q)).scalars().all()
    return list(reversed(rows))   # 오래된 순 → 최근 순


async def run_rules_on_metric(
    db: AsyncSession,
    source: str,
    metric_type: str,
    current_value: float,
    unit: Optional[str] = None,
    server_id: Optional[int] = None,
    include_llm: bool = False,
) -> List[Dict]:
    """
    주어진 메트릭에 대해 활성 룰 전체를 실행하고
    이상 탐지 시 AnomalyEvent를 생성해 반환.
    """
    from models import AnomalyRule, AnomalyEvent

    # 적용 가능한 룰 조회
    q = select(AnomalyRule).where(
        and_(
            AnomalyRule.metric_type == metric_type,
            AnomalyRule.is_active == True,
        )
    )
    rules = (await db.execute(q)).scalars().all()

    # 룰이 없으면 기본 THRESHOLD 룰 적용
    if not rules:
        rules = _get_default_rules(metric_type)

    events_created = []

    for rule in rules:
        # source 패턴 필터
        if hasattr(rule, "source_pattern") and rule.source_pattern:
            import fnmatch
            if not fnmatch.fnmatch(source, rule.source_pattern):
                continue

        # 히스토리 조회
        window = getattr(rule, "window_size", 60)
        min_s  = getattr(rule, "min_samples", 10)
        history = await fetch_recent_values(db, source, metric_type, window)

        method     = getattr(rule, "method", "ZSCORE")
        z_thr      = getattr(rule, "z_threshold", 3.0)
        iqr_factor = getattr(rule, "iqr_factor", 1.5)
        threshold  = getattr(rule, "threshold", None)
        severity   = getattr(rule, "severity", "WARNING")
        rule_name  = getattr(rule, "name", f"DEFAULT_{metric_type}")

        detect_result = run_detection(
            metric_type=metric_type,
            current_value=current_value,
            historical_values=history,
            method=method,
            z_threshold=z_thr,
            iqr_factor=iqr_factor,
            threshold=threshold,
            min_samples=min_s,
        )

        if not detect_result["is_anomaly"]:
            continue

        # 중복 방지: 같은 소스+메트릭에 OPEN 이벤트가 있으면 스킵
        existing_q = select(AnomalyEvent).where(
            and_(
                AnomalyEvent.source == source,
                AnomalyEvent.metric_type == metric_type,
                AnomalyEvent.status == "OPEN",
            )
        )
        existing = (await db.execute(existing_q)).scalars().first()
        if existing:
            continue

        # LLM 분석 (선택적)
        llm_text = ""
        if include_llm:
            try:
                llm_text = await _call_ollama_analysis(
                    source=source,
                    metric_type=metric_type,
                    current_value=current_value,
                    baseline_mean=detect_result.get("baseline_mean"),
                    z_score=detect_result.get("z_score"),
                    detect_detail=detect_result.get("detail", ""),
                )
            except Exception as e:
                logger.debug("LLM 분석 오류: %s", e)

        unit_str = unit or METRIC_UNITS.get(metric_type, "")
        event = AnomalyEvent(
            server_id       = server_id,
            source          = source,
            metric_type     = metric_type,
            severity        = severity,
            status          = "OPEN",
            title           = build_event_title(source, metric_type, current_value, unit_str),
            description     = build_event_description(metric_type, detect_result, current_value, unit_str),
            current_value   = current_value,
            baseline_mean   = detect_result.get("baseline_mean"),
            baseline_std    = detect_result.get("baseline_std"),
            z_score         = detect_result.get("z_score"),
            threshold       = detect_result.get("threshold_used"),
            detect_method   = method,
            llm_analysis    = llm_text or None,
            rule_name       = rule_name,
            detected_at     = datetime.utcnow(),
        )
        db.add(event)
        await db.flush()
        events_created.append({
            "id": event.id,
            "source": source,
            "metric_type": metric_type,
            "severity": severity,
            "title": event.title,
            "current_value": current_value,
            "z_score": detect_result.get("z_score"),
            "rule_name": rule_name,
        })

    await db.commit()
    return events_created


class _MockRule:
    """DB 룰이 없을 때 사용하는 기본 룰 객체."""
    def __init__(self, metric_type: str):
        info = DEFAULT_THRESHOLDS.get(metric_type, (90.0, "WARNING"))
        self.name           = f"DEFAULT_{metric_type}"
        self.source_pattern = None
        self.metric_type    = metric_type
        self.method         = "THRESHOLD"
        self.threshold      = info[0]
        self.z_threshold    = 3.0
        self.iqr_factor     = 1.5
        self.window_size    = 60
        self.min_samples    = 10
        self.severity       = info[1]
        self.is_active      = True


def _get_default_rules(metric_type: str) -> List[_MockRule]:
    if metric_type in DEFAULT_THRESHOLDS:
        return [_MockRule(metric_type)]
    return []


# ── 시뮬레이션 데이터 생성 (테스트용) ────────────────────────────────────────

def generate_simulation_data(
    normal_count: int = 50,
    baseline_mean: float = 40.0,
    baseline_std: float = 10.0,
    anomaly_value: float = 95.0,
) -> Tuple[List[float], float]:
    """
    정상 분포 데이터 + 이상 값 반환.
    Returns: (normal_values, anomaly_value)
    """
    import random
    normal = [
        max(0.0, min(100.0, random.gauss(baseline_mean, baseline_std)))
        for _ in range(normal_count)
    ]
    return normal, anomaly_value