zioinfo-mail/workspace/guardia-itsm/core/predictive.py

"""
B-6: 예측 유지보수 엔진

기능:
  1. 메트릭 트렌드 분석 (선형 회귀 기반)
  2. 임계값 도달 시점 예측 (TTR: Time-To-Reach)
  3. 예방적 SR 자동 생성
  4. 장비 노후화 분석 (CMDB 수명 기반)
  5. 예측 정확도 피드백 (실제 vs 예측)
"""
from __future__ import annotations

import logging
import math
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple

from sqlalchemy import select, and_, desc, func
from sqlalchemy.ext.asyncio import AsyncSession

logger = logging.getLogger(__name__)


# ── 예측 임계값 기본값 ────────────────────────────────────────────────────────

PREDICTION_THRESHOLDS: Dict[str, Dict] = {
    "CPU_USAGE": {
        "warning":  75.0,
        "critical": 90.0,
        "unit":     "%",
        "horizon_hours": 24,
    },
    "MEMORY_USAGE": {
        "warning":  80.0,
        "critical": 90.0,
        "unit":     "%",
        "horizon_hours": 24,
    },
    "DISK_USAGE": {
        "warning":  75.0,
        "critical": 85.0,
        "unit":     "%",
        "horizon_hours": 72,
    },
    "RESPONSE_TIME": {
        "warning":  3000.0,
        "critical": 5000.0,
        "unit":     "ms",
        "horizon_hours": 12,
    },
}

# 장비 수명 기준 (년)
EQUIPMENT_LIFESPAN: Dict[str, int] = {
    "SERVER":   7,
    "NETWORK":  5,
    "STORAGE":  5,
    "PC":       5,
    "PRINTER":  7,
    "UPS":      5,
    "DEFAULT":  7,
}


# ── 수치 연산 유틸리티 ─────────────────────────────────────────────────────────

def linear_regression(x_vals: List[float], y_vals: List[float]) -> Tuple[float, float, float]:
    """
    최소제곱법 선형 회귀.
    Returns: (slope, intercept, r_squared)
      - slope: 단위 x당 y 변화량
      - intercept: y절편
      - r_squared: 결정계수 (0~1, 1에 가까울수록 선형 트렌드가 강함)
    """
    n = len(x_vals)
    if n < 2:
        return 0.0, (y_vals[0] if y_vals else 0.0), 0.0

    sum_x  = sum(x_vals)
    sum_y  = sum(y_vals)
    sum_xx = sum(x * x for x in x_vals)
    sum_xy = sum(x * y for x, y in zip(x_vals, y_vals))

    denom = n * sum_xx - sum_x ** 2
    if abs(denom) < 1e-10:
        return 0.0, sum_y / n, 0.0

    slope     = (n * sum_xy - sum_x * sum_y) / denom
    intercept = (sum_y - slope * sum_x) / n

    # R² 계산
    y_mean = sum_y / n
    ss_tot = sum((y - y_mean) ** 2 for y in y_vals)
    ss_res = sum((y - (slope * x + intercept)) ** 2 for x, y in zip(x_vals, y_vals))
    r_sq = 1.0 - (ss_res / ss_tot) if abs(ss_tot) > 1e-10 else 0.0
    r_sq = max(0.0, min(1.0, r_sq))

    return slope, intercept, r_sq


def predict_value(slope: float, intercept: float, x: float) -> float:
    """회귀선으로 특정 x에서 y 예측."""
    return slope * x + intercept


def time_to_reach(
    slope: float,
    intercept: float,
    current_x: float,
    target_y: float,
) -> Optional[float]:
    """
    현재 추세로 target_y에 도달하는 x 오프셋(시간) 계산.
    slope가 0이거나 방향이 반대이면 None 반환.
    Returns: 도달까지 걸리는 x단위 시간 (None이면 도달 불가)
    """
    if abs(slope) < 1e-10:
        return None
    x_target = (target_y - intercept) / slope
    delta = x_target - current_x
    if delta <= 0:
        return None  # 이미 초과했거나 감소 추세
    return delta


def moving_average(values: List[float], window: int = 5) -> List[float]:
    """이동 평균 계산."""
    if not values or window <= 0:
        return values
    result = []
    for i in range(len(values)):
        start = max(0, i - window + 1)
        chunk = values[start:i + 1]
        result.append(sum(chunk) / len(chunk))
    return result


def detect_seasonal_pattern(values: List[float], period: int = 24) -> Dict:
    """
    주기성(일별/시간별) 패턴 감지.
    Returns: {"has_pattern": bool, "peak_index": int, "amplitude": float}
    """
    if len(values) < period * 2:
        return {"has_pattern": False, "peak_index": 0, "amplitude": 0.0}

    # 하나의 주기(period) 평균
    cycle = [0.0] * period
    count = [0] * period
    for i, v in enumerate(values):
        idx = i % period
        cycle[idx] += v
        count[idx] += 1

    avg_cycle = [cycle[i] / count[i] if count[i] > 0 else 0.0 for i in range(period)]
    amplitude = max(avg_cycle) - min(avg_cycle)
    overall_mean = sum(values) / len(values) if values else 0.0
    cv = amplitude / overall_mean if overall_mean > 0 else 0.0

    peak_index = avg_cycle.index(max(avg_cycle))

    return {
        "has_pattern": cv > 0.1,   # 변동계수 10% 이상이면 패턴 있음
        "peak_index":  peak_index,
        "amplitude":   round(amplitude, 2),
        "cycle_values": [round(v, 2) for v in avg_cycle],
    }


# ── DB 기반 예측 분석 ──────────────────────────────────────────────────────────

async def fetch_metric_history(
    db: AsyncSession,
    source: str,
    metric_type: str,
    hours_back: int = 72,
    max_points: int = 200,
) -> List[Tuple[datetime, float]]:
    """MetricSnapshot에서 시계열 데이터 조회."""
    try:
        from models import MetricSnapshot
        since = datetime.utcnow() - timedelta(hours=hours_back)
        rows = (await db.execute(
            select(MetricSnapshot)
            .where(
                and_(
                    MetricSnapshot.source == source,
                    MetricSnapshot.metric_type == metric_type,
                    MetricSnapshot.measured_at >= since,
                )
            )
            .order_by(MetricSnapshot.measured_at)
            .limit(max_points)
        )).scalars().all()

        return [(r.measured_at, float(r.value)) for r in rows]
    except Exception as e:
        logger.debug("메트릭 이력 조회 실패: %s", e)
        return []


async def predict_metric_trend(
    db: AsyncSession,
    source: str,
    metric_type: str,
    horizon_hours: int = 24,
    hours_back: int = 72,
) -> Dict:
    """
    메트릭 트렌드 예측.
    Returns: {
      "source", "metric_type",
      "current_value", "predicted_value",
      "slope_per_hour", "r_squared",
      "ttr_warning_hours", "ttr_critical_hours",
      "trend_direction", "confidence",
      "data_points",
    }
    """
    history = await fetch_metric_history(db, source, metric_type, hours_back=hours_back)

    if len(history) < 5:
        return {
            "source":        source,
            "metric_type":   metric_type,
            "current_value": history[-1][1] if history else 0.0,
            "predicted_value": None,
            "slope_per_hour":  0.0,
            "r_squared":       0.0,
            "ttr_warning_hours":  None,
            "ttr_critical_hours": None,
            "trend_direction": "STABLE",
            "confidence":      "LOW",
            "data_points":     len(history),
            "error": "데이터 부족 (최소 5개 필요)",
        }

    # 시간 기준으로 x축 구성 (첫 측정 시각 기준 시간 오프셋)
    t0 = history[0][0]
    x_vals = [(h[0] - t0).total_seconds() / 3600.0 for h in history]
    y_vals = [h[1] for h in history]

    slope, intercept, r_sq = linear_regression(x_vals, y_vals)

    current_x   = x_vals[-1]
    current_val = y_vals[-1]
    predicted   = predict_value(slope, intercept, current_x + horizon_hours)

    # 임계값 도달 시간 계산
    cfg           = PREDICTION_THRESHOLDS.get(metric_type, {})
    warn_thresh   = cfg.get("warning")
    crit_thresh   = cfg.get("critical")

    ttr_warn = None
    ttr_crit = None
    if warn_thresh is not None:
        ttr_warn = time_to_reach(slope, intercept, current_x, warn_thresh)
    if crit_thresh is not None:
        ttr_crit = time_to_reach(slope, intercept, current_x, crit_thresh)

    # 트렌드 방향 결정
    if abs(slope) < 0.05:
        direction = "STABLE"
    elif slope > 0:
        direction = "INCREASING"
    else:
        direction = "DECREASING"

    # 신뢰도 판정
    if r_sq >= 0.8 and len(history) >= 20:
        confidence = "HIGH"
    elif r_sq >= 0.5 and len(history) >= 10:
        confidence = "MEDIUM"
    else:
        confidence = "LOW"

    return {
        "source":              source,
        "metric_type":         metric_type,
        "current_value":       round(current_val, 2),
        "predicted_value":     round(predicted, 2),
        "horizon_hours":       horizon_hours,
        "slope_per_hour":      round(slope, 4),
        "r_squared":           round(r_sq, 4),
        "ttr_warning_hours":   round(ttr_warn, 1) if ttr_warn else None,
        "ttr_critical_hours":  round(ttr_crit, 1) if ttr_crit else None,
        "trend_direction":     direction,
        "confidence":          confidence,
        "data_points":         len(history),
        "warning_threshold":   warn_thresh,
        "critical_threshold":  crit_thresh,
        "unit":                cfg.get("unit", ""),
    }


async def analyze_server_health(
    db: AsyncSession,
    source: str,
    metric_types: Optional[List[str]] = None,
) -> Dict:
    """
    서버 전체 건강도 분석 (여러 메트릭 종합).
    Returns: {"source", "health_score", "risk_level", "metrics": [...], "recommendations": [...]}
    """
    if metric_types is None:
        metric_types = list(PREDICTION_THRESHOLDS.keys())

    metrics_result = []
    risk_scores    = []
    recommendations = []

    for mt in metric_types:
        pred = await predict_metric_trend(db, source, mt)
        metrics_result.append(pred)

        # 위험도 점수 계산 (0~100)
        curr = pred.get("current_value", 0)
        cfg  = PREDICTION_THRESHOLDS.get(mt, {})
        crit = cfg.get("critical", 100)
        warn = cfg.get("warning", 80)

        if crit and curr >= crit:
            risk_scores.append(90)
        elif warn and curr >= warn:
            risk_scores.append(60)
        else:
            risk_scores.append(max(0, (curr / crit * 50) if crit else 0))

        # 예측 기반 권고 생성
        ttr_crit = pred.get("ttr_critical_hours")
        ttr_warn = pred.get("ttr_warning_hours")
        if ttr_crit and ttr_crit < 24:
            recommendations.append({
                "metric":   mt,
                "severity": "CRITICAL",
                "message":  f"{mt} {round(ttr_crit)}시간 내 임계값 초과 예측 — 즉시 조치 필요",
                "ttr_hours": ttr_crit,
            })
        elif ttr_warn and ttr_warn < 48:
            recommendations.append({
                "metric":   mt,
                "severity": "WARNING",
                "message":  f"{mt} {round(ttr_warn)}시간 내 경고 임계값 도달 예측 — 모니터링 강화",
                "ttr_hours": ttr_warn,
            })

    avg_risk = sum(risk_scores) / len(risk_scores) if risk_scores else 0
    health_score = round(100 - avg_risk, 1)

    if health_score >= 80:
        risk_level = "LOW"
    elif health_score >= 60:
        risk_level = "MEDIUM"
    elif health_score >= 40:
        risk_level = "HIGH"
    else:
        risk_level = "CRITICAL"

    return {
        "source":          source,
        "health_score":    health_score,
        "risk_level":      risk_level,
        "metrics":         metrics_result,
        "recommendations": recommendations,
        "analyzed_at":     datetime.utcnow().isoformat(),
    }


# ── 예방적 SR 자동 생성 ────────────────────────────────────────────────────────

async def create_preventive_sr(
    db: AsyncSession,
    source: str,
    metric_type: str,
    ttr_hours: float,
    predicted_value: float,
    threshold: float,
    severity: str = "WARNING",
) -> Optional[Dict]:
    """
    예방적 SR 자동 생성 (예측 기반 선제 대응).
    Returns: {"created": bool, "sr_id": str, "title": str}
    """
    try:
        from models import SRRequest, SRStatus, SRType

        # 같은 소스+메트릭으로 24시간 내 예방 SR이 있으면 중복 생성 방지
        since = datetime.utcnow() - timedelta(hours=24)
        keyword = f"[예방] {source} {metric_type}"
        existing = (await db.execute(
            select(SRRequest).where(
                and_(
                    SRRequest.created_at >= since,
                    SRRequest.title.like(f"%{source}%"),
                    SRRequest.title.like(f"%예방%"),
                    SRRequest.status.notin_(["COMPLETED", "CANCELLED"]),
                )
            )
        )).scalars().first()

        if existing:
            return {"created": False, "reason": "중복 예방 SR 존재", "sr_id": existing.sr_id}

        unit = PREDICTION_THRESHOLDS.get(metric_type, {}).get("unit", "")
        title = (
            f"[예방] {source} {metric_type} {round(ttr_hours)}h 내 "
            f"임계값({threshold}{unit}) 초과 예측"
        )
        desc = (
            f"예측 유지보수 에이전트가 자동 감지:\n"
            f"- 대상 서버: {source}\n"
            f"- 메트릭: {metric_type}\n"
            f"- 현재 예측값: {round(predicted_value, 1)}{unit}\n"
            f"- 임계값: {threshold}{unit}\n"
            f"- 예상 도달 시간: {round(ttr_hours, 1)}시간 후\n"
            f"- 심각도: {severity}\n\n"
            f"권고 조치: 해당 서버의 {metric_type} 원인 분석 후 선제 대응 수행"
        )

        # SR ID 생성 (간단 시퀀스)
        today     = datetime.utcnow().strftime("%Y%m%d")
        prefix    = f"PM-{today}-"
        last_sr = (await db.execute(
            select(SRRequest.sr_id)
            .where(SRRequest.sr_id.like(f"{prefix}%"))
            .order_by(desc(SRRequest.sr_id))
            .limit(1)
        )).scalar()
        seq = 1
        if last_sr:
            try:
                seq = int(last_sr.split("-")[-1]) + 1
            except ValueError:
                seq = 1
        sr_id = f"{prefix}{seq:04d}"

        sr = SRRequest(
            sr_id       = sr_id,
            title       = title,
            description = desc,
            status      = "OPEN",
            priority    = "HIGH" if severity == "CRITICAL" else "MEDIUM",
            sr_type     = "OTHER",
            created_at  = datetime.utcnow(),
        )
        db.add(sr)
        await db.commit()
        await db.refresh(sr)

        logger.info("예방 SR 생성: %s (%s %s TTR=%.1fh)", sr_id, source, metric_type, ttr_hours)
        return {
            "created": True,
            "sr_id":   sr_id,
            "title":   title,
        }

    except Exception as e:
        logger.error("예방 SR 생성 실패: %s", e)
        return {"created": False, "reason": str(e)[:100]}


# ── 장비 노후화 분석 ──────────────────────────────────────────────────────────

def calculate_equipment_age(install_date: datetime) -> float:
    """장비 사용 연수 계산."""
    delta = datetime.utcnow() - install_date
    return delta.days / 365.25


def assess_equipment_lifecycle(
    equipment_type: str,
    install_date: datetime,
    last_maintenance: Optional[datetime] = None,
) -> Dict:
    """
    장비 수명 주기 평가.
    Returns: {"age_years", "lifespan_years", "usage_pct", "status", "months_to_eol", "recommendation"}
    """
    age_years     = calculate_equipment_age(install_date)
    lifespan      = EQUIPMENT_LIFESPAN.get(equipment_type.upper(), EQUIPMENT_LIFESPAN["DEFAULT"])
    usage_pct     = min(100.0, round(age_years / lifespan * 100, 1))
    months_to_eol = round((lifespan - age_years) * 12, 1)

    if usage_pct >= 100:
        status = "EOL"          # End of Life
        recommendation = "즉시 교체 계획 수립 필요 (수명 초과)"
    elif usage_pct >= 85:
        status = "CRITICAL"
        recommendation = f"교체 준비 필요 (약 {max(0, round(months_to_eol))}개월 후 수명 종료)"
    elif usage_pct >= 70:
        status = "WARNING"
        recommendation = f"교체 예산 편성 검토 권장 (약 {round(months_to_eol)}개월 후)"
    else:
        status = "HEALTHY"
        recommendation = "정상 운영 가능"

    # 마지막 유지보수 경과일
    days_since_maint = None
    if last_maintenance:
        days_since_maint = (datetime.utcnow() - last_maintenance).days

    return {
        "age_years":        round(age_years, 2),
        "lifespan_years":   lifespan,
        "usage_pct":        usage_pct,
        "status":           status,
        "months_to_eol":    months_to_eol,
        "recommendation":   recommendation,
        "days_since_maintenance": days_since_maint,
    }


async def run_lifecycle_analysis(
    db: AsyncSession,
    equipment_type_filter: Optional[str] = None,
    max_items: int = 100,
) -> Dict:
    """
    CMDB 서버 목록을 기반으로 수명 주기 분석 실행.
    Returns: {"analyzed": int, "critical": int, "warning": int, "eol": int, "items": [...]}
    """
    try:
        from models import Server

        q = select(Server)
        if equipment_type_filter:
            q = q.where(Server.server_type == equipment_type_filter.upper())
        q = q.limit(max_items)

        servers = (await db.execute(q)).scalars().all()
        items   = []
        counts  = {"EOL": 0, "CRITICAL": 0, "WARNING": 0, "HEALTHY": 0}

        for srv in servers:
            install_date = getattr(srv, "install_date", None) or (datetime.utcnow() - timedelta(days=365 * 3))
            srv_type     = getattr(srv, "server_type", "SERVER") or "SERVER"

            assessment = assess_equipment_lifecycle(
                equipment_type   = srv_type,
                install_date     = install_date,
                last_maintenance = getattr(srv, "last_maintenance_at", None),
            )
            assessment["server_id"]   = srv.id
            assessment["server_name"] = getattr(srv, "hostname", str(srv.id))
            assessment["server_type"] = srv_type

            items.append(assessment)
            counts[assessment["status"]] = counts.get(assessment["status"], 0) + 1

        # 위험도 순 정렬
        order = {"EOL": 0, "CRITICAL": 1, "WARNING": 2, "HEALTHY": 3}
        items.sort(key=lambda x: (order.get(x["status"], 4), -x["usage_pct"]))

        return {
            "analyzed":  len(items),
            "eol":       counts["EOL"],
            "critical":  counts["CRITICAL"],
            "warning":   counts["WARNING"],
            "healthy":   counts["HEALTHY"],
            "items":     items,
            "analyzed_at": datetime.utcnow().isoformat(),
        }

    except Exception as e:
        logger.error("수명 주기 분석 오류: %s", e)
        return {
            "analyzed": 0,
            "eol": 0, "critical": 0, "warning": 0, "healthy": 0,
            "items": [],
            "error": str(e)[:100],
        }


# ── 예측 배치 실행 ────────────────────────────────────────────────────────────

async def run_predictive_batch(
    db: AsyncSession,
    auto_create_sr: bool = True,
    ttr_threshold_hours: float = 48.0,
    max_sources: int = 50,
) -> Dict:
    """
    모든 활성 서버에 대해 예측 유지보수 배치 실행.
    Returns: {"analyzed": int, "alerts": int, "srs_created": int, "results": [...]}
    """
    try:
        from models import MetricSnapshot

        # 최근 24시간 내 메트릭이 있는 소스 목록 조회
        since = datetime.utcnow() - timedelta(hours=24)
        sources_rows = (await db.execute(
            select(MetricSnapshot.source)
            .where(MetricSnapshot.measured_at >= since)
            .distinct()
            .limit(max_sources)
        )).scalars().all()
        sources = list(sources_rows)

    except Exception:
        sources = []

    results    = []
    alerts     = 0
    srs_created = 0

    for source in sources:
        for mt in PREDICTION_THRESHOLDS.keys():
            try:
                pred = await predict_metric_trend(db, source, mt)
                ttr_crit = pred.get("ttr_critical_hours")
                ttr_warn = pred.get("ttr_warning_hours")

                alert_level = None
                ttr_used    = None

                if ttr_crit and ttr_crit <= ttr_threshold_hours:
                    alert_level = "CRITICAL"
                    ttr_used    = ttr_crit
                    alerts += 1
                elif ttr_warn and ttr_warn <= ttr_threshold_hours * 2:
                    alert_level = "WARNING"
                    ttr_used    = ttr_warn
                    alerts += 1

                if alert_level and auto_create_sr and ttr_used:
                    sr_result = await create_preventive_sr(
                        db             = db,
                        source         = source,
                        metric_type    = mt,
                        ttr_hours      = ttr_used,
                        predicted_value = pred.get("predicted_value", 0),
                        threshold      = (
                            PREDICTION_THRESHOLDS[mt]["critical"]
                            if alert_level == "CRITICAL"
                            else PREDICTION_THRESHOLDS[mt]["warning"]
                        ),
                        severity       = alert_level,
                    )
                    if sr_result and sr_result.get("created"):
                        srs_created += 1
                    pred["preventive_sr"] = sr_result

                if alert_level:
                    pred["alert_level"] = alert_level
                    results.append(pred)

            except Exception as e:
                logger.debug("예측 배치 오류 (%s/%s): %s", source, mt, e)

    return {
        "analyzed":    len(sources) * len(PREDICTION_THRESHOLDS),
        "sources":     len(sources),
        "alerts":      alerts,
        "srs_created": srs_created,
        "results":     results,
        "run_at":      datetime.utcnow().isoformat(),
    }