guardia-itsm/routers/predictive.py

"""
B-6: 예측 유지보수 API 라우터

엔드포인트:
  POST /api/predictive/analyze/{source}    — 단일 서버 예측 분석
  POST /api/predictive/batch               — 전체 서버 배치 예측
  GET  /api/predictive/health/{source}     — 서버 종합 건강도
  GET  /api/predictive/lifecycle           — 장비 수명 주기 분석
  GET  /api/predictive/lifecycle/{source}  — 단일 장비 수명 평가
  GET  /api/predictive/thresholds          — 예측 임계값 조회
  PUT  /api/predictive/thresholds/{metric} — 예측 임계값 수정
  GET  /api/predictive/stats               — 예측 통계
"""
from __future__ import annotations

import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional

from fastapi import APIRouter, Depends, HTTPException, Query, Body
from pydantic import BaseModel
from sqlalchemy import select, desc, func
from sqlalchemy.ext.asyncio import AsyncSession

from database import get_db
from core.predictive import (
    predict_metric_trend,
    analyze_server_health,
    run_predictive_batch,
    run_lifecycle_analysis,
    assess_equipment_lifecycle,
    PREDICTION_THRESHOLDS,
    EQUIPMENT_LIFESPAN,
    linear_regression,
    moving_average,
)

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/predictive", tags=["predictive"])


# ── Pydantic 스키마 ───────────────────────────────────────────────────────────

class ThresholdUpdateRequest(BaseModel):
    warning:  Optional[float] = None
    critical: Optional[float] = None
    horizon_hours: Optional[int] = None


class BatchRunRequest(BaseModel):
    auto_create_sr: bool = True
    ttr_threshold_hours: float = 48.0
    max_sources: int = 50


# ── 단일 서버 예측 ────────────────────────────────────────────────────────────

@router.post("/analyze/{source}")
async def analyze_source(
    source: str,
    metric_type: str = Query("CPU_USAGE"),
    horizon_hours: int = Query(24, ge=1, le=720),
    hours_back: int = Query(72, ge=6, le=720),
    db: AsyncSession = Depends(get_db),
):
    """
    단일 서버/소스에 대한 메트릭 트렌드 예측.
    - 선형 회귀로 향후 horizon_hours 후 값 예측
    - 임계값 도달 시간(TTR) 계산
    """
    result = await predict_metric_trend(
        db, source, metric_type.upper(),
        horizon_hours=horizon_hours,
        hours_back=hours_back,
    )
    return result


@router.get("/health/{source}")
async def server_health(
    source: str,
    metric_types: Optional[str] = Query(None, description="쉼표 구분 메트릭 목록"),
    db: AsyncSession = Depends(get_db),
):
    """
    서버 종합 건강도 분석 (여러 메트릭 종합 점수).
    health_score: 0~100 (높을수록 양호)
    risk_level: LOW / MEDIUM / HIGH / CRITICAL
    """
    mt_list = None
    if metric_types:
        mt_list = [m.strip().upper() for m in metric_types.split(",") if m.strip()]

    result = await analyze_server_health(db, source, mt_list)
    return result


# ── 배치 예측 ────────────────────────────────────────────────────────────────

@router.post("/batch")
async def run_batch(
    body: BatchRunRequest = Body(...),
    db: AsyncSession = Depends(get_db),
):
    """
    모든 활성 서버 예측 배치 실행.
    - 최근 24시간 내 메트릭이 있는 소스 자동 감지
    - TTR < threshold_hours 이면 경고 발생
    - auto_create_sr=true이면 예방 SR 자동 생성
    """
    result = await run_predictive_batch(
        db,
        auto_create_sr      = body.auto_create_sr,
        ttr_threshold_hours = body.ttr_threshold_hours,
        max_sources         = body.max_sources,
    )
    return result


# ── 수명 주기 분석 ────────────────────────────────────────────────────────────

@router.get("/lifecycle")
async def lifecycle_overview(
    equipment_type: Optional[str] = Query(None),
    max_items: int = Query(100, ge=1, le=500),
    db: AsyncSession = Depends(get_db),
):
    """
    CMDB 전체 장비 수명 주기 분석.
    EOL / CRITICAL / WARNING / HEALTHY 분류 및 교체 권고.
    """
    result = await run_lifecycle_analysis(db, equipment_type, max_items)
    return result


@router.get("/lifecycle/{source}")
async def lifecycle_single(
    source: str,
    install_date: Optional[str] = Query(None, description="설치일 (YYYY-MM-DD)"),
    equipment_type: str = Query("SERVER"),
    db: AsyncSession = Depends(get_db),
):
    """단일 장비 수명 주기 평가."""
    if install_date:
        try:
            install_dt = datetime.strptime(install_date, "%Y-%m-%d")
        except ValueError:
            raise HTTPException(400, "install_date 형식 오류 (YYYY-MM-DD)")
    else:
        # DB에서 서버 정보 조회 시도
        try:
            from models import Server
            srv = (await db.execute(
                select(Server).where(Server.hostname == source)
            )).scalars().first()
            if srv and hasattr(srv, "install_date") and srv.install_date:
                install_dt = srv.install_date
                equipment_type = getattr(srv, "server_type", equipment_type) or equipment_type
            else:
                # 기본값: 3년 전
                install_dt = datetime.utcnow() - timedelta(days=365 * 3)
        except Exception:
            install_dt = datetime.utcnow() - timedelta(days=365 * 3)

    result = assess_equipment_lifecycle(
        equipment_type   = equipment_type,
        install_date     = install_dt,
    )
    result["source"] = source
    return result


# ── 임계값 관리 ───────────────────────────────────────────────────────────────

@router.get("/thresholds")
async def get_thresholds():
    """예측 임계값 조회 (런타임 설정)."""
    return {
        "thresholds":        PREDICTION_THRESHOLDS,
        "equipment_lifespan": EQUIPMENT_LIFESPAN,
    }


@router.put("/thresholds/{metric_type}")
async def update_threshold(
    metric_type: str,
    body: ThresholdUpdateRequest,
):
    """예측 임계값 런타임 수정 (재시작 시 초기화)."""
    mt = metric_type.upper()
    if mt not in PREDICTION_THRESHOLDS:
        raise HTTPException(404, f"알 수 없는 메트릭: {mt}")

    if body.warning is not None:
        PREDICTION_THRESHOLDS[mt]["warning"] = body.warning
    if body.critical is not None:
        PREDICTION_THRESHOLDS[mt]["critical"] = body.critical
    if body.horizon_hours is not None:
        PREDICTION_THRESHOLDS[mt]["horizon_hours"] = body.horizon_hours

    return {
        "metric_type": mt,
        "updated":     PREDICTION_THRESHOLDS[mt],
    }


# ── 통계 ─────────────────────────────────────────────────────────────────────

@router.get("/stats")
async def get_predictive_stats(
    hours: int = Query(24, ge=1, le=720),
    db: AsyncSession = Depends(get_db),
):
    """예측 유지보수 통계 (데이터 소스 수, 메트릭 포인트 수 등)."""
    try:
        from models import MetricSnapshot, SRRequest

        since = datetime.utcnow() - timedelta(hours=hours)

        # 메트릭 소스 수
        sources_count = (await db.execute(
            select(func.count(func.distinct(MetricSnapshot.source)))
            .where(MetricSnapshot.measured_at >= since)
        )).scalar() or 0

        # 메트릭 포인트 수
        points_count = (await db.execute(
            select(func.count())
            .select_from(MetricSnapshot)
            .where(MetricSnapshot.measured_at >= since)
        )).scalar() or 0

        # 예방 SR 수
        pm_sr_count = (await db.execute(
            select(func.count())
            .select_from(SRRequest)
            .where(
                SRRequest.title.like("%[예방]%"),
                SRRequest.created_at >= since,
            )
        )).scalar() or 0

    except Exception as e:
        logger.debug("예측 통계 조회 오류: %s", e)
        sources_count = 0
        points_count  = 0
        pm_sr_count   = 0

    return {
        "period_hours":           hours,
        "monitored_sources":      sources_count,
        "metric_data_points":     points_count,
        "preventive_srs_created": pm_sr_count,
        "supported_metrics":      list(PREDICTION_THRESHOLDS.keys()),
        "supported_equipment_types": list(EQUIPMENT_LIFESPAN.keys()),
    }


# ── 시뮬레이션 / 테스트 ────────────────────────────────────────────────────────

@router.post("/simulate")
async def simulate_prediction(
    metric_type: str = Query("CPU_USAGE"),
    baseline: float = Query(60.0, description="기준선 값"),
    slope_per_hour: float = Query(0.5, description="시간당 증가율"),
    data_points: int = Query(48, ge=5, le=200),
    horizon_hours: int = Query(24, ge=1, le=168),
):
    """
    예측 로직 시뮬레이션 (실제 DB 없이 테스트용).
    지정된 기준선과 기울기로 가상 데이터를 생성하고 예측 결과 반환.
    """
    import random
    random.seed(42)

    # 가상 시계열 생성
    x_vals = list(range(data_points))
    y_vals = [
        baseline + slope_per_hour * x + random.gauss(0, 2)
        for x in x_vals
    ]

    slope, intercept, r_sq = linear_regression(
        [float(x) for x in x_vals],
        y_vals,
    )
    predicted = slope * (data_points - 1 + horizon_hours) + intercept

    cfg       = PREDICTION_THRESHOLDS.get(metric_type.upper(), {})
    warn_th   = cfg.get("warning")
    crit_th   = cfg.get("critical")

    from core.predictive import time_to_reach
    current_x = float(data_points - 1)
    ttr_warn  = time_to_reach(slope, intercept, current_x, warn_th) if warn_th else None
    ttr_crit  = time_to_reach(slope, intercept, current_x, crit_th) if crit_th else None

    ma = moving_average(y_vals, 5)

    return {
        "simulation":        True,
        "metric_type":       metric_type.upper(),
        "baseline":          baseline,
        "input_slope":       slope_per_hour,
        "detected_slope":    round(slope, 4),
        "r_squared":         round(r_sq, 4),
        "current_value":     round(y_vals[-1], 2),
        "predicted_value":   round(predicted, 2),
        "horizon_hours":     horizon_hours,
        "ttr_warning_hours": round(ttr_warn, 1) if ttr_warn else None,
        "ttr_critical_hours": round(ttr_crit, 1) if ttr_crit else None,
        "moving_avg_last5":  [round(v, 2) for v in ma[-5:]],
        "data_points":       data_points,
    }