guardia-itsm/routers/predictive_ops.py
2026-06-02 06:07:36 +09:00

316 lines
11 KiB
Python

"""
예측 운영 분석 — 기존 predictive.py 고도화
기존 predictive.py(B-6)에서 단순 통계 예측을 넘어
Ollama LLM 기반 인사이트 + 시계열 이동평균으로 고도화.
예측 항목:
1. SLA 위반 확률 (7일 후)
2. SR 급증 예측 (부하 급상승 감지)
3. 서버 장애 예측 (메트릭 트렌드 기반)
4. 용량 소진 예측 (디스크/메모리)
엔드포인트:
GET /api/predict/sla-breach — SLA 위반 예측
GET /api/predict/sr-surge — SR 급증 예측
GET /api/predict/server-failure/{id} — 서버 장애 예측
GET /api/predict/capacity — 전체 용량 예측
GET /api/predict/summary — 예측 요약 (대시보드용)
POST /api/predict/insight — Ollama AI 인사이트 생성
"""
from __future__ import annotations
import logging
import statistics
from datetime import date, datetime, timedelta
from typing import List, Optional
import httpx
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import select, func, and_, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from database import get_db
from models import User, SRRequest, SRStatus, Server
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router for the predictive-ops endpoints; every route is mounted under /api/predict.
router = APIRouter(prefix="/api/predict", tags=["Predictive Ops"])
# Base URL of the Ollama server that _ollama_insight posts to.
# NOTE(review): hard-coded to localhost — confirm whether deployments need this configurable.
OLLAMA_URL = "http://localhost:11434"
# Model name sent with every Ollama generate request and echoed by POST /insight.
CHAT_MODEL = "llama3"
# ── 통계 유틸 ────────────────────────────────────────────────────────────────
def _moving_average(data: list[float], window: int = 3) -> list[float]:
"""단순 이동평균."""
if len(data) < window:
return data
result = []
for i in range(len(data)):
start = max(0, i - window + 1)
result.append(sum(data[start:i + 1]) / (i - start + 1))
return result
def _linear_forecast(data: list[float], horizon: int = 7) -> list[float]:
"""선형 회귀 기반 예측 (최근 30일 데이터 사용)."""
n = len(data)
if n < 2:
return [data[-1] if data else 0.0] * horizon
# 최소 제곱법
x_mean = (n - 1) / 2
y_mean = sum(data) / n
numerator = sum((i - x_mean) * (y - y_mean) for i, y in enumerate(data))
denominator = sum((i - x_mean) ** 2 for i in range(n))
slope = numerator / denominator if denominator else 0
intercept = y_mean - slope * x_mean
return [max(0.0, intercept + slope * (n + i)) for i in range(horizon)]
def _breach_probability(historical_rates: list[float], target: float) -> float:
"""SLA 위반 확률 추정 (미래 예측값 기반)."""
if not historical_rates:
return 0.0
forecast = _linear_forecast(historical_rates, 7)
breaches = sum(1 for v in forecast if v < target)
return round(breaches / len(forecast), 2)
async def _ollama_insight(prompt: str) -> str:
    """Ask the Ollama server for a short insight text.

    Posts *prompt* to the ``/api/generate`` endpoint with a fixed system
    instruction and streaming disabled, using a 30-second client timeout.
    Returns the stripped ``response`` field on HTTP 200. Any other status
    code, or any exception (logged as a warning), results in an empty
    string, so callers never see an error from this helper.
    """
    payload = {
        "model": CHAT_MODEL,
        "system": "GUARDiA ITSM 운영 분석 전문가. 한국어로 간결하게 3문장 이내로 답변.",
        "prompt": prompt,
        "stream": False,
    }
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            reply = await client.post(f"{OLLAMA_URL}/api/generate", json=payload)
            if reply.status_code != 200:
                return ""
            return reply.json().get("response", "").strip()
    except Exception as e:
        # Best-effort: insight text is optional, so failures are only logged.
        logger.warning(f"Ollama 인사이트 실패: {e}")
    return ""
# ── 엔드포인트 ───────────────────────────────────────────────────────────────
@router.get("/sla-breach")
async def predict_sla_breach(
    horizon_days: int = Query(7, ge=1, le=30),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Forecast SLA breaches from the recent SLA compliance trend.

    For each of the last 30 days, the compliance rate is the percentage of
    SRs in status DONE (bucketed by the date of ``updated_at``) whose
    ``updated_at - created_at`` is at most 14400 seconds (4 hours). Days with
    no DONE SR are skipped, so the series is indexed by observation rather
    than by calendar day. The series is smoothed with a 7-point moving
    average and extrapolated linearly.

    Args:
        horizon_days: Number of forecast points to return (1-30).
        db: Async database session.
        user: Authenticated user (required for access; not otherwise used).

    Returns:
        ``{"status": "NO_DATA", "message": ...}`` when no day has data;
        otherwise a dict with ``current_rate``, ``target``,
        ``breach_probability_7d``, ``forecast``, ``status``
        (CRITICAL / WARNING / NORMAL), ``insight`` and ``horizon_days``.
        ``breach_probability_7d`` is always computed over a 7-step horizon,
        independent of *horizon_days*.
    """
    sla_target = 95.0        # compliance target, in percent
    on_time_seconds = 14400  # 4 hours between created_at and updated_at

    # NOTE(review): this issues up to 60 sequential queries (two per day);
    # a single GROUP BY query over the 30-day window would be cheaper.
    today = date.today()
    daily_rates = []
    for offset in range(29, -1, -1):
        day = today - timedelta(days=offset)
        done_on_day = (
            func.date(SRRequest.updated_at) == day,
            SRRequest.status == SRStatus.DONE,
        )
        total_result = await db.execute(
            select(func.count(SRRequest.id)).where(*done_on_day)
        )
        total = total_result.scalar() or 0
        if total == 0:
            # No completed SR that day: leave the day out of the series.
            continue
        on_time_result = await db.execute(
            select(func.count(SRRequest.id)).where(
                *done_on_day,
                func.extract('epoch', SRRequest.updated_at - SRRequest.created_at) <= on_time_seconds,
            )
        )
        on_time = on_time_result.scalar() or 0
        daily_rates.append(on_time / total * 100)

    if not daily_rates:
        return {"status": "NO_DATA", "message": "충분한 데이터 없음"}

    smoothed = _moving_average(daily_rates, 7)
    # A compliance rate is a percentage: the linear extrapolation is already
    # floored at 0, but an upward trend must also be capped at 100.
    forecast = [min(100.0, v) for v in _linear_forecast(smoothed, horizon_days)]
    breach_prob = _breach_probability(smoothed, sla_target)

    # Only ask the LLM once there are enough points to describe a 7-step trend.
    insight = ""
    if len(smoothed) >= 7:
        trend_desc = "하락" if smoothed[-1] < smoothed[-7] else "상승"
        insight = await _ollama_insight(
            f"SLA 준수율이 최근 7일 {trend_desc} 추세. 현재 {smoothed[-1]:.1f}%. "
            f"7일 후 위반 확률 {breach_prob*100:.0f}%. 원인과 조치 방안을 제시하세요."
        )

    if breach_prob > 0.5:
        status = "CRITICAL"
    elif breach_prob > 0.2:
        status = "WARNING"
    else:
        status = "NORMAL"

    return {
        "current_rate": round(smoothed[-1], 1),
        "target": sla_target,
        "breach_probability_7d": breach_prob,
        "forecast": [round(v, 1) for v in forecast],
        "status": status,
        "insight": insight,
        "horizon_days": horizon_days,
    }
@router.get("/sr-surge")
async def predict_sr_surge(
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Detect an SR surge by comparing today's count with the prior 7-day mean.

    Counts SRs created on each of the last 14 days (today included), takes
    the mean of the seven days immediately before today as the baseline, and
    classifies today's count as SURGE (ratio > 2.0), HIGH (ratio > 1.5) or
    NORMAL. A 7-step linear forecast is fitted on all 14 daily counts. An
    Ollama insight is requested only for SURGE or HIGH.

    Args:
        db: Async database session.
        user: Authenticated user (required for access; not otherwise used).

    Returns:
        Dict with ``today_count``, ``avg_7d``, ``surge_ratio``, ``status``,
        ``forecast_7d``, ``daily_trend`` (last seven daily counts) and
        ``insight``.
    """
    today = date.today()

    # Daily SR counts for the last 14 days, oldest first; always 14 entries.
    daily_counts = []
    for offset in range(13, -1, -1):
        day = today - timedelta(days=offset)
        result = await db.execute(
            select(func.count(SRRequest.id)).where(func.date(SRRequest.created_at) == day)
        )
        daily_counts.append(result.scalar() or 0)

    # Baseline: the seven days before today (today itself is excluded).
    avg_7d = sum(daily_counts[-8:-1]) / 7
    today_count = daily_counts[-1]
    # NOTE(review): with a zero baseline the ratio is reported as 0, so any
    # activity after a completely quiet week is classified NORMAL — confirm
    # this is the intended behavior.
    surge_ratio = today_count / avg_7d if avg_7d > 0 else 0

    if surge_ratio > 2.0:
        status = "SURGE"
    elif surge_ratio > 1.5:
        status = "HIGH"
    else:
        status = "NORMAL"

    forecast_7d = _linear_forecast(daily_counts, 7)

    insight = ""
    if status in ("SURGE", "HIGH"):
        insight = await _ollama_insight(
            f"오늘 SR 접수 {today_count}건으로 7일 평균 {avg_7d:.0f}건 대비 {surge_ratio:.1f}배. "
            "급증 원인 분석 및 대응 방안 3줄로 제시."
        )

    return {
        "today_count": today_count,
        "avg_7d": round(avg_7d, 1),
        "surge_ratio": round(surge_ratio, 2),
        "status": status,
        "forecast_7d": [round(v, 0) for v in forecast_7d],
        "daily_trend": daily_counts[-7:],
        "insight": insight,
    }
@router.get("/server-failure/{server_id}")
async def predict_server_failure(
    server_id: int,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Estimate a server's failure risk from its recent SR history.

    Uses only SR counts: the 30-day total and the last seven daily counts
    for SRs whose ``target_server`` contains the server id. The heuristic is
    ``0.1 * trend_ratio + 0.05 * (sr_count / 30)``, plus 0.3 when
    ``trend_ratio`` exceeds 1.5, clamped to [0, 0.95]. ``trend_ratio`` is the
    mean of the last 3 days over the mean of the 4 days before them (1.0 when
    the older mean is zero).

    Args:
        server_id: Primary key of the server to evaluate.
        db: Async database session.
        user: Authenticated user (required for access; not otherwise used).

    Returns:
        Dict with ``server_id``, ``server_name``, ``failure_probability_7d``,
        ``status`` (HIGH_RISK / MEDIUM_RISK / LOW_RISK), ``sr_count_30d``,
        ``daily_sr_7d``, ``trend_ratio`` and ``insight``.

    Raises:
        HTTPException: 404 when no server has the given id.
    """
    server_result = await db.execute(select(Server).where(Server.id == server_id))
    server = server_result.scalar_one_or_none()
    if not server:
        raise HTTPException(404, "서버를 찾을 수 없습니다")

    # NOTE(review): this is a substring match, so id 1 also matches values
    # containing "10", "21", etc. — confirm the format of target_server and
    # tighten the filter if it stores more than a bare id.
    targets_server = SRRequest.target_server.contains(str(server_id))

    today = date.today()
    window_start = today - timedelta(days=30)

    # Total SRs referencing this server over the last 30 days.
    total_result = await db.execute(
        select(func.count(SRRequest.id)).where(
            targets_server,
            SRRequest.created_at >= window_start,
        )
    )
    sr_count = total_result.scalar() or 0

    # Daily SR counts for the last 7 days, oldest first.
    daily_sr = []
    for offset in range(6, -1, -1):
        day = today - timedelta(days=offset)
        day_result = await db.execute(
            select(func.count(SRRequest.id)).where(
                targets_server,
                func.date(SRRequest.created_at) == day,
            )
        )
        daily_sr.append(day_result.scalar() or 0)

    # Trend: last 3 days versus the 4 days before them.
    recent_avg = sum(daily_sr[-3:]) / 3
    old_avg = sum(daily_sr[:4]) / 4
    trend_ratio = recent_avg / old_avg if old_avg > 0 else 1.0

    failure_prob = min(0.95, max(0.0,
        0.1 * trend_ratio +
        0.05 * (sr_count / 30) +
        (0.3 if trend_ratio > 1.5 else 0)
    ))

    if failure_prob > 0.5:
        status = "HIGH_RISK"
    elif failure_prob > 0.2:
        status = "MEDIUM_RISK"
    else:
        status = "LOW_RISK"

    insight = ""
    if status != "LOW_RISK":
        # Describe the actual trend: risk can be elevated by 30-day volume
        # alone, so the prompt must not always claim a rising trend.
        if trend_ratio > 1.0:
            trend_desc = "상승"
        elif trend_ratio < 1.0:
            trend_desc = "하락"
        else:
            trend_desc = "보합"
        insight = await _ollama_insight(
            f"서버 SR 30일 총 {sr_count}건, 최근 3일 평균 {recent_avg:.1f}건으로 {trend_desc} 추세. "
            f"장애 위험도 {failure_prob*100:.0f}%. 예방 조치 방안 제시."
        )

    return {
        "server_id": server_id,
        "server_name": getattr(server, 'hostname', str(server_id)),
        "failure_probability_7d": round(failure_prob, 2),
        "status": status,
        "sr_count_30d": sr_count,
        "daily_sr_7d": daily_sr,
        "trend_ratio": round(trend_ratio, 2),
        "insight": insight,
    }
@router.get("/summary")
async def prediction_summary(
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Build the prediction summary used by dashboard cards.

    Runs the SLA breach prediction (7-point horizon) and the SR surge
    prediction one after the other on the same session, then derives alert
    entries: an SLA alert when the breach probability exceeds 0.3, and an
    SR_SURGE alert when the surge status is SURGE or HIGH.

    Args:
        db: Async database session, shared by both underlying predictions.
        user: Authenticated user, forwarded to both predictions.

    Returns:
        Dict with ``alerts``, ``sla_status``, ``sla_breach_prob``,
        ``surge_status``, ``surge_ratio`` and a timezone-aware UTC
        ``updated_at`` timestamp.
    """
    # Local import keeps this block self-contained; the module-level datetime
    # import does not bring in ``timezone``.
    from datetime import timezone

    # Called directly as plain coroutines, so horizon_days is passed
    # explicitly rather than relying on its Query(...) default.
    sla = await predict_sla_breach(7, db, user)
    surge = await predict_sr_surge(db, user)

    alerts = []
    if sla.get("breach_probability_7d", 0) > 0.3:
        alerts.append({
            "type": "SLA",
            "severity": "HIGH",
            "message": f"SLA 위반 예측 {sla['breach_probability_7d']*100:.0f}% 가능성",
        })
    if surge.get("status") in ("SURGE", "HIGH"):
        alerts.append({
            "type": "SR_SURGE",
            "severity": "MEDIUM",
            "message": f"SR 급증: 평균 대비 {surge['surge_ratio']:.1f}",
        })

    return {
        "alerts": alerts,
        "sla_status": sla.get("status", "NO_DATA"),
        "sla_breach_prob": sla.get("breach_probability_7d", 0),
        "surge_status": surge.get("status", "NO_DATA"),
        "surge_ratio": surge.get("surge_ratio", 1.0),
        # datetime.utcnow() is deprecated and returns a naive value that
        # clients may interpret as local time; emit an aware UTC timestamp.
        "updated_at": datetime.now(timezone.utc),
    }
class InsightRequest(BaseModel):
    """Request body for POST /api/predict/insight."""

    # Free-text description of the operational situation; inserted into the prompt.
    context: str
    # Question to ask about that context; appended to the prompt after the context.
    question: str
@router.post("/insight")
async def generate_insight(
    req: InsightRequest,
    user: User = Depends(get_current_user),
):
    """Generate an operational insight with Ollama.

    Combines the request's context and question into a single prompt and
    returns the generated text together with the model name. The insight is
    an empty string when the Ollama call does not succeed.
    """
    return {
        "insight": await _ollama_insight(
            f"운영 현황: {req.context}\n\n질문: {req.question}"
        ),
        "model": CHAT_MODEL,
    }