zioinfo-mail/workspace/guardia-itsm/core/predictive.py
DESKTOP-TKLFCPR\ython cfe2901a55 refactor(structure): consolidate all projects under workspace/
- itsm/    -> workspace/guardia-itsm/
- manager/ -> workspace/guardia-manager/
- app/     -> workspace/guardia-messenger/
- manual/  -> workspace/guardia-docs/

workspace/zioinfo-web/ unchanged.
git mv preserves full commit history.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-31 23:50:56 +09:00

657 lines
22 KiB
Python

"""
B-6: 예측 유지보수 엔진
기능:
1. 메트릭 트렌드 분석 (선형 회귀 기반)
2. 임계값 도달 시점 예측 (TTR: Time-To-Reach)
3. 예방적 SR 자동 생성
4. 장비 노후화 분석 (CMDB 수명 기반)
5. 예측 정확도 피드백 (실제 vs 예측)
"""
from __future__ import annotations
import logging
import math
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple
from sqlalchemy import select, and_, desc, func
from sqlalchemy.ext.asyncio import AsyncSession
logger = logging.getLogger(__name__)
# ── 예측 임계값 기본값 ────────────────────────────────────────────────────────
PREDICTION_THRESHOLDS: Dict[str, Dict] = {
"CPU_USAGE": {
"warning": 75.0,
"critical": 90.0,
"unit": "%",
"horizon_hours": 24,
},
"MEMORY_USAGE": {
"warning": 80.0,
"critical": 90.0,
"unit": "%",
"horizon_hours": 24,
},
"DISK_USAGE": {
"warning": 75.0,
"critical": 85.0,
"unit": "%",
"horizon_hours": 72,
},
"RESPONSE_TIME": {
"warning": 3000.0,
"critical": 5000.0,
"unit": "ms",
"horizon_hours": 12,
},
}
# 장비 수명 기준 (년)
EQUIPMENT_LIFESPAN: Dict[str, int] = {
"SERVER": 7,
"NETWORK": 5,
"STORAGE": 5,
"PC": 5,
"PRINTER": 7,
"UPS": 5,
"DEFAULT": 7,
}
# ── 수치 연산 유틸리티 ─────────────────────────────────────────────────────────
def linear_regression(x_vals: List[float], y_vals: List[float]) -> Tuple[float, float, float]:
"""
최소제곱법 선형 회귀.
Returns: (slope, intercept, r_squared)
- slope: 단위 x당 y 변화량
- intercept: y절편
- r_squared: 결정계수 (0~1, 1에 가까울수록 선형 트렌드가 강함)
"""
n = len(x_vals)
if n < 2:
return 0.0, (y_vals[0] if y_vals else 0.0), 0.0
sum_x = sum(x_vals)
sum_y = sum(y_vals)
sum_xx = sum(x * x for x in x_vals)
sum_xy = sum(x * y for x, y in zip(x_vals, y_vals))
denom = n * sum_xx - sum_x ** 2
if abs(denom) < 1e-10:
return 0.0, sum_y / n, 0.0
slope = (n * sum_xy - sum_x * sum_y) / denom
intercept = (sum_y - slope * sum_x) / n
# R² 계산
y_mean = sum_y / n
ss_tot = sum((y - y_mean) ** 2 for y in y_vals)
ss_res = sum((y - (slope * x + intercept)) ** 2 for x, y in zip(x_vals, y_vals))
r_sq = 1.0 - (ss_res / ss_tot) if abs(ss_tot) > 1e-10 else 0.0
r_sq = max(0.0, min(1.0, r_sq))
return slope, intercept, r_sq
def predict_value(slope: float, intercept: float, x: float) -> float:
"""회귀선으로 특정 x에서 y 예측."""
return slope * x + intercept
def time_to_reach(
slope: float,
intercept: float,
current_x: float,
target_y: float,
) -> Optional[float]:
"""
현재 추세로 target_y에 도달하는 x 오프셋(시간) 계산.
slope가 0이거나 방향이 반대이면 None 반환.
Returns: 도달까지 걸리는 x단위 시간 (None이면 도달 불가)
"""
if abs(slope) < 1e-10:
return None
x_target = (target_y - intercept) / slope
delta = x_target - current_x
if delta <= 0:
return None # 이미 초과했거나 감소 추세
return delta
def moving_average(values: List[float], window: int = 5) -> List[float]:
"""이동 평균 계산."""
if not values or window <= 0:
return values
result = []
for i in range(len(values)):
start = max(0, i - window + 1)
chunk = values[start:i + 1]
result.append(sum(chunk) / len(chunk))
return result
def detect_seasonal_pattern(values: List[float], period: int = 24) -> Dict:
"""
주기성(일별/시간별) 패턴 감지.
Returns: {"has_pattern": bool, "peak_index": int, "amplitude": float}
"""
if len(values) < period * 2:
return {"has_pattern": False, "peak_index": 0, "amplitude": 0.0}
# 하나의 주기(period) 평균
cycle = [0.0] * period
count = [0] * period
for i, v in enumerate(values):
idx = i % period
cycle[idx] += v
count[idx] += 1
avg_cycle = [cycle[i] / count[i] if count[i] > 0 else 0.0 for i in range(period)]
amplitude = max(avg_cycle) - min(avg_cycle)
overall_mean = sum(values) / len(values) if values else 0.0
cv = amplitude / overall_mean if overall_mean > 0 else 0.0
peak_index = avg_cycle.index(max(avg_cycle))
return {
"has_pattern": cv > 0.1, # 변동계수 10% 이상이면 패턴 있음
"peak_index": peak_index,
"amplitude": round(amplitude, 2),
"cycle_values": [round(v, 2) for v in avg_cycle],
}
# ── DB 기반 예측 분석 ──────────────────────────────────────────────────────────
async def fetch_metric_history(
db: AsyncSession,
source: str,
metric_type: str,
hours_back: int = 72,
max_points: int = 200,
) -> List[Tuple[datetime, float]]:
"""MetricSnapshot에서 시계열 데이터 조회."""
try:
from models import MetricSnapshot
since = datetime.utcnow() - timedelta(hours=hours_back)
rows = (await db.execute(
select(MetricSnapshot)
.where(
and_(
MetricSnapshot.source == source,
MetricSnapshot.metric_type == metric_type,
MetricSnapshot.measured_at >= since,
)
)
.order_by(MetricSnapshot.measured_at)
.limit(max_points)
)).scalars().all()
return [(r.measured_at, float(r.value)) for r in rows]
except Exception as e:
logger.debug("메트릭 이력 조회 실패: %s", e)
return []
async def predict_metric_trend(
db: AsyncSession,
source: str,
metric_type: str,
horizon_hours: int = 24,
hours_back: int = 72,
) -> Dict:
"""
메트릭 트렌드 예측.
Returns: {
"source", "metric_type",
"current_value", "predicted_value",
"slope_per_hour", "r_squared",
"ttr_warning_hours", "ttr_critical_hours",
"trend_direction", "confidence",
"data_points",
}
"""
history = await fetch_metric_history(db, source, metric_type, hours_back=hours_back)
if len(history) < 5:
return {
"source": source,
"metric_type": metric_type,
"current_value": history[-1][1] if history else 0.0,
"predicted_value": None,
"slope_per_hour": 0.0,
"r_squared": 0.0,
"ttr_warning_hours": None,
"ttr_critical_hours": None,
"trend_direction": "STABLE",
"confidence": "LOW",
"data_points": len(history),
"error": "데이터 부족 (최소 5개 필요)",
}
# 시간 기준으로 x축 구성 (첫 측정 시각 기준 시간 오프셋)
t0 = history[0][0]
x_vals = [(h[0] - t0).total_seconds() / 3600.0 for h in history]
y_vals = [h[1] for h in history]
slope, intercept, r_sq = linear_regression(x_vals, y_vals)
current_x = x_vals[-1]
current_val = y_vals[-1]
predicted = predict_value(slope, intercept, current_x + horizon_hours)
# 임계값 도달 시간 계산
cfg = PREDICTION_THRESHOLDS.get(metric_type, {})
warn_thresh = cfg.get("warning")
crit_thresh = cfg.get("critical")
ttr_warn = None
ttr_crit = None
if warn_thresh is not None:
ttr_warn = time_to_reach(slope, intercept, current_x, warn_thresh)
if crit_thresh is not None:
ttr_crit = time_to_reach(slope, intercept, current_x, crit_thresh)
# 트렌드 방향 결정
if abs(slope) < 0.05:
direction = "STABLE"
elif slope > 0:
direction = "INCREASING"
else:
direction = "DECREASING"
# 신뢰도 판정
if r_sq >= 0.8 and len(history) >= 20:
confidence = "HIGH"
elif r_sq >= 0.5 and len(history) >= 10:
confidence = "MEDIUM"
else:
confidence = "LOW"
return {
"source": source,
"metric_type": metric_type,
"current_value": round(current_val, 2),
"predicted_value": round(predicted, 2),
"horizon_hours": horizon_hours,
"slope_per_hour": round(slope, 4),
"r_squared": round(r_sq, 4),
"ttr_warning_hours": round(ttr_warn, 1) if ttr_warn else None,
"ttr_critical_hours": round(ttr_crit, 1) if ttr_crit else None,
"trend_direction": direction,
"confidence": confidence,
"data_points": len(history),
"warning_threshold": warn_thresh,
"critical_threshold": crit_thresh,
"unit": cfg.get("unit", ""),
}
async def analyze_server_health(
db: AsyncSession,
source: str,
metric_types: Optional[List[str]] = None,
) -> Dict:
"""
서버 전체 건강도 분석 (여러 메트릭 종합).
Returns: {"source", "health_score", "risk_level", "metrics": [...], "recommendations": [...]}
"""
if metric_types is None:
metric_types = list(PREDICTION_THRESHOLDS.keys())
metrics_result = []
risk_scores = []
recommendations = []
for mt in metric_types:
pred = await predict_metric_trend(db, source, mt)
metrics_result.append(pred)
# 위험도 점수 계산 (0~100)
curr = pred.get("current_value", 0)
cfg = PREDICTION_THRESHOLDS.get(mt, {})
crit = cfg.get("critical", 100)
warn = cfg.get("warning", 80)
if crit and curr >= crit:
risk_scores.append(90)
elif warn and curr >= warn:
risk_scores.append(60)
else:
risk_scores.append(max(0, (curr / crit * 50) if crit else 0))
# 예측 기반 권고 생성
ttr_crit = pred.get("ttr_critical_hours")
ttr_warn = pred.get("ttr_warning_hours")
if ttr_crit and ttr_crit < 24:
recommendations.append({
"metric": mt,
"severity": "CRITICAL",
"message": f"{mt} {round(ttr_crit)}시간 내 임계값 초과 예측 — 즉시 조치 필요",
"ttr_hours": ttr_crit,
})
elif ttr_warn and ttr_warn < 48:
recommendations.append({
"metric": mt,
"severity": "WARNING",
"message": f"{mt} {round(ttr_warn)}시간 내 경고 임계값 도달 예측 — 모니터링 강화",
"ttr_hours": ttr_warn,
})
avg_risk = sum(risk_scores) / len(risk_scores) if risk_scores else 0
health_score = round(100 - avg_risk, 1)
if health_score >= 80:
risk_level = "LOW"
elif health_score >= 60:
risk_level = "MEDIUM"
elif health_score >= 40:
risk_level = "HIGH"
else:
risk_level = "CRITICAL"
return {
"source": source,
"health_score": health_score,
"risk_level": risk_level,
"metrics": metrics_result,
"recommendations": recommendations,
"analyzed_at": datetime.utcnow().isoformat(),
}
# ── 예방적 SR 자동 생성 ────────────────────────────────────────────────────────
async def create_preventive_sr(
db: AsyncSession,
source: str,
metric_type: str,
ttr_hours: float,
predicted_value: float,
threshold: float,
severity: str = "WARNING",
) -> Optional[Dict]:
"""
예방적 SR 자동 생성 (예측 기반 선제 대응).
Returns: {"created": bool, "sr_id": str, "title": str}
"""
try:
from models import SRRequest, SRStatus, SRType
# 같은 소스+메트릭으로 24시간 내 예방 SR이 있으면 중복 생성 방지
since = datetime.utcnow() - timedelta(hours=24)
keyword = f"[예방] {source} {metric_type}"
existing = (await db.execute(
select(SRRequest).where(
and_(
SRRequest.created_at >= since,
SRRequest.title.like(f"%{source}%"),
SRRequest.title.like(f"%예방%"),
SRRequest.status.notin_(["COMPLETED", "CANCELLED"]),
)
)
)).scalars().first()
if existing:
return {"created": False, "reason": "중복 예방 SR 존재", "sr_id": existing.sr_id}
unit = PREDICTION_THRESHOLDS.get(metric_type, {}).get("unit", "")
title = (
f"[예방] {source} {metric_type} {round(ttr_hours)}h 내 "
f"임계값({threshold}{unit}) 초과 예측"
)
desc = (
f"예측 유지보수 에이전트가 자동 감지:\n"
f"- 대상 서버: {source}\n"
f"- 메트릭: {metric_type}\n"
f"- 현재 예측값: {round(predicted_value, 1)}{unit}\n"
f"- 임계값: {threshold}{unit}\n"
f"- 예상 도달 시간: {round(ttr_hours, 1)}시간 후\n"
f"- 심각도: {severity}\n\n"
f"권고 조치: 해당 서버의 {metric_type} 원인 분석 후 선제 대응 수행"
)
# SR ID 생성 (간단 시퀀스)
today = datetime.utcnow().strftime("%Y%m%d")
prefix = f"PM-{today}-"
last_sr = (await db.execute(
select(SRRequest.sr_id)
.where(SRRequest.sr_id.like(f"{prefix}%"))
.order_by(desc(SRRequest.sr_id))
.limit(1)
)).scalar()
seq = 1
if last_sr:
try:
seq = int(last_sr.split("-")[-1]) + 1
except ValueError:
seq = 1
sr_id = f"{prefix}{seq:04d}"
sr = SRRequest(
sr_id = sr_id,
title = title,
description = desc,
status = "OPEN",
priority = "HIGH" if severity == "CRITICAL" else "MEDIUM",
sr_type = "OTHER",
created_at = datetime.utcnow(),
)
db.add(sr)
await db.commit()
await db.refresh(sr)
logger.info("예방 SR 생성: %s (%s %s TTR=%.1fh)", sr_id, source, metric_type, ttr_hours)
return {
"created": True,
"sr_id": sr_id,
"title": title,
}
except Exception as e:
logger.error("예방 SR 생성 실패: %s", e)
return {"created": False, "reason": str(e)[:100]}
# ── 장비 노후화 분석 ──────────────────────────────────────────────────────────
def calculate_equipment_age(install_date: datetime) -> float:
"""장비 사용 연수 계산."""
delta = datetime.utcnow() - install_date
return delta.days / 365.25
def assess_equipment_lifecycle(
equipment_type: str,
install_date: datetime,
last_maintenance: Optional[datetime] = None,
) -> Dict:
"""
장비 수명 주기 평가.
Returns: {"age_years", "lifespan_years", "usage_pct", "status", "months_to_eol", "recommendation"}
"""
age_years = calculate_equipment_age(install_date)
lifespan = EQUIPMENT_LIFESPAN.get(equipment_type.upper(), EQUIPMENT_LIFESPAN["DEFAULT"])
usage_pct = min(100.0, round(age_years / lifespan * 100, 1))
months_to_eol = round((lifespan - age_years) * 12, 1)
if usage_pct >= 100:
status = "EOL" # End of Life
recommendation = "즉시 교체 계획 수립 필요 (수명 초과)"
elif usage_pct >= 85:
status = "CRITICAL"
recommendation = f"교체 준비 필요 (약 {max(0, round(months_to_eol))}개월 후 수명 종료)"
elif usage_pct >= 70:
status = "WARNING"
recommendation = f"교체 예산 편성 검토 권장 (약 {round(months_to_eol)}개월 후)"
else:
status = "HEALTHY"
recommendation = "정상 운영 가능"
# 마지막 유지보수 경과일
days_since_maint = None
if last_maintenance:
days_since_maint = (datetime.utcnow() - last_maintenance).days
return {
"age_years": round(age_years, 2),
"lifespan_years": lifespan,
"usage_pct": usage_pct,
"status": status,
"months_to_eol": months_to_eol,
"recommendation": recommendation,
"days_since_maintenance": days_since_maint,
}
async def run_lifecycle_analysis(
db: AsyncSession,
equipment_type_filter: Optional[str] = None,
max_items: int = 100,
) -> Dict:
"""
CMDB 서버 목록을 기반으로 수명 주기 분석 실행.
Returns: {"analyzed": int, "critical": int, "warning": int, "eol": int, "items": [...]}
"""
try:
from models import Server
q = select(Server)
if equipment_type_filter:
q = q.where(Server.server_type == equipment_type_filter.upper())
q = q.limit(max_items)
servers = (await db.execute(q)).scalars().all()
items = []
counts = {"EOL": 0, "CRITICAL": 0, "WARNING": 0, "HEALTHY": 0}
for srv in servers:
install_date = getattr(srv, "install_date", None) or (datetime.utcnow() - timedelta(days=365 * 3))
srv_type = getattr(srv, "server_type", "SERVER") or "SERVER"
assessment = assess_equipment_lifecycle(
equipment_type = srv_type,
install_date = install_date,
last_maintenance = getattr(srv, "last_maintenance_at", None),
)
assessment["server_id"] = srv.id
assessment["server_name"] = getattr(srv, "hostname", str(srv.id))
assessment["server_type"] = srv_type
items.append(assessment)
counts[assessment["status"]] = counts.get(assessment["status"], 0) + 1
# 위험도 순 정렬
order = {"EOL": 0, "CRITICAL": 1, "WARNING": 2, "HEALTHY": 3}
items.sort(key=lambda x: (order.get(x["status"], 4), -x["usage_pct"]))
return {
"analyzed": len(items),
"eol": counts["EOL"],
"critical": counts["CRITICAL"],
"warning": counts["WARNING"],
"healthy": counts["HEALTHY"],
"items": items,
"analyzed_at": datetime.utcnow().isoformat(),
}
except Exception as e:
logger.error("수명 주기 분석 오류: %s", e)
return {
"analyzed": 0,
"eol": 0, "critical": 0, "warning": 0, "healthy": 0,
"items": [],
"error": str(e)[:100],
}
# ── 예측 배치 실행 ────────────────────────────────────────────────────────────
async def run_predictive_batch(
db: AsyncSession,
auto_create_sr: bool = True,
ttr_threshold_hours: float = 48.0,
max_sources: int = 50,
) -> Dict:
"""
모든 활성 서버에 대해 예측 유지보수 배치 실행.
Returns: {"analyzed": int, "alerts": int, "srs_created": int, "results": [...]}
"""
try:
from models import MetricSnapshot
# 최근 24시간 내 메트릭이 있는 소스 목록 조회
since = datetime.utcnow() - timedelta(hours=24)
sources_rows = (await db.execute(
select(MetricSnapshot.source)
.where(MetricSnapshot.measured_at >= since)
.distinct()
.limit(max_sources)
)).scalars().all()
sources = list(sources_rows)
except Exception:
sources = []
results = []
alerts = 0
srs_created = 0
for source in sources:
for mt in PREDICTION_THRESHOLDS.keys():
try:
pred = await predict_metric_trend(db, source, mt)
ttr_crit = pred.get("ttr_critical_hours")
ttr_warn = pred.get("ttr_warning_hours")
alert_level = None
ttr_used = None
if ttr_crit and ttr_crit <= ttr_threshold_hours:
alert_level = "CRITICAL"
ttr_used = ttr_crit
alerts += 1
elif ttr_warn and ttr_warn <= ttr_threshold_hours * 2:
alert_level = "WARNING"
ttr_used = ttr_warn
alerts += 1
if alert_level and auto_create_sr and ttr_used:
sr_result = await create_preventive_sr(
db = db,
source = source,
metric_type = mt,
ttr_hours = ttr_used,
predicted_value = pred.get("predicted_value", 0),
threshold = (
PREDICTION_THRESHOLDS[mt]["critical"]
if alert_level == "CRITICAL"
else PREDICTION_THRESHOLDS[mt]["warning"]
),
severity = alert_level,
)
if sr_result and sr_result.get("created"):
srs_created += 1
pred["preventive_sr"] = sr_result
if alert_level:
pred["alert_level"] = alert_level
results.append(pred)
except Exception as e:
logger.debug("예측 배치 오류 (%s/%s): %s", source, mt, e)
return {
"analyzed": len(sources) * len(PREDICTION_THRESHOLDS),
"sources": len(sources),
"alerts": alerts,
"srs_created": srs_created,
"results": results,
"run_at": datetime.utcnow().isoformat(),
}