"""
B-6: Predictive maintenance engine.

Features:
  1. Metric trend analysis (least-squares linear regression)
  2. Threshold time-to-reach prediction (TTR: Time-To-Reach)
  3. Automatic creation of preventive service requests (SR)
  4. Equipment aging analysis (based on CMDB lifespan figures)
  5. Prediction accuracy feedback (actual vs. predicted)
"""
from __future__ import annotations

import logging
import math
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple

from sqlalchemy import select, and_, desc, func
from sqlalchemy.ext.asyncio import AsyncSession

logger = logging.getLogger(__name__)

# ── Default prediction thresholds ────────────────────────────────────────────
# All thresholds are upper bounds: a metric is unhealthy once it rises to or
# above the value. ``horizon_hours`` is a per-metric look-ahead hint.
PREDICTION_THRESHOLDS: Dict[str, Dict] = {
    "CPU_USAGE": {
        "warning": 75.0,
        "critical": 90.0,
        "unit": "%",
        "horizon_hours": 24,
    },
    "MEMORY_USAGE": {
        "warning": 80.0,
        "critical": 90.0,
        "unit": "%",
        "horizon_hours": 24,
    },
    "DISK_USAGE": {
        "warning": 75.0,
        "critical": 85.0,
        "unit": "%",
        "horizon_hours": 72,
    },
    "RESPONSE_TIME": {
        "warning": 3000.0,
        "critical": 5000.0,
        "unit": "ms",
        "horizon_hours": 12,
    },
}

# Expected equipment lifespan in years, keyed by upper-case equipment type.
EQUIPMENT_LIFESPAN: Dict[str, int] = {
    "SERVER": 7,
    "NETWORK": 5,
    "STORAGE": 5,
    "PC": 5,
    "PRINTER": 7,
    "UPS": 5,
    "DEFAULT": 7,
}


# ── Numeric utilities ────────────────────────────────────────────────────────

def linear_regression(x_vals: List[float], y_vals: List[float]) -> Tuple[float, float, float]:
    """
    Fit a straight line with ordinary least squares.

    Returns:
        (slope, intercept, r_squared)
        - slope: change in y per unit of x
        - intercept: y value at x == 0
        - r_squared: coefficient of determination clamped to [0, 1]

    With fewer than two x values, or when all x values coincide, the slope
    and r_squared are 0.0 and the intercept is the first y value / the mean
    of y respectively.
    """
    n = len(x_vals)
    if n < 2:
        return 0.0, (y_vals[0] if y_vals else 0.0), 0.0

    sum_x = sum(x_vals)
    sum_y = sum(y_vals)
    sum_xx = sum(x * x for x in x_vals)
    sum_xy = sum(x * y for x, y in zip(x_vals, y_vals))

    denom = n * sum_xx - sum_x ** 2
    if abs(denom) < 1e-10:
        # Degenerate x axis (no spread): fall back to a flat line at the mean.
        return 0.0, sum_y / n, 0.0

    slope = (n * sum_xy - sum_x * sum_y) / denom
    intercept = (sum_y - slope * sum_x) / n

    # R² = 1 - SS_res / SS_tot; a constant series has SS_tot == 0 -> report 0.
    y_mean = sum_y / n
    ss_tot = sum((y - y_mean) ** 2 for y in y_vals)
    ss_res = sum((y - (slope * x + intercept)) ** 2 for x, y in zip(x_vals, y_vals))
    r_sq = 1.0 - (ss_res / ss_tot) if abs(ss_tot) > 1e-10 else 0.0
    r_sq = max(0.0, min(1.0, r_sq))

    return slope, intercept, r_sq


def predict_value(slope: float, intercept: float, x: float) -> float:
    """Evaluate the regression line ``slope * x + intercept`` at ``x``."""
    return slope * x + intercept


def time_to_reach(
    slope: float,
    intercept: float,
    current_x: float,
    target_y: float,
) -> Optional[float]:
    """
    Compute how far ahead of ``current_x`` the regression line hits ``target_y``.

    The function is direction-agnostic: it reports the crossing point of the
    line with ``target_y`` whether the line approaches it from below or from
    above. Callers that only care about one direction must check the sign of
    ``slope`` themselves.

    Returns:
        The positive x offset until the crossing, or None when the slope is
        (near) zero or the crossing lies at or before ``current_x``.
    """
    if abs(slope) < 1e-10:
        return None

    x_target = (target_y - intercept) / slope
    delta = x_target - current_x

    if delta <= 0:
        return None  # the line crossed the target in the past (or right now)

    return delta


def moving_average(values: List[float], window: int = 5) -> List[float]:
    """
    Compute a trailing moving average.

    Element ``i`` of the result is the mean of up to ``window`` values ending
    at index ``i`` (shorter windows are used at the start of the series).
    For an empty input or a non-positive window a copy of the input is
    returned, so the result never aliases the caller's list.
    """
    if not values or window <= 0:
        return list(values)

    averaged = []
    for end in range(1, len(values) + 1):
        chunk = values[max(0, end - window):end]
        averaged.append(sum(chunk) / len(chunk))
    return averaged


def detect_seasonal_pattern(values: List[float], period: int = 24) -> Dict:
    """
    Detect a periodic pattern by folding the series onto one cycle.

    Values are averaged per position within the cycle (index modulo
    ``period``). A pattern is reported when the spread of those averages
    exceeds 10% of the overall mean.

    Returns:
        {"has_pattern": bool, "peak_index": int, "amplitude": float,
         "cycle_values": [float, ...]}
        When ``period`` is not positive or fewer than two full cycles are
        available, ``has_pattern`` is False and ``cycle_values`` is empty.
    """
    if period <= 0 or len(values) < period * 2:
        return {
            "has_pattern": False,
            "peak_index": 0,
            "amplitude": 0.0,
            "cycle_values": [],
        }

    # Accumulate per-slot sums and counts for one cycle.
    cycle = [0.0] * period
    count = [0] * period
    for i, v in enumerate(values):
        idx = i % period
        cycle[idx] += v
        count[idx] += 1

    avg_cycle = [cycle[i] / count[i] if count[i] > 0 else 0.0 for i in range(period)]
    amplitude = max(avg_cycle) - min(avg_cycle)
    overall_mean = sum(values) / len(values) if values else 0.0
    # Relative amplitude (peak-to-trough spread divided by the mean).
    relative_amplitude = amplitude / overall_mean if overall_mean > 0 else 0.0

    peak_index = avg_cycle.index(max(avg_cycle))

    return {
        "has_pattern": relative_amplitude > 0.1,
        "peak_index": peak_index,
        "amplitude": round(amplitude, 2),
        "cycle_values": [round(v, 2) for v in avg_cycle],
    }


# ── DB-backed predictive analysis ────────────────────────────────────────────

async def fetch_metric_history(
    db: AsyncSession,
    source: str,
    metric_type: str,
    hours_back: int = 72,
    max_points: int = 200,
) -> List[Tuple[datetime, float]]:
    """
    Load a time series from ``MetricSnapshot``.

    Returns up to ``max_points`` of the MOST RECENT samples recorded within
    the last ``hours_back`` hours, as ``(measured_at, value)`` tuples in
    chronological order. Any failure is logged and yields an empty list
    (best-effort lookup).
    """
    try:
        from models import MetricSnapshot

        since = datetime.utcnow() - timedelta(hours=hours_back)
        # Query newest-first so the LIMIT keeps the latest samples; ordering
        # ascending here would silently drop the newest data on dense series.
        rows = (await db.execute(
            select(MetricSnapshot)
            .where(
                and_(
                    MetricSnapshot.source == source,
                    MetricSnapshot.metric_type == metric_type,
                    MetricSnapshot.measured_at >= since,
                )
            )
            .order_by(desc(MetricSnapshot.measured_at))
            .limit(max_points)
        )).scalars().all()

        # Restore chronological order for the regression.
        return [(r.measured_at, float(r.value)) for r in reversed(rows)]
    except Exception as e:
        logger.warning("메트릭 이력 조회 실패: %s", e)
        return []


async def predict_metric_trend(
    db: AsyncSession,
    source: str,
    metric_type: str,
    horizon_hours: int = 24,
    hours_back: int = 72,
) -> Dict:
    """
    Predict the trend of one metric for one source.

    A regression line is fitted over the metric history (x axis = hours since
    the first sample). TTR values are only reported for rising trends,
    because every configured threshold is an upper bound: a falling line that
    is still above a threshold is recovering, not approaching it.

    Returns:
        {
          "source", "metric_type", "current_value", "predicted_value",
          "horizon_hours", "slope_per_hour", "r_squared",
          "ttr_warning_hours", "ttr_critical_hours",
          "trend_direction", "confidence", "data_points",
          "warning_threshold", "critical_threshold", "unit",
        }
        With fewer than 5 samples a reduced dict is returned that carries an
        "error" entry and ``predicted_value`` set to None.
    """
    history = await fetch_metric_history(db, source, metric_type, hours_back=hours_back)

    if len(history) < 5:
        return {
            "source": source,
            "metric_type": metric_type,
            "current_value": history[-1][1] if history else 0.0,
            "predicted_value": None,
            "slope_per_hour": 0.0,
            "r_squared": 0.0,
            "ttr_warning_hours": None,
            "ttr_critical_hours": None,
            "trend_direction": "STABLE",
            "confidence": "LOW",
            "data_points": len(history),
            "error": "데이터 부족 (최소 5개 필요)",
        }

    # Build the x axis as hour offsets from the first measurement.
    t0 = history[0][0]
    x_vals = [(h[0] - t0).total_seconds() / 3600.0 for h in history]
    y_vals = [h[1] for h in history]

    slope, intercept, r_sq = linear_regression(x_vals, y_vals)

    current_x = x_vals[-1]
    current_val = y_vals[-1]
    predicted = predict_value(slope, intercept, current_x + horizon_hours)

    # Time until the thresholds are reached (rising trends only).
    cfg = PREDICTION_THRESHOLDS.get(metric_type, {})
    warn_thresh = cfg.get("warning")
    crit_thresh = cfg.get("critical")

    rising = slope > 0
    ttr_warn = None
    ttr_crit = None
    if rising and warn_thresh is not None:
        ttr_warn = time_to_reach(slope, intercept, current_x, warn_thresh)
    if rising and crit_thresh is not None:
        ttr_crit = time_to_reach(slope, intercept, current_x, crit_thresh)

    # Trend direction: slopes below 0.05 units/hour count as flat.
    if abs(slope) < 0.05:
        direction = "STABLE"
    elif slope > 0:
        direction = "INCREASING"
    else:
        direction = "DECREASING"

    # Confidence combines goodness of fit with the amount of data.
    if r_sq >= 0.8 and len(history) >= 20:
        confidence = "HIGH"
    elif r_sq >= 0.5 and len(history) >= 10:
        confidence = "MEDIUM"
    else:
        confidence = "LOW"

    return {
        "source": source,
        "metric_type": metric_type,
        "current_value": round(current_val, 2),
        "predicted_value": round(predicted, 2),
        "horizon_hours": horizon_hours,
        "slope_per_hour": round(slope, 4),
        "r_squared": round(r_sq, 4),
        "ttr_warning_hours": round(ttr_warn, 1) if ttr_warn is not None else None,
        "ttr_critical_hours": round(ttr_crit, 1) if ttr_crit is not None else None,
        "trend_direction": direction,
        "confidence": confidence,
        "data_points": len(history),
        "warning_threshold": warn_thresh,
        "critical_threshold": crit_thresh,
        "unit": cfg.get("unit", ""),
    }


async def analyze_server_health(
    db: AsyncSession,
    source: str,
    metric_types: Optional[List[str]] = None,
) -> Dict:
    """
    Aggregate several metric predictions into one health report for a source.

    Each metric contributes a risk score (90 at/above critical, 60 at/above
    warning, otherwise scaled 0-50 by its ratio to the critical threshold).
    The health score is 100 minus the average risk.

    Returns:
        {"source", "health_score", "risk_level", "metrics": [...],
         "recommendations": [...], "analyzed_at"}
    """
    if metric_types is None:
        metric_types = list(PREDICTION_THRESHOLDS.keys())

    metrics_result = []
    risk_scores = []
    recommendations = []

    for mt in metric_types:
        pred = await predict_metric_trend(db, source, mt)
        metrics_result.append(pred)

        # Risk score (0-100) from the current value.
        curr = pred.get("current_value", 0)
        cfg = PREDICTION_THRESHOLDS.get(mt, {})
        crit = cfg.get("critical", 100)
        warn = cfg.get("warning", 80)

        if crit and curr >= crit:
            risk_scores.append(90)
        elif warn and curr >= warn:
            risk_scores.append(60)
        else:
            risk_scores.append(max(0, (curr / crit * 50) if crit else 0))

        # Prediction-based recommendations. Compare against None explicitly:
        # a TTR that rounds to 0.0 hours is the most urgent case, not "absent".
        ttr_crit = pred.get("ttr_critical_hours")
        ttr_warn = pred.get("ttr_warning_hours")

        if ttr_crit is not None and ttr_crit < 24:
            recommendations.append({
                "metric": mt,
                "severity": "CRITICAL",
                "message": f"{mt} {round(ttr_crit)}시간 내 임계값 초과 예측 — 즉시 조치 필요",
                "ttr_hours": ttr_crit,
            })
        elif ttr_warn is not None and ttr_warn < 48:
            recommendations.append({
                "metric": mt,
                "severity": "WARNING",
                "message": f"{mt} {round(ttr_warn)}시간 내 경고 임계값 도달 예측 — 모니터링 강화",
                "ttr_hours": ttr_warn,
            })

    avg_risk = sum(risk_scores) / len(risk_scores) if risk_scores else 0
    health_score = round(100 - avg_risk, 1)

    if health_score >= 80:
        risk_level = "LOW"
    elif health_score >= 60:
        risk_level = "MEDIUM"
    elif health_score >= 40:
        risk_level = "HIGH"
    else:
        risk_level = "CRITICAL"

    return {
        "source": source,
        "health_score": health_score,
        "risk_level": risk_level,
        "metrics": metrics_result,
        "recommendations": recommendations,
        "analyzed_at": datetime.utcnow().isoformat(),
    }


# ── Automatic preventive SR creation ─────────────────────────────────────────

async def create_preventive_sr(
    db: AsyncSession,
    source: str,
    metric_type: str,
    ttr_hours: float,
    predicted_value: float,
    threshold: float,
    severity: str = "WARNING",
) -> Optional[Dict]:
    """
    Create a preventive service request from a prediction.

    A new SR is skipped when an SR for the same source AND metric was created
    within the last 24 hours and is not COMPLETED/CANCELLED. SR ids follow
    ``PM-YYYYMMDD-NNNN`` with a per-day sequence.

    Returns:
        {"created": True, "sr_id": str, "title": str} on success,
        {"created": False, "reason": str, "sr_id": str} for a duplicate,
        {"created": False, "reason": str} when creation failed (the session
        is rolled back in that case).
    """
    try:
        from models import SRRequest

        # De-duplicate on the exact title prefix this function generates.
        # autoescape keeps "_" / "%" in the source or metric name literal
        # instead of letting them act as LIKE wildcards.
        since = datetime.utcnow() - timedelta(hours=24)
        keyword = f"[예방] {source} {metric_type}"
        existing = (await db.execute(
            select(SRRequest).where(
                and_(
                    SRRequest.created_at >= since,
                    SRRequest.title.startswith(f"{keyword} ", autoescape=True),
                    SRRequest.status.notin_(["COMPLETED", "CANCELLED"]),
                )
            )
        )).scalars().first()

        if existing:
            return {"created": False, "reason": "중복 예방 SR 존재", "sr_id": existing.sr_id}

        unit = PREDICTION_THRESHOLDS.get(metric_type, {}).get("unit", "")
        title = (
            f"[예방] {source} {metric_type} {round(ttr_hours)}h 내 "
            f"임계값({threshold}{unit}) 초과 예측"
        )
        # NOTE: this local must not be called ``desc`` -- that would shadow
        # sqlalchemy.desc used in the sequence query below.
        description = (
            f"예측 유지보수 에이전트가 자동 감지:\n"
            f"- 대상 서버: {source}\n"
            f"- 메트릭: {metric_type}\n"
            f"- 현재 예측값: {round(predicted_value, 1)}{unit}\n"
            f"- 임계값: {threshold}{unit}\n"
            f"- 예상 도달 시간: {round(ttr_hours, 1)}시간 후\n"
            f"- 심각도: {severity}\n\n"
            f"권고 조치: 해당 서버의 {metric_type} 원인 분석 후 선제 대응 수행"
        )

        # Build the SR id from the highest sequence already used today.
        today = datetime.utcnow().strftime("%Y%m%d")
        prefix = f"PM-{today}-"
        last_sr = (await db.execute(
            select(SRRequest.sr_id)
            .where(SRRequest.sr_id.like(f"{prefix}%"))
            .order_by(desc(SRRequest.sr_id))
            .limit(1)
        )).scalar()

        seq = 1
        if last_sr:
            try:
                seq = int(last_sr.split("-")[-1]) + 1
            except ValueError:
                seq = 1  # unparsable suffix: restart the sequence

        sr_id = f"{prefix}{seq:04d}"

        sr = SRRequest(
            sr_id=sr_id,
            title=title,
            description=description,
            status="OPEN",
            priority="HIGH" if severity == "CRITICAL" else "MEDIUM",
            sr_type="OTHER",
            created_at=datetime.utcnow(),
        )
        db.add(sr)
        await db.commit()
        await db.refresh(sr)

        logger.info("예방 SR 생성: %s (%s %s TTR=%.1fh)", sr_id, source, metric_type, ttr_hours)

        return {
            "created": True,
            "sr_id": sr_id,
            "title": title,
        }
    except Exception as e:
        logger.exception("예방 SR 생성 실패: %s", e)
        # A failed flush/commit leaves the session unusable until rolled
        # back; without this every later call in a batch would fail too.
        try:
            await db.rollback()
        except Exception as rollback_error:
            logger.debug("예방 SR 롤백 실패: %s", rollback_error)
        return {"created": False, "reason": str(e)[:100]}


# ── Equipment aging analysis ─────────────────────────────────────────────────

def _to_naive_utc(moment: datetime) -> datetime:
    """Convert a timezone-aware datetime to naive UTC; pass naive values through."""
    tzinfo = getattr(moment, "tzinfo", None)
    if tzinfo is not None and moment.utcoffset() is not None:
        return moment.astimezone(timezone.utc).replace(tzinfo=None)
    return moment


def calculate_equipment_age(install_date: datetime) -> float:
    """
    Return the equipment age in years (365.25-day years, whole days only).

    Timezone-aware install dates are normalised to UTC first so they can be
    compared with the naive UTC "now" used throughout this module.
    """
    delta = datetime.utcnow() - _to_naive_utc(install_date)
    return delta.days / 365.25


def assess_equipment_lifecycle(
    equipment_type: str,
    install_date: datetime,
    last_maintenance: Optional[datetime] = None,
) -> Dict:
    """
    Assess where a piece of equipment is in its life cycle.

    The lifespan comes from ``EQUIPMENT_LIFESPAN`` (case-insensitive lookup,
    falling back to "DEFAULT" for unknown or empty types). Status bands by
    consumed lifespan: >=100% EOL, >=85% CRITICAL, >=70% WARNING, otherwise
    HEALTHY.

    Returns:
        {"age_years", "lifespan_years", "usage_pct", "status",
         "months_to_eol", "recommendation", "days_since_maintenance"}
        ``usage_pct`` is capped at 100; ``months_to_eol`` goes negative once
        the lifespan is exceeded; ``days_since_maintenance`` is None when no
        maintenance date is given.
    """
    age_years = calculate_equipment_age(install_date)
    lifespan = EQUIPMENT_LIFESPAN.get(
        (equipment_type or "DEFAULT").upper(),
        EQUIPMENT_LIFESPAN["DEFAULT"],
    )
    usage_pct = min(100.0, round(age_years / lifespan * 100, 1))
    months_to_eol = round((lifespan - age_years) * 12, 1)

    if usage_pct >= 100:
        status = "EOL"  # End of Life
        recommendation = "즉시 교체 계획 수립 필요 (수명 초과)"
    elif usage_pct >= 85:
        status = "CRITICAL"
        recommendation = f"교체 준비 필요 (약 {max(0, round(months_to_eol))}개월 후 수명 종료)"
    elif usage_pct >= 70:
        status = "WARNING"
        recommendation = f"교체 예산 편성 검토 권장 (약 {round(months_to_eol)}개월 후)"
    else:
        status = "HEALTHY"
        recommendation = "정상 운영 가능"

    # Days elapsed since the last maintenance, when known.
    days_since_maint = None
    if last_maintenance:
        days_since_maint = (datetime.utcnow() - _to_naive_utc(last_maintenance)).days

    return {
        "age_years": round(age_years, 2),
        "lifespan_years": lifespan,
        "usage_pct": usage_pct,
        "status": status,
        "months_to_eol": months_to_eol,
        "recommendation": recommendation,
        "days_since_maintenance": days_since_maint,
    }


async def run_lifecycle_analysis(
    db: AsyncSession,
    equipment_type_filter: Optional[str] = None,
    max_items: int = 100,
) -> Dict:
    """
    Run the life-cycle assessment over servers from the CMDB ``Server`` table.

    Servers without an ``install_date`` are assumed to be three years old.
    Items are sorted by severity (EOL first) and then by descending usage.

    Returns:
        {"analyzed": int, "eol": int, "critical": int, "warning": int,
         "healthy": int, "items": [...], "analyzed_at": str}
        On failure all counters are 0, "items" is empty and an "error" entry
        holds the (truncated) exception text.
    """
    try:
        from models import Server

        q = select(Server)
        if equipment_type_filter:
            q = q.where(Server.server_type == equipment_type_filter.upper())
        q = q.limit(max_items)

        servers = (await db.execute(q)).scalars().all()

        items = []
        counts = {"EOL": 0, "CRITICAL": 0, "WARNING": 0, "HEALTHY": 0}

        for srv in servers:
            # Missing install date: fall back to "installed three years ago".
            install_date = getattr(srv, "install_date", None) or (
                datetime.utcnow() - timedelta(days=365 * 3)
            )
            srv_type = getattr(srv, "server_type", "SERVER") or "SERVER"

            assessment = assess_equipment_lifecycle(
                equipment_type=srv_type,
                install_date=install_date,
                last_maintenance=getattr(srv, "last_maintenance_at", None),
            )
            assessment["server_id"] = srv.id
            assessment["server_name"] = getattr(srv, "hostname", str(srv.id))
            assessment["server_type"] = srv_type
            items.append(assessment)
            counts[assessment["status"]] = counts.get(assessment["status"], 0) + 1

        # Sort by severity, then by most-consumed lifespan first.
        order = {"EOL": 0, "CRITICAL": 1, "WARNING": 2, "HEALTHY": 3}
        items.sort(key=lambda x: (order.get(x["status"], 4), -x["usage_pct"]))

        return {
            "analyzed": len(items),
            "eol": counts["EOL"],
            "critical": counts["CRITICAL"],
            "warning": counts["WARNING"],
            "healthy": counts["HEALTHY"],
            "items": items,
            "analyzed_at": datetime.utcnow().isoformat(),
        }
    except Exception as e:
        logger.error("수명 주기 분석 오류: %s", e)
        return {
            "analyzed": 0,
            "eol": 0,
            "critical": 0,
            "warning": 0,
            "healthy": 0,
            "items": [],
            "error": str(e)[:100],
        }


# ── Predictive batch run ─────────────────────────────────────────────────────

async def run_predictive_batch(
    db: AsyncSession,
    auto_create_sr: bool = True,
    ttr_threshold_hours: float = 48.0,
    max_sources: int = 50,
) -> Dict:
    """
    Run the predictive-maintenance batch over every recently active source.

    Sources are those with at least one metric in the last 24 hours (capped
    at ``max_sources``). For each source/metric pair a CRITICAL alert is
    raised when the critical TTR is within ``ttr_threshold_hours``, otherwise
    a WARNING alert when the warning TTR is within twice that window. With
    ``auto_create_sr`` a preventive SR is requested for every alert.

    Returns:
        {"analyzed": int, "sources": int, "alerts": int, "srs_created": int,
         "results": [...], "run_at": str}
    """
    try:
        from models import MetricSnapshot

        # Sources that reported any metric within the last 24 hours.
        since = datetime.utcnow() - timedelta(hours=24)
        sources_rows = (await db.execute(
            select(MetricSnapshot.source)
            .where(MetricSnapshot.measured_at >= since)
            .distinct()
            .limit(max_sources)
        )).scalars().all()
        sources = list(sources_rows)
    except Exception as e:
        # Best-effort: an unreadable source list yields an empty batch, but
        # the cause must be visible in the logs.
        logger.warning("예측 배치 소스 조회 실패: %s", e)
        sources = []

    results = []
    alerts = 0
    srs_created = 0

    for source in sources:
        for mt in PREDICTION_THRESHOLDS:
            try:
                pred = await predict_metric_trend(db, source, mt)
                ttr_crit = pred.get("ttr_critical_hours")
                ttr_warn = pred.get("ttr_warning_hours")

                # Explicit None checks: a TTR rounded to 0.0 hours is the
                # most imminent alert and must not be treated as "no TTR".
                alert_level = None
                ttr_used = None
                if ttr_crit is not None and ttr_crit <= ttr_threshold_hours:
                    alert_level = "CRITICAL"
                    ttr_used = ttr_crit
                    alerts += 1
                elif ttr_warn is not None and ttr_warn <= ttr_threshold_hours * 2:
                    alert_level = "WARNING"
                    ttr_used = ttr_warn
                    alerts += 1

                if alert_level and auto_create_sr and ttr_used is not None:
                    sr_result = await create_preventive_sr(
                        db=db,
                        source=source,
                        metric_type=mt,
                        ttr_hours=ttr_used,
                        predicted_value=pred.get("predicted_value", 0),
                        threshold=(
                            PREDICTION_THRESHOLDS[mt]["critical"]
                            if alert_level == "CRITICAL"
                            else PREDICTION_THRESHOLDS[mt]["warning"]
                        ),
                        severity=alert_level,
                    )
                    if sr_result and sr_result.get("created"):
                        srs_created += 1
                    pred["preventive_sr"] = sr_result

                if alert_level:
                    pred["alert_level"] = alert_level

                results.append(pred)
            except Exception as e:
                logger.warning("예측 배치 오류 (%s/%s): %s", source, mt, e)

    return {
        "analyzed": len(sources) * len(PREDICTION_THRESHOLDS),
        "sources": len(sources),
        "alerts": alerts,
        "srs_created": srs_created,
        "results": results,
        "run_at": datetime.utcnow().isoformat(),
    }