G-1: 메신저 Webhook Relay + _send_to_room 실제 httpx 호출 구현 G-2: POST /api/tasks/bulk SR 대량작업 엔드포인트 (최대 100건) G-3: 라이선스 만료 알림 스케줄러 (매일 09:00 KST) G-4: 체험판 upgrade_banner 필드 + license.py 배너 로직 G-5: core/auto_rca.py + incidents/problem auto-rca 엔드포인트 G-6: core/deploy_impact.py + vibe impact-analysis 엔드포인트 G-7: core/ticket_classifier.py + SR 생성 시 AI 분류 + ai-suggestion API G-8: VulnPatchRecord 모델 + vuln_scan 패치추적 4개 엔드포인트 G-9: core/jira_sync.py + gateway Jira/Confluence 연동 엔드포인트 G-10: core/push_notify.py + routers/push.py + PushSubscription 모델 G-11: approvals 다중승인 (위임/서명/기한초과/마감연장) G-12: alembic.ini + migrations/ + cicd/migrate_to_postgres.sh 하네스: guardia-orchestrator 확장기능 Phase 반영 봇명령어: /sr /status /license /bulk 슬래시 명령어 추가 설치스크립트: setup/ (Ubuntu, CentOS, RHEL, Windows) --test 옵션 포함 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
657 lines
22 KiB
Python
657 lines
22 KiB
Python
"""
|
|
B-6: 예측 유지보수 엔진
|
|
|
|
기능:
|
|
1. 메트릭 트렌드 분석 (선형 회귀 기반)
|
|
2. 임계값 도달 시점 예측 (TTR: Time-To-Reach)
|
|
3. 예방적 SR 자동 생성
|
|
4. 장비 노후화 분석 (CMDB 수명 기반)
|
|
5. 예측 정확도 피드백 (실제 vs 예측)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import math
|
|
from datetime import datetime, timedelta
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from sqlalchemy import select, and_, desc, func
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ── 예측 임계값 기본값 ────────────────────────────────────────────────────────
|
|
|
|
PREDICTION_THRESHOLDS: Dict[str, Dict] = {
|
|
"CPU_USAGE": {
|
|
"warning": 75.0,
|
|
"critical": 90.0,
|
|
"unit": "%",
|
|
"horizon_hours": 24,
|
|
},
|
|
"MEMORY_USAGE": {
|
|
"warning": 80.0,
|
|
"critical": 90.0,
|
|
"unit": "%",
|
|
"horizon_hours": 24,
|
|
},
|
|
"DISK_USAGE": {
|
|
"warning": 75.0,
|
|
"critical": 85.0,
|
|
"unit": "%",
|
|
"horizon_hours": 72,
|
|
},
|
|
"RESPONSE_TIME": {
|
|
"warning": 3000.0,
|
|
"critical": 5000.0,
|
|
"unit": "ms",
|
|
"horizon_hours": 12,
|
|
},
|
|
}
|
|
|
|
# 장비 수명 기준 (년)
|
|
EQUIPMENT_LIFESPAN: Dict[str, int] = {
|
|
"SERVER": 7,
|
|
"NETWORK": 5,
|
|
"STORAGE": 5,
|
|
"PC": 5,
|
|
"PRINTER": 7,
|
|
"UPS": 5,
|
|
"DEFAULT": 7,
|
|
}
|
|
|
|
|
|
# ── 수치 연산 유틸리티 ─────────────────────────────────────────────────────────
|
|
|
|
def linear_regression(x_vals: List[float], y_vals: List[float]) -> Tuple[float, float, float]:
|
|
"""
|
|
최소제곱법 선형 회귀.
|
|
Returns: (slope, intercept, r_squared)
|
|
- slope: 단위 x당 y 변화량
|
|
- intercept: y절편
|
|
- r_squared: 결정계수 (0~1, 1에 가까울수록 선형 트렌드가 강함)
|
|
"""
|
|
n = len(x_vals)
|
|
if n < 2:
|
|
return 0.0, (y_vals[0] if y_vals else 0.0), 0.0
|
|
|
|
sum_x = sum(x_vals)
|
|
sum_y = sum(y_vals)
|
|
sum_xx = sum(x * x for x in x_vals)
|
|
sum_xy = sum(x * y for x, y in zip(x_vals, y_vals))
|
|
|
|
denom = n * sum_xx - sum_x ** 2
|
|
if abs(denom) < 1e-10:
|
|
return 0.0, sum_y / n, 0.0
|
|
|
|
slope = (n * sum_xy - sum_x * sum_y) / denom
|
|
intercept = (sum_y - slope * sum_x) / n
|
|
|
|
# R² 계산
|
|
y_mean = sum_y / n
|
|
ss_tot = sum((y - y_mean) ** 2 for y in y_vals)
|
|
ss_res = sum((y - (slope * x + intercept)) ** 2 for x, y in zip(x_vals, y_vals))
|
|
r_sq = 1.0 - (ss_res / ss_tot) if abs(ss_tot) > 1e-10 else 0.0
|
|
r_sq = max(0.0, min(1.0, r_sq))
|
|
|
|
return slope, intercept, r_sq
|
|
|
|
|
|
def predict_value(slope: float, intercept: float, x: float) -> float:
|
|
"""회귀선으로 특정 x에서 y 예측."""
|
|
return slope * x + intercept
|
|
|
|
|
|
def time_to_reach(
|
|
slope: float,
|
|
intercept: float,
|
|
current_x: float,
|
|
target_y: float,
|
|
) -> Optional[float]:
|
|
"""
|
|
현재 추세로 target_y에 도달하는 x 오프셋(시간) 계산.
|
|
slope가 0이거나 방향이 반대이면 None 반환.
|
|
Returns: 도달까지 걸리는 x단위 시간 (None이면 도달 불가)
|
|
"""
|
|
if abs(slope) < 1e-10:
|
|
return None
|
|
x_target = (target_y - intercept) / slope
|
|
delta = x_target - current_x
|
|
if delta <= 0:
|
|
return None # 이미 초과했거나 감소 추세
|
|
return delta
|
|
|
|
|
|
def moving_average(values: List[float], window: int = 5) -> List[float]:
|
|
"""이동 평균 계산."""
|
|
if not values or window <= 0:
|
|
return values
|
|
result = []
|
|
for i in range(len(values)):
|
|
start = max(0, i - window + 1)
|
|
chunk = values[start:i + 1]
|
|
result.append(sum(chunk) / len(chunk))
|
|
return result
|
|
|
|
|
|
def detect_seasonal_pattern(values: List[float], period: int = 24) -> Dict:
|
|
"""
|
|
주기성(일별/시간별) 패턴 감지.
|
|
Returns: {"has_pattern": bool, "peak_index": int, "amplitude": float}
|
|
"""
|
|
if len(values) < period * 2:
|
|
return {"has_pattern": False, "peak_index": 0, "amplitude": 0.0}
|
|
|
|
# 하나의 주기(period) 평균
|
|
cycle = [0.0] * period
|
|
count = [0] * period
|
|
for i, v in enumerate(values):
|
|
idx = i % period
|
|
cycle[idx] += v
|
|
count[idx] += 1
|
|
|
|
avg_cycle = [cycle[i] / count[i] if count[i] > 0 else 0.0 for i in range(period)]
|
|
amplitude = max(avg_cycle) - min(avg_cycle)
|
|
overall_mean = sum(values) / len(values) if values else 0.0
|
|
cv = amplitude / overall_mean if overall_mean > 0 else 0.0
|
|
|
|
peak_index = avg_cycle.index(max(avg_cycle))
|
|
|
|
return {
|
|
"has_pattern": cv > 0.1, # 변동계수 10% 이상이면 패턴 있음
|
|
"peak_index": peak_index,
|
|
"amplitude": round(amplitude, 2),
|
|
"cycle_values": [round(v, 2) for v in avg_cycle],
|
|
}
|
|
|
|
|
|
# ── DB 기반 예측 분석 ──────────────────────────────────────────────────────────
|
|
|
|
async def fetch_metric_history(
|
|
db: AsyncSession,
|
|
source: str,
|
|
metric_type: str,
|
|
hours_back: int = 72,
|
|
max_points: int = 200,
|
|
) -> List[Tuple[datetime, float]]:
|
|
"""MetricSnapshot에서 시계열 데이터 조회."""
|
|
try:
|
|
from models import MetricSnapshot
|
|
since = datetime.utcnow() - timedelta(hours=hours_back)
|
|
rows = (await db.execute(
|
|
select(MetricSnapshot)
|
|
.where(
|
|
and_(
|
|
MetricSnapshot.source == source,
|
|
MetricSnapshot.metric_type == metric_type,
|
|
MetricSnapshot.measured_at >= since,
|
|
)
|
|
)
|
|
.order_by(MetricSnapshot.measured_at)
|
|
.limit(max_points)
|
|
)).scalars().all()
|
|
|
|
return [(r.measured_at, float(r.value)) for r in rows]
|
|
except Exception as e:
|
|
logger.debug("메트릭 이력 조회 실패: %s", e)
|
|
return []
|
|
|
|
|
|
async def predict_metric_trend(
|
|
db: AsyncSession,
|
|
source: str,
|
|
metric_type: str,
|
|
horizon_hours: int = 24,
|
|
hours_back: int = 72,
|
|
) -> Dict:
|
|
"""
|
|
메트릭 트렌드 예측.
|
|
Returns: {
|
|
"source", "metric_type",
|
|
"current_value", "predicted_value",
|
|
"slope_per_hour", "r_squared",
|
|
"ttr_warning_hours", "ttr_critical_hours",
|
|
"trend_direction", "confidence",
|
|
"data_points",
|
|
}
|
|
"""
|
|
history = await fetch_metric_history(db, source, metric_type, hours_back=hours_back)
|
|
|
|
if len(history) < 5:
|
|
return {
|
|
"source": source,
|
|
"metric_type": metric_type,
|
|
"current_value": history[-1][1] if history else 0.0,
|
|
"predicted_value": None,
|
|
"slope_per_hour": 0.0,
|
|
"r_squared": 0.0,
|
|
"ttr_warning_hours": None,
|
|
"ttr_critical_hours": None,
|
|
"trend_direction": "STABLE",
|
|
"confidence": "LOW",
|
|
"data_points": len(history),
|
|
"error": "데이터 부족 (최소 5개 필요)",
|
|
}
|
|
|
|
# 시간 기준으로 x축 구성 (첫 측정 시각 기준 시간 오프셋)
|
|
t0 = history[0][0]
|
|
x_vals = [(h[0] - t0).total_seconds() / 3600.0 for h in history]
|
|
y_vals = [h[1] for h in history]
|
|
|
|
slope, intercept, r_sq = linear_regression(x_vals, y_vals)
|
|
|
|
current_x = x_vals[-1]
|
|
current_val = y_vals[-1]
|
|
predicted = predict_value(slope, intercept, current_x + horizon_hours)
|
|
|
|
# 임계값 도달 시간 계산
|
|
cfg = PREDICTION_THRESHOLDS.get(metric_type, {})
|
|
warn_thresh = cfg.get("warning")
|
|
crit_thresh = cfg.get("critical")
|
|
|
|
ttr_warn = None
|
|
ttr_crit = None
|
|
if warn_thresh is not None:
|
|
ttr_warn = time_to_reach(slope, intercept, current_x, warn_thresh)
|
|
if crit_thresh is not None:
|
|
ttr_crit = time_to_reach(slope, intercept, current_x, crit_thresh)
|
|
|
|
# 트렌드 방향 결정
|
|
if abs(slope) < 0.05:
|
|
direction = "STABLE"
|
|
elif slope > 0:
|
|
direction = "INCREASING"
|
|
else:
|
|
direction = "DECREASING"
|
|
|
|
# 신뢰도 판정
|
|
if r_sq >= 0.8 and len(history) >= 20:
|
|
confidence = "HIGH"
|
|
elif r_sq >= 0.5 and len(history) >= 10:
|
|
confidence = "MEDIUM"
|
|
else:
|
|
confidence = "LOW"
|
|
|
|
return {
|
|
"source": source,
|
|
"metric_type": metric_type,
|
|
"current_value": round(current_val, 2),
|
|
"predicted_value": round(predicted, 2),
|
|
"horizon_hours": horizon_hours,
|
|
"slope_per_hour": round(slope, 4),
|
|
"r_squared": round(r_sq, 4),
|
|
"ttr_warning_hours": round(ttr_warn, 1) if ttr_warn else None,
|
|
"ttr_critical_hours": round(ttr_crit, 1) if ttr_crit else None,
|
|
"trend_direction": direction,
|
|
"confidence": confidence,
|
|
"data_points": len(history),
|
|
"warning_threshold": warn_thresh,
|
|
"critical_threshold": crit_thresh,
|
|
"unit": cfg.get("unit", ""),
|
|
}
|
|
|
|
|
|
async def analyze_server_health(
|
|
db: AsyncSession,
|
|
source: str,
|
|
metric_types: Optional[List[str]] = None,
|
|
) -> Dict:
|
|
"""
|
|
서버 전체 건강도 분석 (여러 메트릭 종합).
|
|
Returns: {"source", "health_score", "risk_level", "metrics": [...], "recommendations": [...]}
|
|
"""
|
|
if metric_types is None:
|
|
metric_types = list(PREDICTION_THRESHOLDS.keys())
|
|
|
|
metrics_result = []
|
|
risk_scores = []
|
|
recommendations = []
|
|
|
|
for mt in metric_types:
|
|
pred = await predict_metric_trend(db, source, mt)
|
|
metrics_result.append(pred)
|
|
|
|
# 위험도 점수 계산 (0~100)
|
|
curr = pred.get("current_value", 0)
|
|
cfg = PREDICTION_THRESHOLDS.get(mt, {})
|
|
crit = cfg.get("critical", 100)
|
|
warn = cfg.get("warning", 80)
|
|
|
|
if crit and curr >= crit:
|
|
risk_scores.append(90)
|
|
elif warn and curr >= warn:
|
|
risk_scores.append(60)
|
|
else:
|
|
risk_scores.append(max(0, (curr / crit * 50) if crit else 0))
|
|
|
|
# 예측 기반 권고 생성
|
|
ttr_crit = pred.get("ttr_critical_hours")
|
|
ttr_warn = pred.get("ttr_warning_hours")
|
|
if ttr_crit and ttr_crit < 24:
|
|
recommendations.append({
|
|
"metric": mt,
|
|
"severity": "CRITICAL",
|
|
"message": f"{mt} {round(ttr_crit)}시간 내 임계값 초과 예측 — 즉시 조치 필요",
|
|
"ttr_hours": ttr_crit,
|
|
})
|
|
elif ttr_warn and ttr_warn < 48:
|
|
recommendations.append({
|
|
"metric": mt,
|
|
"severity": "WARNING",
|
|
"message": f"{mt} {round(ttr_warn)}시간 내 경고 임계값 도달 예측 — 모니터링 강화",
|
|
"ttr_hours": ttr_warn,
|
|
})
|
|
|
|
avg_risk = sum(risk_scores) / len(risk_scores) if risk_scores else 0
|
|
health_score = round(100 - avg_risk, 1)
|
|
|
|
if health_score >= 80:
|
|
risk_level = "LOW"
|
|
elif health_score >= 60:
|
|
risk_level = "MEDIUM"
|
|
elif health_score >= 40:
|
|
risk_level = "HIGH"
|
|
else:
|
|
risk_level = "CRITICAL"
|
|
|
|
return {
|
|
"source": source,
|
|
"health_score": health_score,
|
|
"risk_level": risk_level,
|
|
"metrics": metrics_result,
|
|
"recommendations": recommendations,
|
|
"analyzed_at": datetime.utcnow().isoformat(),
|
|
}
|
|
|
|
|
|
# ── 예방적 SR 자동 생성 ────────────────────────────────────────────────────────
|
|
|
|
async def create_preventive_sr(
|
|
db: AsyncSession,
|
|
source: str,
|
|
metric_type: str,
|
|
ttr_hours: float,
|
|
predicted_value: float,
|
|
threshold: float,
|
|
severity: str = "WARNING",
|
|
) -> Optional[Dict]:
|
|
"""
|
|
예방적 SR 자동 생성 (예측 기반 선제 대응).
|
|
Returns: {"created": bool, "sr_id": str, "title": str}
|
|
"""
|
|
try:
|
|
from models import SRRequest, SRStatus, SRType
|
|
|
|
# 같은 소스+메트릭으로 24시간 내 예방 SR이 있으면 중복 생성 방지
|
|
since = datetime.utcnow() - timedelta(hours=24)
|
|
keyword = f"[예방] {source} {metric_type}"
|
|
existing = (await db.execute(
|
|
select(SRRequest).where(
|
|
and_(
|
|
SRRequest.created_at >= since,
|
|
SRRequest.title.like(f"%{source}%"),
|
|
SRRequest.title.like(f"%예방%"),
|
|
SRRequest.status.notin_(["COMPLETED", "CANCELLED"]),
|
|
)
|
|
)
|
|
)).scalars().first()
|
|
|
|
if existing:
|
|
return {"created": False, "reason": "중복 예방 SR 존재", "sr_id": existing.sr_id}
|
|
|
|
unit = PREDICTION_THRESHOLDS.get(metric_type, {}).get("unit", "")
|
|
title = (
|
|
f"[예방] {source} {metric_type} {round(ttr_hours)}h 내 "
|
|
f"임계값({threshold}{unit}) 초과 예측"
|
|
)
|
|
desc = (
|
|
f"예측 유지보수 에이전트가 자동 감지:\n"
|
|
f"- 대상 서버: {source}\n"
|
|
f"- 메트릭: {metric_type}\n"
|
|
f"- 현재 예측값: {round(predicted_value, 1)}{unit}\n"
|
|
f"- 임계값: {threshold}{unit}\n"
|
|
f"- 예상 도달 시간: {round(ttr_hours, 1)}시간 후\n"
|
|
f"- 심각도: {severity}\n\n"
|
|
f"권고 조치: 해당 서버의 {metric_type} 원인 분석 후 선제 대응 수행"
|
|
)
|
|
|
|
# SR ID 생성 (간단 시퀀스)
|
|
today = datetime.utcnow().strftime("%Y%m%d")
|
|
prefix = f"PM-{today}-"
|
|
last_sr = (await db.execute(
|
|
select(SRRequest.sr_id)
|
|
.where(SRRequest.sr_id.like(f"{prefix}%"))
|
|
.order_by(desc(SRRequest.sr_id))
|
|
.limit(1)
|
|
)).scalar()
|
|
seq = 1
|
|
if last_sr:
|
|
try:
|
|
seq = int(last_sr.split("-")[-1]) + 1
|
|
except ValueError:
|
|
seq = 1
|
|
sr_id = f"{prefix}{seq:04d}"
|
|
|
|
sr = SRRequest(
|
|
sr_id = sr_id,
|
|
title = title,
|
|
description = desc,
|
|
status = "OPEN",
|
|
priority = "HIGH" if severity == "CRITICAL" else "MEDIUM",
|
|
sr_type = "OTHER",
|
|
created_at = datetime.utcnow(),
|
|
)
|
|
db.add(sr)
|
|
await db.commit()
|
|
await db.refresh(sr)
|
|
|
|
logger.info("예방 SR 생성: %s (%s %s TTR=%.1fh)", sr_id, source, metric_type, ttr_hours)
|
|
return {
|
|
"created": True,
|
|
"sr_id": sr_id,
|
|
"title": title,
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error("예방 SR 생성 실패: %s", e)
|
|
return {"created": False, "reason": str(e)[:100]}
|
|
|
|
|
|
# ── 장비 노후화 분석 ──────────────────────────────────────────────────────────
|
|
|
|
def calculate_equipment_age(install_date: datetime) -> float:
|
|
"""장비 사용 연수 계산."""
|
|
delta = datetime.utcnow() - install_date
|
|
return delta.days / 365.25
|
|
|
|
|
|
def assess_equipment_lifecycle(
|
|
equipment_type: str,
|
|
install_date: datetime,
|
|
last_maintenance: Optional[datetime] = None,
|
|
) -> Dict:
|
|
"""
|
|
장비 수명 주기 평가.
|
|
Returns: {"age_years", "lifespan_years", "usage_pct", "status", "months_to_eol", "recommendation"}
|
|
"""
|
|
age_years = calculate_equipment_age(install_date)
|
|
lifespan = EQUIPMENT_LIFESPAN.get(equipment_type.upper(), EQUIPMENT_LIFESPAN["DEFAULT"])
|
|
usage_pct = min(100.0, round(age_years / lifespan * 100, 1))
|
|
months_to_eol = round((lifespan - age_years) * 12, 1)
|
|
|
|
if usage_pct >= 100:
|
|
status = "EOL" # End of Life
|
|
recommendation = "즉시 교체 계획 수립 필요 (수명 초과)"
|
|
elif usage_pct >= 85:
|
|
status = "CRITICAL"
|
|
recommendation = f"교체 준비 필요 (약 {max(0, round(months_to_eol))}개월 후 수명 종료)"
|
|
elif usage_pct >= 70:
|
|
status = "WARNING"
|
|
recommendation = f"교체 예산 편성 검토 권장 (약 {round(months_to_eol)}개월 후)"
|
|
else:
|
|
status = "HEALTHY"
|
|
recommendation = "정상 운영 가능"
|
|
|
|
# 마지막 유지보수 경과일
|
|
days_since_maint = None
|
|
if last_maintenance:
|
|
days_since_maint = (datetime.utcnow() - last_maintenance).days
|
|
|
|
return {
|
|
"age_years": round(age_years, 2),
|
|
"lifespan_years": lifespan,
|
|
"usage_pct": usage_pct,
|
|
"status": status,
|
|
"months_to_eol": months_to_eol,
|
|
"recommendation": recommendation,
|
|
"days_since_maintenance": days_since_maint,
|
|
}
|
|
|
|
|
|
async def run_lifecycle_analysis(
|
|
db: AsyncSession,
|
|
equipment_type_filter: Optional[str] = None,
|
|
max_items: int = 100,
|
|
) -> Dict:
|
|
"""
|
|
CMDB 서버 목록을 기반으로 수명 주기 분석 실행.
|
|
Returns: {"analyzed": int, "critical": int, "warning": int, "eol": int, "items": [...]}
|
|
"""
|
|
try:
|
|
from models import Server
|
|
|
|
q = select(Server)
|
|
if equipment_type_filter:
|
|
q = q.where(Server.server_type == equipment_type_filter.upper())
|
|
q = q.limit(max_items)
|
|
|
|
servers = (await db.execute(q)).scalars().all()
|
|
items = []
|
|
counts = {"EOL": 0, "CRITICAL": 0, "WARNING": 0, "HEALTHY": 0}
|
|
|
|
for srv in servers:
|
|
install_date = getattr(srv, "install_date", None) or (datetime.utcnow() - timedelta(days=365 * 3))
|
|
srv_type = getattr(srv, "server_type", "SERVER") or "SERVER"
|
|
|
|
assessment = assess_equipment_lifecycle(
|
|
equipment_type = srv_type,
|
|
install_date = install_date,
|
|
last_maintenance = getattr(srv, "last_maintenance_at", None),
|
|
)
|
|
assessment["server_id"] = srv.id
|
|
assessment["server_name"] = getattr(srv, "hostname", str(srv.id))
|
|
assessment["server_type"] = srv_type
|
|
|
|
items.append(assessment)
|
|
counts[assessment["status"]] = counts.get(assessment["status"], 0) + 1
|
|
|
|
# 위험도 순 정렬
|
|
order = {"EOL": 0, "CRITICAL": 1, "WARNING": 2, "HEALTHY": 3}
|
|
items.sort(key=lambda x: (order.get(x["status"], 4), -x["usage_pct"]))
|
|
|
|
return {
|
|
"analyzed": len(items),
|
|
"eol": counts["EOL"],
|
|
"critical": counts["CRITICAL"],
|
|
"warning": counts["WARNING"],
|
|
"healthy": counts["HEALTHY"],
|
|
"items": items,
|
|
"analyzed_at": datetime.utcnow().isoformat(),
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error("수명 주기 분석 오류: %s", e)
|
|
return {
|
|
"analyzed": 0,
|
|
"eol": 0, "critical": 0, "warning": 0, "healthy": 0,
|
|
"items": [],
|
|
"error": str(e)[:100],
|
|
}
|
|
|
|
|
|
# ── 예측 배치 실행 ────────────────────────────────────────────────────────────
|
|
|
|
async def run_predictive_batch(
|
|
db: AsyncSession,
|
|
auto_create_sr: bool = True,
|
|
ttr_threshold_hours: float = 48.0,
|
|
max_sources: int = 50,
|
|
) -> Dict:
|
|
"""
|
|
모든 활성 서버에 대해 예측 유지보수 배치 실행.
|
|
Returns: {"analyzed": int, "alerts": int, "srs_created": int, "results": [...]}
|
|
"""
|
|
try:
|
|
from models import MetricSnapshot
|
|
|
|
# 최근 24시간 내 메트릭이 있는 소스 목록 조회
|
|
since = datetime.utcnow() - timedelta(hours=24)
|
|
sources_rows = (await db.execute(
|
|
select(MetricSnapshot.source)
|
|
.where(MetricSnapshot.measured_at >= since)
|
|
.distinct()
|
|
.limit(max_sources)
|
|
)).scalars().all()
|
|
sources = list(sources_rows)
|
|
|
|
except Exception:
|
|
sources = []
|
|
|
|
results = []
|
|
alerts = 0
|
|
srs_created = 0
|
|
|
|
for source in sources:
|
|
for mt in PREDICTION_THRESHOLDS.keys():
|
|
try:
|
|
pred = await predict_metric_trend(db, source, mt)
|
|
ttr_crit = pred.get("ttr_critical_hours")
|
|
ttr_warn = pred.get("ttr_warning_hours")
|
|
|
|
alert_level = None
|
|
ttr_used = None
|
|
|
|
if ttr_crit and ttr_crit <= ttr_threshold_hours:
|
|
alert_level = "CRITICAL"
|
|
ttr_used = ttr_crit
|
|
alerts += 1
|
|
elif ttr_warn and ttr_warn <= ttr_threshold_hours * 2:
|
|
alert_level = "WARNING"
|
|
ttr_used = ttr_warn
|
|
alerts += 1
|
|
|
|
if alert_level and auto_create_sr and ttr_used:
|
|
sr_result = await create_preventive_sr(
|
|
db = db,
|
|
source = source,
|
|
metric_type = mt,
|
|
ttr_hours = ttr_used,
|
|
predicted_value = pred.get("predicted_value", 0),
|
|
threshold = (
|
|
PREDICTION_THRESHOLDS[mt]["critical"]
|
|
if alert_level == "CRITICAL"
|
|
else PREDICTION_THRESHOLDS[mt]["warning"]
|
|
),
|
|
severity = alert_level,
|
|
)
|
|
if sr_result and sr_result.get("created"):
|
|
srs_created += 1
|
|
pred["preventive_sr"] = sr_result
|
|
|
|
if alert_level:
|
|
pred["alert_level"] = alert_level
|
|
results.append(pred)
|
|
|
|
except Exception as e:
|
|
logger.debug("예측 배치 오류 (%s/%s): %s", source, mt, e)
|
|
|
|
return {
|
|
"analyzed": len(sources) * len(PREDICTION_THRESHOLDS),
|
|
"sources": len(sources),
|
|
"alerts": alerts,
|
|
"srs_created": srs_created,
|
|
"results": results,
|
|
"run_at": datetime.utcnow().isoformat(),
|
|
}
|