# guardia-itsm/routers/auto_remediation_runbook.py
# 2026-06-04 08:13:41 +09:00
#
# 425 lines
# 16 KiB
# Python

"""
Auto-remediation runbook API — runbook-driven automatic incident recovery engine.

Endpoints:
    GET  /api/auto-remediat/runbooks        — list runbooks
    POST /api/auto-remediat/runbooks        — create a runbook
    POST /api/auto-remediat/trigger         — run a remediation trigger
    GET  /api/auto-remediat/sessions        — list remediation sessions
    GET  /api/auto-remediat/sessions/{id}   — session detail
    GET  /api/auto-remediat/stats           — success-rate statistics
    POST /api/auto-remediat/escalate/{id}   — escalate a session

Five default runbooks are seeded:
    1. Service restart
    2. Disk cleanup
    3. Memory dump + restart
    4. DB connection reset
    5. nginx reload
"""
from __future__ import annotations
import json
import logging
from datetime import datetime
from typing import Any, List, Optional
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy import func, select, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from database import SessionLocal, get_db
from models import RemediationRunbook, RemediationSession, User
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Every route declared in this module is registered under /api/auto-remediat.
router = APIRouter(prefix="/api/auto-remediat", tags=["자동 복구 런북"])
# ── Default runbook seed data ──────────────────────────────────────────────────
def _encode_steps(*steps):
    """Serialize ``(name, cmd)`` pairs into the JSON step list kept on a runbook.

    Steps are numbered from 1 in the order given. Non-ASCII text is written
    verbatim (``ensure_ascii=False``) rather than as ``\\u`` escapes.
    """
    numbered = [
        {"order": position, "name": title, "cmd": command}
        for position, (title, command) in enumerate(steps, start=1)
    ]
    return json.dumps(numbered, ensure_ascii=False)


# Keyword arguments for the five built-in runbooks. ``{service_name}``,
# ``{port}`` and ``{app_name}`` are placeholders filled from trigger data;
# the bare ``{}`` in the gzip step belongs to ``find -exec`` and is not one.
_DEFAULT_RUNBOOKS = [
    dict(
        name="서비스 재시작",
        trigger_pattern="service_down,process_not_running,port_closed",
        steps=_encode_steps(
            ("상태 확인", "systemctl status {service_name}"),
            ("서비스 재시작", "systemctl restart {service_name}"),
            ("재시작 확인", "systemctl is-active {service_name}"),
            ("헬스체크", "curl -sf http://localhost:{port}/health || exit 1"),
        ),
        auto_execute=True,
    ),
    dict(
        name="디스크 정리",
        trigger_pattern="disk_usage_high,disk_full,filesystem_80",
        steps=_encode_steps(
            ("사용량 확인", "df -h /"),
            ("로그 압축", "find /var/log -name '*.log' -mtime +7 -exec gzip {} +"),
            ("오래된 로그 삭제", "find /var/log -name '*.gz' -mtime +30 -delete"),
            ("임시 파일 정리", "find /tmp -mtime +3 -delete 2>/dev/null; true"),
            ("사용량 재확인", "df -h /"),
        ),
        auto_execute=True,
    ),
    dict(
        name="메모리 덤프 + 재시작",
        trigger_pattern="memory_high,oom_kill,memory_usage_90",
        steps=_encode_steps(
            ("메모리 현황", "free -h && ps aux --sort=-%mem | head -10"),
            (
                "힙 덤프 수집",
                "jmap -dump:format=b,file=/tmp/heapdump_$(date +%Y%m%d%H%M%S).hprof $(pgrep -f {app_name}) 2>/dev/null || true",
            ),
            ("캐시 해제", "sync && echo 3 > /proc/sys/vm/drop_caches"),
            ("서비스 재시작", "systemctl restart {service_name}"),
            ("메모리 재확인", "free -h"),
        ),
        auto_execute=False,
    ),
    dict(
        name="DB 커넥션 리셋",
        trigger_pattern="db_connection_exhausted,too_many_connections,db_pool_full",
        steps=_encode_steps(
            ("커넥션 현황", "netstat -an | grep :5432 | wc -l"),
            (
                "유휴 커넥션 종료",
                "psql -U postgres -c \"SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state='idle' AND query_start < now() - interval '10 minutes';\" 2>/dev/null || true",
            ),
            ("커넥션 재확인", "netstat -an | grep :5432 | wc -l"),
            ("앱 재시작", "systemctl restart {service_name}"),
        ),
        auto_execute=False,
    ),
    dict(
        name="nginx 리로드",
        trigger_pattern="nginx_config_changed,nginx_error,upstream_changed",
        steps=_encode_steps(
            ("설정 검증", "nginx -t"),
            ("설정 리로드", "nginx -s reload"),
            ("프로세스 확인", "pgrep nginx && echo 'nginx running'"),
        ),
        auto_execute=True,
    ),
]
# ── Seed initialisation ────────────────────────────────────────────────────────
async def seed_runbooks() -> None:
    """Insert the default runbooks when the runbook table is empty.

    Opens its own session through ``SessionLocal``. If at least one runbook
    row already exists the function returns without writing anything;
    otherwise every entry of ``_DEFAULT_RUNBOOKS`` is inserted in a single
    commit. The original docstring describes this as an application-startup
    hook.
    """
    async with SessionLocal() as db:
        row_count = await db.scalar(
            select(func.count()).select_from(RemediationRunbook)
        )
        if (row_count or 0) > 0:
            # Table already populated: never re-seed or overwrite.
            return
        db.add_all(
            [RemediationRunbook(**fields) for fields in _DEFAULT_RUNBOOKS]
        )
        await db.commit()
        logger.info("[auto-remediat] 기본 런북 %d개 시드 완료", len(_DEFAULT_RUNBOOKS))
# ── Pydantic schemas ───────────────────────────────────────────────────────────
class RunbookCreate(BaseModel):
    """Request body accepted by ``POST /runbooks``."""

    # Display name of the runbook (the only required field).
    name: str
    # The seeded runbooks store comma-separated event keys here; this schema
    # itself accepts any string.
    trigger_pattern: Optional[str] = None
    steps: Optional[str] = None  # JSON string
    # Defaults to False when the client omits it.
    auto_execute: bool = False
class RunbookOut(BaseModel):
    """Read model mirroring the columns of a stored runbook.

    ``from_attributes`` lets the model be built from an object's attributes
    (such as an ORM row) rather than only from a mapping. The setting is
    declared through ``model_config`` because the nested ``class Config``
    form is deprecated in Pydantic v2 and emits a deprecation warning.
    """

    # Plain dict form of the v2 config; equivalent to ConfigDict(...) and
    # needs no additional import.
    model_config = {"from_attributes": True}

    id: int
    name: str
    trigger_pattern: Optional[str]
    steps: Optional[str]  # JSON string, passed through undecoded
    auto_execute: bool
    created_at: datetime
class TriggerRequest(BaseModel):
    """Request body accepted by ``POST /trigger``."""

    # Primary key of the runbook to run.
    runbook_id: int
    trigger_data: Optional[dict] = None  # trigger event context
class SessionOut(BaseModel):
    """Read model mirroring the columns of a stored remediation session.

    ``from_attributes`` lets the model be built from an object's attributes
    (such as an ORM row) rather than only from a mapping. The setting is
    declared through ``model_config`` because the nested ``class Config``
    form is deprecated in Pydantic v2 and emits a deprecation warning.
    """

    # Plain dict form of the v2 config; equivalent to ConfigDict(...) and
    # needs no additional import.
    model_config = {"from_attributes": True}

    id: int
    runbook_id: Optional[int]
    trigger_data: Optional[str]  # JSON string, passed through undecoded
    step_results: Optional[str]  # JSON string, passed through undecoded
    status: str
    success: Optional[bool]
    created_at: datetime
class EscalateRequest(BaseModel):
    """Request body accepted by ``POST /escalate/{session_id}``.

    Both fields are optional; the endpoint substitutes default text when
    they are missing or empty.
    """

    # Free-text explanation recorded with the escalation entry.
    reason: Optional[str] = None
    # Who the session is escalated to.
    escalate_to: Optional[str] = None
# ── Helper: simulated execution ────────────────────────────────────────────────
def _simulate_steps(steps_json: Optional[str], trigger_data: Optional[dict]) -> tuple[list, bool]:
    """Simulate a runbook's steps without executing any command.

    Each step's ``cmd`` has ``{key}`` placeholders replaced with the matching
    ``trigger_data`` values and is reported as successful. Nothing is run
    here; the original note states that production is expected to run the
    commands through ``core.ssh_exec`` instead.

    Args:
        steps_json: JSON text expected to hold a list of step objects with
            ``order``, ``name`` and ``cmd`` keys. ``None`` or ``""`` means
            "no steps".
        trigger_data: Placeholder values keyed by placeholder name, or None.

    Returns:
        ``(results, ok)``. ``results`` has one dict per step. ``ok`` is False
        when ``steps_json`` is not valid JSON, is not a JSON array, or
        contains an entry that is not an object; such problems are reported
        as ``error`` entries instead of raising.
    """
    if not steps_json:
        return [], True
    try:
        steps = json.loads(steps_json)
    except (json.JSONDecodeError, TypeError):
        return [{"error": "steps JSON 파싱 실패"}], False
    # Syntactically valid JSON can still be the wrong shape (object, number,
    # string). Iterating over it would raise or silently walk characters.
    if not isinstance(steps, list):
        return [{"error": "steps 형식 오류: JSON 배열이어야 합니다"}], False

    # NOTE(review): values are substituted verbatim. Before these commands
    # are ever executed for real they must be shell-quoted (shlex.quote),
    # because trigger_data comes from the request body.
    substitutions = {
        f"{{{key}}}": str(value) for key, value in (trigger_data or {}).items()
    }

    results: list = []
    all_ok = True
    for position, step in enumerate(steps, start=1):
        if not isinstance(step, dict):
            # Record the malformed entry and keep going so the caller still
            # sees the outcome of the remaining steps.
            results.append({
                "order": position,
                "status": "failed",
                "error": "step 형식 오류: JSON 객체여야 합니다",
            })
            all_ok = False
            continue

        # A null or non-string cmd must not crash the substitution below.
        raw_cmd = step.get("cmd")
        cmd = "" if raw_cmd is None else str(raw_cmd)
        for placeholder, replacement in substitutions.items():
            cmd = cmd.replace(placeholder, replacement)

        results.append({
            "order": step.get("order", 0),
            "name": step.get("name", ""),
            "cmd": cmd,
            "status": "success",
            "output": f"[시뮬레이션] {step.get('name', '')} 완료",
        })
    return results, all_ok
# ── Endpoints ──────────────────────────────────────────────────────────────────
@router.get("/runbooks", summary="Runbook 목록 조회")
async def list_runbooks(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> list[dict]:
    """Return every runbook, ordered by id.

    ``steps`` is returned as the stored JSON string (not decoded) and
    ``created_at`` as an ISO-8601 string, or None when the column is empty.
    ``current_user`` is not read; the dependency only enforces authentication.
    """
    query = select(RemediationRunbook).order_by(RemediationRunbook.id)
    result = await db.execute(query)

    listing: list[dict] = []
    for runbook in result.scalars().all():
        created = runbook.created_at
        listing.append({
            "id": runbook.id,
            "name": runbook.name,
            "trigger_pattern": runbook.trigger_pattern,
            "steps": runbook.steps,
            "auto_execute": runbook.auto_execute,
            "created_at": created.isoformat() if created else None,
        })
    return listing
@router.post("/runbooks", status_code=201, summary="Runbook 생성")
async def create_runbook(
    payload: RunbookCreate,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Persist a new runbook and return its id, name and auto_execute flag.

    ``payload.steps`` is stored exactly as received; it is not checked for
    being valid JSON here.
    """
    column_values = {
        "name": payload.name,
        "trigger_pattern": payload.trigger_pattern,
        "steps": payload.steps,
        "auto_execute": payload.auto_execute,
    }
    runbook = RemediationRunbook(**column_values)
    db.add(runbook)
    await db.commit()
    # Reload so database-generated values (such as the id) are populated.
    await db.refresh(runbook)

    logger.info(
        "[auto-remediat] 런북 생성: id=%d name=%s by user=%s",
        runbook.id,
        runbook.name,
        current_user.username,
    )
    summary = {
        "id": runbook.id,
        "name": runbook.name,
        "auto_execute": runbook.auto_execute,
    }
    return summary
@router.post("/trigger", status_code=201, summary="복구 트리거 실행")
async def trigger_remediation(
    payload: TriggerRequest,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Run a runbook in simulation and record the outcome as a session.

    Args:
        payload: Runbook id plus optional placeholder values.
        db: Request-scoped async database session.
        current_user: Authenticated caller; the username is logged.

    Returns:
        The new session's id, status and success flag, the runbook's id and
        name, and the per-step results as a decoded list.

    Raises:
        HTTPException: 404 when no runbook has ``payload.runbook_id``.
    """
    runbook = await db.get(RemediationRunbook, payload.runbook_id)
    if not runbook:
        raise HTTPException(status_code=404, detail="런북을 찾을 수 없습니다")

    # Snapshot everything needed from the runbook BEFORE committing. With the
    # default expire_on_commit=True, commit() expires loaded attributes, and
    # reading them afterwards would need a lazy load that AsyncSession cannot
    # perform implicitly (MissingGreenlet). Only the new session row is
    # refreshed below, so the runbook would otherwise stay expired.
    runbook_id = runbook.id
    runbook_name = runbook.name
    step_results, success = _simulate_steps(runbook.steps, payload.trigger_data)

    # An empty or missing trigger_data is stored as NULL, not as "{}".
    stored_trigger = (
        json.dumps(payload.trigger_data, ensure_ascii=False)
        if payload.trigger_data
        else None
    )
    record = RemediationSession(
        runbook_id=runbook_id,
        trigger_data=stored_trigger,
        step_results=json.dumps(step_results, ensure_ascii=False),
        status="completed" if success else "failed",
        success=success,
    )
    db.add(record)
    await db.commit()
    await db.refresh(record)

    logger.info(
        "[auto-remediat] 복구 트리거: runbook_id=%d session_id=%d success=%s by=%s",
        runbook_id, record.id, success, current_user.username,
    )
    return {
        "session_id": record.id,
        "runbook_id": runbook_id,
        "runbook_name": runbook_name,
        "status": record.status,
        "success": record.success,
        "step_results": step_results,
    }
@router.get("/sessions", summary="복구 세션 목록")
async def list_sessions(
    limit: int = 50,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> list[dict]:
    """Return the most recent remediation sessions, newest first.

    Args:
        limit: Maximum number of sessions to return (default 50). Must not
            be negative.
        db: Request-scoped async database session.
        current_user: Not read; the dependency only enforces authentication.

    Returns:
        One summary dict per session (id, runbook_id, status, success,
        created_at as ISO-8601 or None). Step results are not included.

    Raises:
        HTTPException: 422 when ``limit`` is negative.
    """
    # A negative LIMIT is rejected by some databases, which would surface
    # as an unhandled 500; fail fast with a client error instead.
    # NOTE(review): there is still no upper bound on limit — consider
    # capping it once an acceptable page size is agreed.
    if limit < 0:
        raise HTTPException(status_code=422, detail="limit 값은 0 이상이어야 합니다")

    rows = await db.execute(
        select(RemediationSession)
        .order_by(desc(RemediationSession.created_at))
        .limit(limit)
    )
    return [
        {
            "id": s.id,
            "runbook_id": s.runbook_id,
            "status": s.status,
            "success": s.success,
            "created_at": s.created_at.isoformat() if s.created_at else None,
        }
        for s in rows.scalars().all()
    ]
@router.get("/sessions/{session_id}", summary="세션 상세 조회")
async def get_session(
    session_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Return one remediation session with its JSON columns decoded.

    ``trigger_data`` and ``step_results`` are decoded from their stored JSON
    text; a value that fails to decode is returned as the raw string, and an
    empty or missing value as None. ``runbook_name`` is None when the session
    has no runbook id or the runbook no longer exists.

    Raises:
        HTTPException: 404 when no session has ``session_id``.
    """

    def _decode(raw: Optional[str]) -> Any:
        # Empty/NULL -> None; valid JSON -> decoded value; otherwise raw text.
        if not raw:
            return None
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return raw

    record = await db.get(RemediationSession, session_id)
    if not record:
        raise HTTPException(status_code=404, detail="세션을 찾을 수 없습니다")

    runbook = (
        await db.get(RemediationRunbook, record.runbook_id)
        if record.runbook_id
        else None
    )
    runbook_name = runbook.name if runbook else None

    created = record.created_at
    return {
        "id": record.id,
        "runbook_id": record.runbook_id,
        "runbook_name": runbook_name,
        "trigger_data": _decode(record.trigger_data),
        "step_results": _decode(record.step_results),
        "status": record.status,
        "success": record.success,
        "created_at": created.isoformat() if created else None,
    }
@router.get("/stats", summary="복구 성공률 통계")
async def remediation_stats(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Return session counters and the overall success rate.

    ``success_rate`` is ``success_count / total_sessions`` as a percentage
    rounded to one decimal, or 0.0 when there are no sessions. The
    denominator is every session regardless of status.
    """

    async def _count(model, *criteria) -> int:
        # COUNT(*) over ``model`` with optional WHERE criteria; NULL -> 0.
        stmt = select(func.count()).select_from(model)
        if criteria:
            stmt = stmt.where(*criteria)
        return await db.scalar(stmt) or 0

    total = await _count(RemediationSession)
    succeeded = await _count(
        RemediationSession, RemediationSession.success == True  # noqa: E712
    )
    failed = await _count(
        RemediationSession, RemediationSession.success == False  # noqa: E712
    )
    escalated = await _count(
        RemediationSession, RemediationSession.status == "escalated"
    )
    running = await _count(
        RemediationSession, RemediationSession.status == "running"
    )
    runbooks = await _count(RemediationRunbook)

    if total > 0:
        rate = round(succeeded / total * 100, 1)
    else:
        rate = 0.0

    return {
        "total_sessions": total,
        "success_count": succeeded,
        "failed_count": failed,
        "escalated_count": escalated,
        "running_count": running,
        "success_rate": rate,
        "runbook_count": runbooks,
    }
@router.post("/escalate/{session_id}", summary="세션 에스컬레이션")
async def escalate_session(
    session_id: int,
    payload: EscalateRequest,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Mark a session as escalated and append an escalation entry to its results.

    The session's status becomes ``"escalated"`` and ``success`` is forced to
    False. An entry with the reason, target, acting user and a
    timezone-aware UTC timestamp is appended to the stored step results.

    Args:
        session_id: Primary key of the session to escalate.
        payload: Optional reason and escalation target.
        db: Request-scoped async database session.
        current_user: Authenticated caller, recorded as ``escalated_by``.

    Returns:
        The session id, its new status, the escalation target and a
        confirmation message.

    Raises:
        HTTPException: 404 when the session does not exist, 409 when it is
            already escalated.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    record = await db.get(RemediationSession, session_id)
    if not record:
        raise HTTPException(status_code=404, detail="세션을 찾을 수 없습니다")
    if record.status == "escalated":
        raise HTTPException(status_code=409, detail="이미 에스컬레이션된 세션입니다")

    # Escalation: flip the status and append a note to the stored results.
    record.status = "escalated"
    record.success = False

    history: list = []
    if record.step_results:
        try:
            decoded = json.loads(record.step_results)
        except json.JSONDecodeError:
            # Unreadable history is replaced (as before), but no longer
            # silently: leave a trace for whoever investigates.
            logger.warning(
                "[auto-remediat] session_id=%d step_results is not valid JSON; "
                "starting a fresh result list",
                session_id,
            )
            decoded = []
        # A stored non-list value (object, string, number) has no .append;
        # wrap it so it is preserved instead of raising.
        history = decoded if isinstance(decoded, list) else [decoded]

    target = payload.escalate_to or "온콜 담당자"
    history.append({
        "type": "escalation",
        "reason": payload.reason or "수동 에스컬레이션",
        "escalate_to": target,
        "escalated_by": current_user.username,
        # datetime.utcnow() is deprecated and naive; an aware timestamp
        # carries an explicit +00:00 offset in its ISO form.
        "escalated_at": datetime.now(timezone.utc).isoformat(),
    })
    record.step_results = json.dumps(history, ensure_ascii=False)
    await db.commit()
    await db.refresh(record)

    logger.info(
        "[auto-remediat] 에스컬레이션: session_id=%d by=%s reason=%s",
        session_id, current_user.username, payload.reason,
    )
    return {
        "session_id": record.id,
        "status": record.status,
        "escalate_to": target,
        "message": "에스컬레이션 완료. 온콜 담당자에게 알림이 전송되었습니다.",
    }