425 lines
16 KiB
Python
425 lines
16 KiB
Python
"""
|
|
Auto-remediation runbook API — runbook-driven automatic incident recovery engine.

Endpoints:
    GET  /api/auto-remediat/runbooks       — list runbooks
    POST /api/auto-remediat/runbooks       — create a runbook
    POST /api/auto-remediat/trigger        — trigger a remediation run
    GET  /api/auto-remediat/sessions       — list remediation sessions
    GET  /api/auto-remediat/sessions/{id}  — session detail
    GET  /api/auto-remediat/stats          — success-rate statistics
    POST /api/auto-remediat/escalate/{id}  — escalate a session

Five default runbooks are seeded:
    1. Service restart
    2. Disk cleanup
    3. Memory dump + restart
    4. DB connection reset
    5. nginx reload
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from datetime import datetime
|
|
from typing import Any, List, Optional
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import func, select, desc
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from core.auth import get_current_user
|
|
from database import SessionLocal, get_db
|
|
from models import RemediationRunbook, RemediationSession, User
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Router that every endpoint in this module registers on; all routes are
# served under the "/api/auto-remediat" prefix.
router = APIRouter(prefix="/api/auto-remediat", tags=["자동 복구 런북"])
|
|
|
|
|
|
# ── 기본 런북 시드 데이터 ────────────────────────────────────────────────────────
|
|
|
|
def _encode_steps(*commands):
    """Serialize ``(name, cmd)`` pairs into a runbook's JSON step list.

    Steps are numbered from 1 in the order given. Non-ASCII text is written
    as-is (``ensure_ascii=False``).
    """
    numbered = [
        {"order": position, "name": title, "cmd": command}
        for position, (title, command) in enumerate(commands, start=1)
    ]
    return json.dumps(numbered, ensure_ascii=False)


# Seed definitions for the built-in runbooks. Each dict is expanded into
# keyword arguments for RemediationRunbook by seed_runbooks(). "steps" holds
# a JSON string; "{name}" tokens inside a cmd are placeholders that
# _simulate_steps() fills in from the trigger data.
_DEFAULT_RUNBOOKS = [
    {
        "name": "서비스 재시작",
        "trigger_pattern": "service_down,process_not_running,port_closed",
        "steps": _encode_steps(
            ("상태 확인", "systemctl status {service_name}"),
            ("서비스 재시작", "systemctl restart {service_name}"),
            ("재시작 확인", "systemctl is-active {service_name}"),
            ("헬스체크", "curl -sf http://localhost:{port}/health || exit 1"),
        ),
        "auto_execute": True,
    },
    {
        "name": "디스크 정리",
        "trigger_pattern": "disk_usage_high,disk_full,filesystem_80",
        "steps": _encode_steps(
            ("사용량 확인", "df -h /"),
            ("로그 압축", "find /var/log -name '*.log' -mtime +7 -exec gzip {} +"),
            ("오래된 로그 삭제", "find /var/log -name '*.gz' -mtime +30 -delete"),
            ("임시 파일 정리", "find /tmp -mtime +3 -delete 2>/dev/null; true"),
            ("사용량 재확인", "df -h /"),
        ),
        "auto_execute": True,
    },
    {
        "name": "메모리 덤프 + 재시작",
        "trigger_pattern": "memory_high,oom_kill,memory_usage_90",
        "steps": _encode_steps(
            ("메모리 현황", "free -h && ps aux --sort=-%mem | head -10"),
            (
                "힙 덤프 수집",
                "jmap -dump:format=b,file=/tmp/heapdump_$(date +%Y%m%d%H%M%S).hprof $(pgrep -f {app_name}) 2>/dev/null || true",
            ),
            ("캐시 해제", "sync && echo 3 > /proc/sys/vm/drop_caches"),
            ("서비스 재시작", "systemctl restart {service_name}"),
            ("메모리 재확인", "free -h"),
        ),
        "auto_execute": False,
    },
    {
        "name": "DB 커넥션 리셋",
        "trigger_pattern": "db_connection_exhausted,too_many_connections,db_pool_full",
        "steps": _encode_steps(
            ("커넥션 현황", "netstat -an | grep :5432 | wc -l"),
            (
                "유휴 커넥션 종료",
                "psql -U postgres -c \"SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state='idle' AND query_start < now() - interval '10 minutes';\" 2>/dev/null || true",
            ),
            ("커넥션 재확인", "netstat -an | grep :5432 | wc -l"),
            ("앱 재시작", "systemctl restart {service_name}"),
        ),
        "auto_execute": False,
    },
    {
        "name": "nginx 리로드",
        "trigger_pattern": "nginx_config_changed,nginx_error,upstream_changed",
        "steps": _encode_steps(
            ("설정 검증", "nginx -t"),
            ("설정 리로드", "nginx -s reload"),
            ("프로세스 확인", "pgrep nginx && echo 'nginx running'"),
        ),
        "auto_execute": True,
    },
]
|
|
|
|
|
|
# ── 시드 초기화 ─────────────────────────────────────────────────────────────────
|
|
|
|
async def seed_runbooks() -> None:
    """Insert the default runbooks when the runbook table is empty.

    Opens its own session from ``SessionLocal``. If the table already holds
    at least one row, nothing is written. Otherwise every entry of
    ``_DEFAULT_RUNBOOKS`` is added and committed in a single transaction.
    Per the original docstring this is meant to run at application startup.
    """
    async with SessionLocal() as db:
        count_query = select(func.count()).select_from(RemediationRunbook)
        runbook_total = await db.scalar(count_query)
        # A missing (None) count is treated like an empty table.
        if (runbook_total or 0) > 0:
            return

        for definition in _DEFAULT_RUNBOOKS:
            db.add(RemediationRunbook(**definition))
        await db.commit()
        logger.info("[auto-remediat] 기본 런북 %d개 시드 완료", len(_DEFAULT_RUNBOOKS))
|
|
|
|
|
|
# ── Pydantic 스키마 ──────────────────────────────────────────────────────────────
|
|
|
|
class RunbookCreate(BaseModel):
    """Request body accepted by the runbook-creation endpoint."""

    # Display name of the runbook (required).
    name: str
    # Trigger pattern text; the seeded runbooks use comma-separated alert keys.
    trigger_pattern: Optional[str] = None
    # JSON string; the seeded runbooks store a list of
    # {"order", "name", "cmd"} objects here.
    steps: Optional[str] = None
    # Auto-execute flag stored on the runbook; defaults to off.
    auto_execute: bool = False
|
|
|
|
|
|
class RunbookOut(BaseModel):
    """Output schema describing a stored runbook.

    NOTE(review): no endpoint visible in this module references this schema;
    the handlers build plain dicts instead — confirm whether it is used
    elsewhere.
    """

    id: int
    name: str
    trigger_pattern: Optional[str]
    # Raw JSON string, not a decoded list.
    steps: Optional[str]
    auto_execute: bool
    created_at: datetime

    class Config:
        # Lets the model be populated from object attributes (e.g. ORM rows).
        from_attributes = True
|
|
|
|
|
|
class TriggerRequest(BaseModel):
    """Request body accepted by the remediation trigger endpoint."""

    # Primary key of the runbook to run.
    runbook_id: int
    # Context of the triggering event; its keys are used to fill "{key}"
    # placeholders in step commands.
    trigger_data: Optional[dict] = None
|
|
|
|
|
|
class SessionOut(BaseModel):
    """Output schema describing a stored remediation session.

    NOTE(review): no endpoint visible in this module references this schema;
    the handlers build plain dicts instead — confirm whether it is used
    elsewhere.
    """

    id: int
    runbook_id: Optional[int]
    # Raw JSON string of the trigger context, as stored.
    trigger_data: Optional[str]
    # Raw JSON string of the per-step results, as stored.
    step_results: Optional[str]
    # Values written in this module: "completed", "failed", "escalated".
    status: str
    success: Optional[bool]
    created_at: datetime

    class Config:
        # Lets the model be populated from object attributes (e.g. ORM rows).
        from_attributes = True
|
|
|
|
|
|
class EscalateRequest(BaseModel):
    """Request body accepted by the session escalation endpoint."""

    # Free-text reason; the handler substitutes a default when omitted.
    reason: Optional[str] = None
    # Who to escalate to; the handler substitutes a default when omitted.
    escalate_to: Optional[str] = None
|
|
escalate_to: Optional[str] = None
|
|
|
|
|
|
# ── 헬퍼: 시뮬레이션 실행 ───────────────────────────────────────────────────────
|
|
|
|
def _simulate_steps(steps_json: Optional[str], trigger_data: Optional[dict]) -> tuple[list, bool]:
    """Simulate a runbook's steps without executing any command.

    Nothing is run: each step is reported as successful after its ``{key}``
    placeholders have been filled from ``trigger_data``. The original note
    states that production is meant to execute the commands through
    ``core.ssh_exec``; that path is not part of this function.

    Args:
        steps_json: JSON string holding a list of step objects with the keys
            ``order``, ``name`` and ``cmd``. ``None`` or an empty string means
            "no steps".
        trigger_data: Mapping of placeholder name to value. Values are
            converted with ``str()``. Placeholders without a matching key are
            left untouched.

    Returns:
        ``(results, ok)``. With no steps: ``([], True)``. When ``steps_json``
        is not valid JSON, or is valid JSON that is not a list of objects:
        a single ``{"error": ...}`` entry and ``False``. Otherwise one result
        dict per step and ``True`` (a simulated step never fails).
    """
    if not steps_json:
        return [], True

    try:
        steps = json.loads(steps_json)
    except json.JSONDecodeError:
        return [{"error": "steps JSON 파싱 실패"}], False

    # Valid JSON can still have the wrong shape (a number, an object, a list
    # of scalars). Report that as a failed run instead of crashing on
    # iteration or on ``.get``.
    if not isinstance(steps, list) or not all(isinstance(step, dict) for step in steps):
        return [{"error": "steps 형식 오류: 단계 객체의 배열이어야 합니다"}], False

    # Pre-build the "{key}" -> value table once instead of once per step.
    substitutions = {
        f"{{{key}}}": str(value) for key, value in (trigger_data or {}).items()
    }

    results = []
    for step in steps:
        raw_cmd = step.get("cmd", "")
        # Tolerate a null or non-string cmd so placeholder substitution
        # cannot raise.
        cmd = "" if raw_cmd is None else str(raw_cmd)
        for token, replacement in substitutions.items():
            cmd = cmd.replace(token, replacement)

        results.append(
            {
                "order": step.get("order", 0),
                "name": step.get("name", ""),
                "cmd": cmd,
                "status": "success",
                "output": f"[시뮬레이션] {step.get('name', '')} 완료",
            }
        )

    return results, True
|
|
|
|
|
|
# ── 엔드포인트 ───────────────────────────────────────────────────────────────────
|
|
|
|
@router.get("/runbooks", summary="Runbook 목록 조회")
async def list_runbooks(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> list[dict]:
    """Return every stored runbook, ordered by id, as plain dicts.

    ``steps`` is returned as the raw stored string. ``created_at`` is an
    ISO-8601 string, or ``None`` when the column is empty. ``current_user``
    is not read; the dependency only has to resolve.
    """

    def to_payload(runbook) -> dict:
        created = runbook.created_at
        return {
            "id": runbook.id,
            "name": runbook.name,
            "trigger_pattern": runbook.trigger_pattern,
            "steps": runbook.steps,
            "auto_execute": runbook.auto_execute,
            "created_at": created.isoformat() if created else None,
        }

    query = select(RemediationRunbook).order_by(RemediationRunbook.id)
    result = await db.execute(query)
    return [to_payload(runbook) for runbook in result.scalars().all()]
|
|
|
|
|
|
def _check_steps_payload(steps: Optional[str]) -> None:
    """Raise HTTP 422 unless ``steps`` is empty or a JSON array of objects."""
    if not steps:
        return
    try:
        parsed = json.loads(steps)
    except json.JSONDecodeError as exc:
        raise HTTPException(
            status_code=422, detail="steps는 유효한 JSON 문자열이어야 합니다"
        ) from exc
    if not isinstance(parsed, list) or not all(isinstance(item, dict) for item in parsed):
        raise HTTPException(
            status_code=422, detail="steps는 단계 객체의 JSON 배열이어야 합니다"
        )


@router.post("/runbooks", status_code=201, summary="Runbook 생성")
async def create_runbook(
    payload: RunbookCreate,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Create a runbook from the request body and return a short summary.

    ``steps`` is validated before anything is written, because a runbook
    whose steps are not a JSON array of objects cannot be run by the trigger
    endpoint.

    Returns:
        ``{"id", "name", "auto_execute"}`` of the new row.

    Raises:
        HTTPException: 422 when ``steps`` is present but is not valid JSON or
            is not a JSON array of objects.
    """
    _check_steps_payload(payload.steps)

    rb = RemediationRunbook(
        name=payload.name,
        trigger_pattern=payload.trigger_pattern,
        steps=payload.steps,
        auto_execute=payload.auto_execute,
    )
    db.add(rb)
    await db.commit()
    # Reload so the attributes read below (including the generated id) are
    # populated after the commit.
    await db.refresh(rb)
    logger.info("[auto-remediat] 런북 생성: id=%d name=%s by user=%s", rb.id, rb.name, current_user.username)
    return {"id": rb.id, "name": rb.name, "auto_execute": rb.auto_execute}
|
|
|
|
|
|
@router.post("/trigger", status_code=201, summary="복구 트리거 실행")
async def trigger_remediation(
    payload: TriggerRequest,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Run (simulate) a runbook and record the outcome as a session.

    The steps are evaluated by ``_simulate_steps``. A ``RemediationSession``
    row is stored with status ``"completed"`` or ``"failed"``, together with
    the JSON-encoded trigger data and step results.

    Returns:
        Dict with ``session_id``, ``runbook_id``, ``runbook_name``,
        ``status``, ``success`` and the decoded ``step_results`` list.

    Raises:
        HTTPException: 404 when the runbook id does not exist.
    """
    rb = await db.get(RemediationRunbook, payload.runbook_id)
    if not rb:
        raise HTTPException(status_code=404, detail="런북을 찾을 수 없습니다")

    # Snapshot what is needed from the runbook now. After commit() its
    # attributes may be expired, and reading them would require implicit IO,
    # which an AsyncSession cannot perform from plain attribute access.
    runbook_id = rb.id
    runbook_name = rb.name
    runbook_steps = rb.steps

    step_results, success = _simulate_steps(runbook_steps, payload.trigger_data)

    session = RemediationSession(
        runbook_id=runbook_id,
        # Empty or missing trigger data is stored as NULL rather than "{}".
        trigger_data=json.dumps(payload.trigger_data, ensure_ascii=False) if payload.trigger_data else None,
        step_results=json.dumps(step_results, ensure_ascii=False),
        status="completed" if success else "failed",
        success=success,
    )
    db.add(session)
    await db.commit()
    # Reload the session row so its generated id and stored values are readable.
    await db.refresh(session)

    logger.info(
        "[auto-remediat] 복구 트리거: runbook_id=%d session_id=%d success=%s by=%s",
        runbook_id, session.id, success, current_user.username,
    )
    return {
        "session_id": session.id,
        "runbook_id": runbook_id,
        "runbook_name": runbook_name,
        "status": session.status,
        "success": session.success,
        "step_results": step_results,
    }
|
|
|
|
|
|
@router.get("/sessions", summary="복구 세션 목록")
async def list_sessions(
    limit: int = 50,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> list[dict]:
    """Return the most recent remediation sessions, newest first.

    Args:
        limit: Maximum number of sessions to return (default 50). Must not be
            negative.

    Returns:
        One dict per session with ``id``, ``runbook_id``, ``status``,
        ``success`` and ``created_at`` (ISO-8601 string, or ``None`` when the
        column is empty). Step results are not included.

    Raises:
        HTTPException: 422 when ``limit`` is negative.
    """
    # A negative LIMIT is backend-dependent (an error on some databases,
    # "no limit" on others), so reject it up front.
    if limit < 0:
        raise HTTPException(status_code=422, detail="limit은 0 이상이어야 합니다")

    rows = await db.execute(
        select(RemediationSession)
        .order_by(desc(RemediationSession.created_at))
        .limit(limit)
    )
    sessions = rows.scalars().all()
    return [
        {
            "id": s.id,
            "runbook_id": s.runbook_id,
            "status": s.status,
            "success": s.success,
            "created_at": s.created_at.isoformat() if s.created_at else None,
        }
        for s in sessions
    ]
|
|
|
|
|
|
def _json_or_raw(stored: Any) -> Any:
    """Decode a stored JSON string; return the text itself if decoding fails.

    Empty or missing values yield ``None``.
    """
    if not stored:
        return None
    try:
        return json.loads(stored)
    except json.JSONDecodeError:
        return stored


@router.get("/sessions/{session_id}", summary="세션 상세 조회")
async def get_session(
    session_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Return one remediation session with decoded payloads.

    ``trigger_data`` and ``step_results`` are JSON-decoded when possible and
    returned as the raw stored string otherwise. ``runbook_name`` is looked
    up from the referenced runbook and is ``None`` when the session has no
    runbook id or the runbook no longer exists.

    Raises:
        HTTPException: 404 when the session id does not exist.
    """
    record = await db.get(RemediationSession, session_id)
    if not record:
        raise HTTPException(status_code=404, detail="세션을 찾을 수 없습니다")

    runbook = None
    if record.runbook_id:
        runbook = await db.get(RemediationRunbook, record.runbook_id)
    runbook_name = runbook.name if runbook else None

    decoded_steps = _json_or_raw(record.step_results)
    decoded_trigger = _json_or_raw(record.trigger_data)
    created = record.created_at

    return {
        "id": record.id,
        "runbook_id": record.runbook_id,
        "runbook_name": runbook_name,
        "trigger_data": decoded_trigger,
        "step_results": decoded_steps,
        "status": record.status,
        "success": record.success,
        "created_at": created.isoformat() if created else None,
    }
|
|
|
|
|
|
@router.get("/stats", summary="복구 성공률 통계")
async def remediation_stats(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Return session counters and the overall success rate.

    Six COUNT queries are issued, one per counter. ``success_rate`` is the
    percentage of sessions with ``success`` true, rounded to one decimal, and
    ``0.0`` when there are no sessions. Sessions escalated through this
    module have ``success`` set to false, so they are counted in
    ``failed_count`` as well as ``escalated_count``.
    """

    async def tally(model, *conditions) -> int:
        """Count rows of ``model`` matching ``conditions``; a missing result is 0."""
        query = select(func.count()).select_from(model)
        if conditions:
            query = query.where(*conditions)
        return await db.scalar(query) or 0

    total = await tally(RemediationSession)
    succeeded = await tally(RemediationSession, RemediationSession.success == True)  # noqa: E712
    failed = await tally(RemediationSession, RemediationSession.success == False)  # noqa: E712
    escalated = await tally(RemediationSession, RemediationSession.status == "escalated")
    running = await tally(RemediationSession, RemediationSession.status == "running")
    runbooks = await tally(RemediationRunbook)

    if total > 0:
        success_rate = round(succeeded / total * 100, 1)
    else:
        success_rate = 0.0

    return {
        "total_sessions": total,
        "success_count": succeeded,
        "failed_count": failed,
        "escalated_count": escalated,
        "running_count": running,
        "success_rate": success_rate,
        "runbook_count": runbooks,
    }
|
|
|
|
|
|
def _load_step_history(raw: Optional[str]) -> list:
    """Decode stored step results into a list that is safe to append to.

    Empty input yields ``[]``. A decoded value that is not a list is wrapped
    in one. Text that is not valid JSON is kept under a marker entry instead
    of being dropped.
    """
    if not raw:
        return []
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        return [{"type": "unparsed_results", "raw": raw}]
    return decoded if isinstance(decoded, list) else [decoded]


@router.post("/escalate/{session_id}", summary="세션 에스컬레이션")
async def escalate_session(
    session_id: int,
    payload: EscalateRequest,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
) -> dict:
    """Mark a session as escalated and append an escalation note to its results.

    Sets ``status`` to ``"escalated"`` and ``success`` to ``False``, then
    appends an entry recording the reason, the target, the acting user and a
    naive UTC timestamp to the stored step results. This function only
    updates the database row; no notification is sent from here.

    Returns:
        Dict with ``session_id``, ``status``, ``escalate_to`` and a fixed
        confirmation ``message``.

    Raises:
        HTTPException: 404 when the session does not exist; 409 when it is
            already escalated.
    """
    # Function-scope import keeps this block self-contained.
    from datetime import timezone

    session = await db.get(RemediationSession, session_id)
    if not session:
        raise HTTPException(status_code=404, detail="세션을 찾을 수 없습니다")

    if session.status == "escalated":
        raise HTTPException(status_code=409, detail="이미 에스컬레이션된 세션입니다")

    escalate_to = payload.escalate_to or "온콜 담당자"

    history = _load_step_history(session.step_results)
    history.append({
        "type": "escalation",
        "reason": payload.reason or "수동 에스컬레이션",
        "escalate_to": escalate_to,
        "escalated_by": current_user.username,
        # Same naive-UTC ISO format that datetime.utcnow() produced, without
        # calling the deprecated API.
        "escalated_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
    })

    session.status = "escalated"
    session.success = False
    session.step_results = json.dumps(history, ensure_ascii=False)

    await db.commit()
    # Reload so the attributes read below are populated after the commit.
    await db.refresh(session)

    logger.info(
        "[auto-remediat] 에스컬레이션: session_id=%d by=%s reason=%s",
        session_id, current_user.username, payload.reason,
    )
    return {
        "session_id": session.id,
        "status": session.status,
        "escalate_to": escalate_to,
        "message": "에스컬레이션 완료. 온콜 담당자에게 알림이 전송되었습니다.",
    }
|