guardia-itsm/routers/self_healer.py

126 lines
4.2 KiB
Python

"""자가 수복 — 장애 감지 후 자동 서비스 재시작·복구."""
from __future__ import annotations
import asyncio, logging, subprocess
from datetime import datetime
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy import select, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user, require_admin_role
from database import get_db
from models import User, AutonomousAction
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Every endpoint declared on this router is served under the /api/self-heal prefix.
router = APIRouter(prefix="/api/self-heal", tags=["자가수복"])
# Allow-list of services this module may restart, mapped to the exact command
# that restarts each one. Every entry follows the same `systemctl restart <unit>`
# pattern, so the mapping is derived from the unit names.
HEALABLE_SERVICES = {
    unit: f"systemctl restart {unit}"
    for unit in (
        "guardia",
        "guardia-manager",
        "zioinfo",
        "zioinfo-mail",
        "nginx",
        "ollama",
    )
}
class HealRequest(BaseModel):
    """Request body for a manually triggered heal."""

    # Name of the service to restart; the trigger endpoint checks it against
    # HEALABLE_SERVICES.
    service: str

    # Free-form label for why the heal was requested; it is stored as the
    # recorded action's trigger_type.
    reason: str = "manual"
async def _run_heal_command(cmd: str, timeout: float = 30) -> tuple[bool, str]:
    """Run one restart command and report ``(success, output)``.

    Ordinary failures (spawn errors, non-zero exit status, timeouts) are
    reported through the returned tuple instead of being raised.

    Args:
        cmd: Command line to execute, e.g. ``"systemctl restart nginx"``.
        timeout: Seconds to wait for the command to finish.

    Returns:
        ``(True, output)`` when the command exits with status 0, otherwise
        ``(False, output_or_error_text)``. stderr is merged into the output.
    """
    import shlex  # function-scope import: only this helper needs it

    proc = None
    try:
        # The configured commands need no shell features, so exec the program
        # directly; a kill on timeout then targets the command itself rather
        # than an intermediate shell.
        proc = await asyncio.create_subprocess_exec(
            *shlex.split(cmd),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )
        stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        # wait_for only cancels communicate(); the child keeps running unless
        # it is killed and reaped explicitly.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # the process exited between the timeout and the kill
            await proc.wait()
        # str(TimeoutError()) is empty, so write an explicit message.
        return False, f"시간 초과 ({timeout}초): {cmd}"
    except Exception as exc:
        logger.exception("self-heal command could not be run: %s", cmd)
        return False, str(exc) or type(exc).__name__
    return proc.returncode == 0, stdout.decode("utf-8", "replace")


async def _notify_heal_success(service: str) -> None:
    """Post a best-effort success message to the local messenger webhook."""
    try:
        import httpx

        async with httpx.AsyncClient(timeout=5) as client:
            await client.post("http://127.0.0.1:9001/api/messenger/webhook", json={
                "event": "auto_heal_success",
                "room": "ops",
                "message": f"🔧 자가수복 완료: {service} 재시작 성공",
            })
    except Exception:
        # A failed notification must not turn a successful heal into an error,
        # but it should leave a trace instead of vanishing silently.
        logger.warning(
            "self-heal success notification failed for %s", service, exc_info=True
        )


async def _attempt_heal(service: str, reason: str, actor: str = "auto") -> tuple[bool, str]:
    """Try to restart *service* and record the outcome.

    Args:
        service: Key into ``HEALABLE_SERVICES``.
        reason: Stored as the recorded action's ``trigger_type``.
        actor: Stored as the recorded action's ``approved_by``.

    Returns:
        ``(success, result)`` where *result* is the command output or an error
        description. An unknown service returns ``(False, message)`` without
        running anything or writing a record.

    Raises:
        Errors from opening the DB session or committing the record propagate
        to the caller; command and notification failures do not.
    """
    cmd = HEALABLE_SERVICES.get(service)
    if not cmd:
        return False, f"알 수 없는 서비스: {service}"

    success, result = await _run_heal_command(cmd)

    # NOTE(review): function-scope import kept as-is; `database` is already
    # imported at module top for get_db, so this can probably be hoisted — confirm.
    from database import AsyncSessionLocal

    async with AsyncSessionLocal() as db:
        db.add(AutonomousAction(
            trigger_type=reason,
            trigger_data=f"service={service}",
            action_type="restart",
            action_cmd=cmd,
            result=result[:500],  # only the first 500 characters are stored
            success=success,
            approved_by=actor,
            # NOTE(review): utcnow() returns a naive datetime and is deprecated
            # since Python 3.12; kept because the column's timezone handling is
            # not visible from this module.
            executed_at=datetime.utcnow(),
        ))
        await db.commit()

    if success:
        await _notify_heal_success(service)
    return success, result
@router.post("/trigger")
async def trigger_heal(
    req: HealRequest,
    background_tasks: BackgroundTasks,
    user: User = Depends(require_admin_role),
):
    """Queue a manual heal for one service and return immediately.

    The caller is resolved through the ``require_admin_role`` dependency. The
    restart itself runs as a background task, with the caller's username
    passed along as the actor.

    Raises:
        HTTPException: 400 when the requested service is not a key of
            ``HEALABLE_SERVICES``.
    """
    if req.service in HEALABLE_SERVICES:
        # Schedule the heal; it runs after the response has been sent.
        background_tasks.add_task(
            _attempt_heal, req.service, req.reason, user.username
        )
        return {"ok": True, "message": f"{req.service} 자가수복 시작됨 (백그라운드)"}

    # Unsupported service: report the supported names back to the caller.
    raise HTTPException(400, detail=f"지원 서비스: {list(HEALABLE_SERVICES)}")
async def _probe_service_state(svc: str, timeout: float = 3) -> str:
    """Return the ``systemctl is-active`` output for *svc*.

    Any failure (spawn error, timeout, decode problem) yields ``"unknown"``.
    """
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            "systemctl", "is-active", svc,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.DEVNULL,  # stderr is never used
        )
        stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        return stdout.decode("utf-8", "replace").strip()
    except Exception:
        # On timeout the child is still running; kill and reap it so it does
        # not linger after the request has finished.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # already exited
            await proc.wait()
        return "unknown"


@router.get("/services")
async def list_services(user: User = Depends(get_current_user)):
    """List the healable services together with their current state.

    Each service is probed with ``systemctl is-active`` through an asyncio
    subprocess, so the event loop is not blocked while waiting. The probes run
    concurrently, so the response time is bounded by the slowest probe rather
    than the sum of all of them.

    Returns:
        ``{"services": {name: state}}`` in ``HEALABLE_SERVICES`` order, where
        *state* is the command's stripped stdout or ``"unknown"`` on failure.
    """
    names = list(HEALABLE_SERVICES)
    states = await asyncio.gather(*(_probe_service_state(name) for name in names))
    return {"services": dict(zip(names, states))}
# Upper bound on rows returned by one history request.
_MAX_HISTORY_LIMIT = 500


@router.get("/history")
async def heal_history(
    limit: int = 20,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return recorded restart actions, newest first.

    Args:
        limit: Maximum number of rows to return. Values are clamped to the
            range ``0.._MAX_HISTORY_LIMIT`` so a negative or very large
            client-supplied value never reaches the SQL ``LIMIT`` clause.
        db: Async database session.
        user: Authenticated caller (used only to require authentication).

    Returns:
        A list of dicts, one per ``AutonomousAction`` row whose
        ``action_type`` is ``"restart"``, ordered by ``executed_at`` descending.
    """
    limit = max(0, min(limit, _MAX_HISTORY_LIMIT))

    stmt = (
        select(AutonomousAction)
        .where(AutonomousAction.action_type == "restart")
        .order_by(desc(AutonomousAction.executed_at))
        .limit(limit)
    )
    result = await db.execute(stmt)

    return [
        {
            "id": action.id,
            "trigger_type": action.trigger_type,
            "trigger_data": action.trigger_data,
            "action_cmd": action.action_cmd,
            "success": action.success,
            "result": action.result,
            "approved_by": action.approved_by,
            "executed_at": action.executed_at,
        }
        for action in result.scalars().all()
    ]