"""자가 수복 — 장애 감지 후 자동 서비스 재시작·복구.""" from __future__ import annotations import asyncio, logging, subprocess from datetime import datetime from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException from pydantic import BaseModel from sqlalchemy import select, desc from sqlalchemy.ext.asyncio import AsyncSession from core.auth import get_current_user, require_admin_role from database import get_db from models import User, AutonomousAction logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/self-heal", tags=["자가수복"]) HEALABLE_SERVICES = { "guardia": "systemctl restart guardia", "guardia-manager": "systemctl restart guardia-manager", "zioinfo": "systemctl restart zioinfo", "zioinfo-mail": "systemctl restart zioinfo-mail", "nginx": "systemctl restart nginx", "ollama": "systemctl restart ollama", } class HealRequest(BaseModel): service: str reason: str = "manual" async def _attempt_heal(service: str, reason: str, actor: str = "auto"): """서비스 재시작 시도 후 결과 기록.""" cmd = HEALABLE_SERVICES.get(service) if not cmd: return False, f"알 수 없는 서비스: {service}" try: proc = await asyncio.create_subprocess_shell( cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT ) stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=30) success = proc.returncode == 0 result = stdout.decode("utf-8", "replace") except Exception as e: success = False result = str(e) from database import AsyncSessionLocal async with AsyncSessionLocal() as db: action = AutonomousAction( trigger_type=reason, trigger_data=f"service={service}", action_type="restart", action_cmd=cmd, result=result[:500], success=success, approved_by=actor, executed_at=datetime.utcnow(), ) db.add(action) await db.commit() if success: try: import httpx async with httpx.AsyncClient(timeout=5) as c: await c.post("http://127.0.0.1:9001/api/messenger/webhook", json={ "event": "auto_heal_success", "room": "ops", "message": f"🔧 자가수복 완료: {service} 재시작 성공", }) except Exception: pass return success, result @router.post("/trigger") async def trigger_heal( req: HealRequest, background_tasks: BackgroundTasks, user: User = Depends(require_admin_role), ): """수동으로 서비스 자가수복 트리거.""" if req.service not in HEALABLE_SERVICES: raise HTTPException(400, detail=f"지원 서비스: {list(HEALABLE_SERVICES)}") background_tasks.add_task(_attempt_heal, req.service, req.reason, user.username) return {"ok": True, "message": f"{req.service} 자가수복 시작됨 (백그라운드)"} @router.get("/services") async def list_services(user: User = Depends(get_current_user)): """수복 가능한 서비스 목록 + 현재 상태.""" status_map = {} for svc in HEALABLE_SERVICES: try: r = subprocess.run( ["systemctl", "is-active", svc], capture_output=True, text=True, timeout=3 ) status_map[svc] = r.stdout.strip() except Exception: status_map[svc] = "unknown" return {"services": status_map} @router.get("/history") async def heal_history( limit: int = 20, db: AsyncSession = Depends(get_db), user: User = Depends(get_current_user), ): """자가수복 실행 이력.""" rows = await db.execute( select(AutonomousAction) .where(AutonomousAction.action_type == "restart") .order_by(desc(AutonomousAction.executed_at)) .limit(limit) ) return [{ "id": a.id, "trigger_type": a.trigger_type, "trigger_data": a.trigger_data, "action_cmd": a.action_cmd, "success": a.success, "result": a.result, "approved_by": a.approved_by, "executed_at": a.executed_at, } for a in rows.scalars().all()]