# 126 lines · 4.2 KiB · Python
"""자가 수복 — 장애 감지 후 자동 서비스 재시작·복구."""
|
|
from __future__ import annotations

import asyncio
import logging
import shlex
import subprocess
from datetime import datetime

from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy import select, desc
from sqlalchemy.ext.asyncio import AsyncSession

from core.auth import get_current_user, require_admin_role
from database import get_db
from models import User, AutonomousAction
|
|
|
|
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Every route declared on this router is served under /api/self-heal.
router = APIRouter(
    prefix="/api/self-heal",
    tags=["자가수복"],
)
|
|
|
|
# Allowlist of services that may be restarted, mapped to the exact restart
# command for each. Every entry is a plain `systemctl restart <unit>`, so the
# mapping is derived from the unit names (insertion order is preserved).
HEALABLE_SERVICES = {
    unit: f"systemctl restart {unit}"
    for unit in (
        "guardia",
        "guardia-manager",
        "zioinfo",
        "zioinfo-mail",
        "nginx",
        "ollama",
    )
}
|
|
|
|
|
|
class HealRequest(BaseModel):
    """Request body for manually triggering a service heal."""

    # Key of the service to restart; the trigger endpoint rejects any value
    # that is not a key of HEALABLE_SERVICES.
    service: str

    # Label describing why the heal was requested; recorded as the action's
    # trigger_type. Defaults to "manual".
    reason: str = "manual"
|
|
|
|
|
|
async def _run_heal_command(cmd: str, timeout: float = 30) -> tuple[bool, str]:
    """Run one restart command and return ``(succeeded, captured_output)``.

    The command string is split with ``shlex`` and executed directly rather
    than through a shell, so a kill on timeout reaches the real process
    instead of an intermediate shell. stderr is merged into stdout.
    """
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            *shlex.split(cmd),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )
        stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        # wait_for only cancels communicate(); the child keeps running unless
        # it is killed and reaped explicitly.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # the process exited between the timeout and the kill
        await proc.wait()
        # str(TimeoutError()) is empty, so record an explicit message instead.
        return False, f"시간 초과: {timeout}초 내에 완료되지 않음 ({cmd})"
    except Exception as e:
        logger.exception("self-heal command could not be run: %s", cmd)
        return False, str(e)
    return proc.returncode == 0, stdout.decode("utf-8", "replace")


async def _notify_heal_success(service: str) -> None:
    """Best-effort webhook notification to the ops room; never raises."""
    try:
        import httpx

        async with httpx.AsyncClient(timeout=5) as c:
            await c.post("http://127.0.0.1:9001/api/messenger/webhook", json={
                "event": "auto_heal_success",
                "room": "ops",
                "message": f"🔧 자가수복 완료: {service} 재시작 성공",
            })
    except Exception:
        # The notification is optional, but a failure should be visible in logs.
        logger.warning(
            "self-heal success notification failed for %s", service, exc_info=True
        )


async def _attempt_heal(service: str, reason: str, actor: str = "auto"):
    """Try to restart a service and record the outcome.

    Args:
        service: Key into HEALABLE_SERVICES.
        reason: Trigger label, stored as the action's ``trigger_type``.
        actor: Who initiated the heal, stored as ``approved_by``.

    Returns:
        ``(success, result)`` where ``result`` is the command output or an
        error description. An unknown service returns ``False`` immediately
        and records nothing.
    """
    cmd = HEALABLE_SERVICES.get(service)
    if not cmd:
        return False, f"알 수 없는 서비스: {service}"

    success, result = await _run_heal_command(cmd)

    # Imported at call time, as in the original implementation.
    from database import AsyncSessionLocal

    async with AsyncSessionLocal() as db:
        action = AutonomousAction(
            trigger_type=reason,
            trigger_data=f"service={service}",
            action_type="restart",
            action_cmd=cmd,
            result=result[:500],  # only the first 500 characters are stored
            success=success,
            approved_by=actor,
            executed_at=datetime.utcnow(),
        )
        db.add(action)
        await db.commit()

    if success:
        await _notify_heal_success(service)

    return success, result
|
|
|
|
|
|
@router.post("/trigger")
async def trigger_heal(
    req: HealRequest,
    background_tasks: BackgroundTasks,
    user: User = Depends(require_admin_role),
):
    """Manually trigger a self-heal for one service.

    The restart is scheduled as a background task and the response is
    returned immediately. Access is guarded by the ``require_admin_role``
    dependency.

    Raises:
        HTTPException: 400 when ``req.service`` is not in HEALABLE_SERVICES.
    """
    target = req.service
    if target in HEALABLE_SERVICES:
        background_tasks.add_task(_attempt_heal, target, req.reason, user.username)
        return {"ok": True, "message": f"{target} 자가수복 시작됨 (백그라운드)"}

    raise HTTPException(400, detail=f"지원 서비스: {list(HEALABLE_SERVICES)}")
|
|
|
|
|
|
async def _service_state(svc: str, timeout: float = 3) -> str:
    """Return the output of ``systemctl is-active <svc>``, or "unknown" on failure."""
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            "systemctl", "is-active", svc,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.DEVNULL,
        )
        stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except Exception:
        # Covers a missing systemctl binary, a timeout, and any spawn error.
        # If the child is still running (timeout), kill and reap it.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # already gone
            await proc.wait()
        return "unknown"
    return stdout.decode("utf-8", "replace").strip()


@router.get("/services")
async def list_services(user: User = Depends(get_current_user)):
    """List the healable services together with their current state.

    Each service is probed with ``systemctl is-active`` through an asyncio
    subprocess, and all probes run concurrently, so the event loop is not
    blocked while waiting on systemctl.

    Returns:
        ``{"services": {name: state}}`` where ``state`` is the command's
        output, or "unknown" if the probe failed or timed out.
    """
    names = list(HEALABLE_SERVICES)
    states = await asyncio.gather(*(_service_state(name) for name in names))
    return {"services": dict(zip(names, states))}
|
|
|
|
|
|
# Upper bound on rows returned by one history request.
_MAX_HISTORY_LIMIT = 500


@router.get("/history")
async def heal_history(
    limit: int = 20,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the self-heal execution history, newest first.

    Args:
        limit: Maximum number of rows to return. Clamped to the range
            ``0.._MAX_HISTORY_LIMIT`` so that a negative or very large value
            never reaches the SQL ``LIMIT`` clause.

    Returns:
        A list of dicts, one per recorded "restart" action.
    """
    limit = max(0, min(limit, _MAX_HISTORY_LIMIT))

    stmt = (
        select(AutonomousAction)
        .where(AutonomousAction.action_type == "restart")
        .order_by(desc(AutonomousAction.executed_at))
        .limit(limit)
    )
    rows = await db.execute(stmt)

    return [
        {
            "id": a.id,
            "trigger_type": a.trigger_type,
            "trigger_data": a.trigger_data,
            "action_cmd": a.action_cmd,
            "success": a.success,
            "result": a.result,
            "approved_by": a.approved_by,
            "executed_at": a.executed_at,
        }
        for a in rows.scalars().all()
    ]
|