guardia-itsm/routers/batch_ssh.py
2026-06-02 19:49:59 +09:00

216 lines
6.9 KiB
Python

"""
다중 서버 배치 SSH 실행
여러 서버에 동일 명령을 동시에 실행하고 결과를 수집.
PAM 승인 게이트 적용 — 위험 명령어는 관리자 승인 필요.
엔드포인트:
POST /api/batch-ssh/run — 배치 명령 실행 (비동기)
GET /api/batch-ssh/jobs — 작업 목록
GET /api/batch-ssh/jobs/{id} — 작업 결과 상세
DELETE /api/batch-ssh/jobs/{id} — 작업 삭제
"""
from __future__ import annotations
import asyncio
import json
import logging
import re
from datetime import datetime
from typing import List, Optional
import paramiko
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from pydantic import BaseModel, Field
from sqlalchemy import select, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user, require_admin_role
from core.ssh_exec import _decrypt_password as decrypt_password
from database import get_db
from models import User, Server, BatchSSHJob, AuditLog
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Every route declared in this file is mounted under the /api/batch-ssh prefix.
router = APIRouter(prefix="/api/batch-ssh", tags=["배치 SSH"])
# Regular expressions for commands that are treated as dangerous.
# `_is_dangerous` searches each one case-insensitively, and `run_batch`
# rejects a matching command unless the caller has the admin role.
DANGEROUS_PATTERNS = [
    r"\brm\s+-rf\b",                       # rm followed by the -rf flag
    r"\bmkfs\b",                           # mkfs as a whole word
    r"\bdd\b.*if=",                        # dd with an if= later on the same line
    r"\bshutdown\b",                       # shutdown as a whole word
    r"\breboot\b",                         # reboot as a whole word
    r"\bhalt\b",                           # halt as a whole word
    r"\bchmod\s+777\b",                    # chmod followed by mode 777
    r"\bchown\s+.*root\b",                 # chown with "root" later on the line
    r">\s*/etc/(passwd|shadow|sudoers)",   # redirect into /etc/passwd, shadow or sudoers
]
class BatchSSHRequest(BaseModel):
    """Request body accepted by ``POST /api/batch-ssh/run``."""

    # IDs of the target servers; between 1 and 50 entries.
    server_ids: List[int] = Field(..., min_length=1, max_length=50)
    # Command string to execute on every target; 1 to 500 characters.
    command: str = Field(..., min_length=1, max_length=500)
    # Forwarded to paramiko's exec_command as its timeout; 5..300, default 30.
    timeout_sec: int = Field(30, ge=5, le=300)
    # NOTE(review): no handler visible in this file reads this flag —
    # confirm whether it is consumed elsewhere or still needed.
    require_approval: bool = False
def _is_dangerous(command: str) -> bool:
    """Return True when *command* matches at least one DANGEROUS_PATTERNS entry.

    Each pattern is applied with ``re.search`` and ``re.IGNORECASE``, so a
    match anywhere inside the command string counts.
    """
    for pattern in DANGEROUS_PATTERNS:
        if re.search(pattern, command, re.IGNORECASE) is not None:
            return True
    return False
async def _run_on_server(server: Server, command: str, timeout: int) -> dict:
    """Run *command* on one server over SSH and summarise the outcome.

    The paramiko calls are blocking, so they are executed in the event loop's
    default executor. This lets ``asyncio.gather`` in the caller overlap the
    per-server sessions instead of running them one after another.

    Args:
        server: Row providing ``id``, ``hostname``, ``ip_addr``, ``ssh_user``
            and ``os_pw_enc`` (the encrypted password).
        command: Command string passed to ``exec_command``.
        timeout: Channel timeout in seconds passed to ``exec_command``.

    Returns:
        A dict with ``server_id``, ``hostname``, ``ip``, ``exit_code``,
        ``stdout`` (first 2000 characters), ``stderr`` (first 500 characters)
        and ``status``. ``status`` is "SUCCESS" for exit code 0, "FAILED" for
        any other exit code, and "ERROR" (with ``exit_code`` -1 and the
        exception text in ``stderr``) when an exception was caught.
    """

    def _exec_blocking(host: str, username: str, password: str) -> tuple:
        """Connect, run the command, and return (exit_code, stdout, stderr)."""
        ssh = paramiko.SSHClient()
        # NOTE(security): AutoAddPolicy accepts unknown host keys without
        # verification. Kept as in the original; consider loading known
        # host keys and using RejectPolicy instead.
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            ssh.connect(host, username=username, password=password, timeout=10)
            _, stdout, stderr = ssh.exec_command(command, timeout=timeout)
            # Drain the streams before asking for the exit status. Calling
            # recv_exit_status() first can hang when the output exceeds the
            # channel window, and it is not bounded by the channel timeout,
            # whereas the reads are.
            out = stdout.read().decode('utf-8', 'replace').strip()
            err = stderr.read().decode('utf-8', 'replace').strip()
            exit_code = stdout.channel.recv_exit_status()
            return exit_code, out, err
        finally:
            # Close on every path so a failed connect/exec/read does not
            # leave the client open.
            ssh.close()

    try:
        # Read the row attributes here, on the event-loop side, and hand only
        # plain values to the worker thread.
        pw = decrypt_password(server.os_pw_enc)
        loop = asyncio.get_running_loop()
        exit_code, out, err = await loop.run_in_executor(
            None, _exec_blocking, server.ip_addr, server.ssh_user, pw
        )
        return {
            "server_id": server.id,
            "hostname": server.hostname or server.ip_addr,
            "ip": server.ip_addr,
            "exit_code": exit_code,
            "stdout": out[:2000],
            "stderr": err[:500],
            "status": "SUCCESS" if exit_code == 0 else "FAILED",
        }
    except Exception as e:
        # Best-effort by design: one unreachable server must not abort the
        # whole batch, so the failure is reported as a result entry.
        return {
            "server_id": server.id,
            "hostname": getattr(server, 'hostname', '') or server.ip_addr,
            "ip": server.ip_addr,
            "exit_code": -1,
            "stdout": "",
            "stderr": str(e)[:200],
            "status": "ERROR",
        }
async def _execute_batch(job_id: int, servers: list, command: str,
                         timeout: int, db: AsyncSession):
    """Execute a queued batch job and persist its outcome.

    Loads the ``BatchSSHJob`` row, marks it RUNNING, runs the command on every
    server via ``_run_on_server`` and stores the collected results as JSON.

    Args:
        job_id: Primary key of the ``BatchSSHJob`` row to update.
        servers: Server rows to run the command on.
        command: Command string to execute on each server.
        timeout: Per-command timeout in seconds.
        db: Session used for all reads and writes of the job row.

    On success the job ends as "DONE" with ``results_json`` holding a JSON
    list of per-server result dicts. If an exception escapes the run, the job
    ends as "FAILED" and ``results_json`` holds a JSON object with an
    ``error`` key. ``finished_at`` is set in both cases.
    """
    # NOTE(review): `db` is whatever session the caller hands over; run_batch
    # passes its request-scoped session. Confirm that session is still open
    # when background tasks run.
    job_row = await db.execute(select(BatchSSHJob).where(BatchSSHJob.id == job_id))
    job = job_row.scalar_one_or_none()
    if not job:
        logger.warning("Batch SSH job %s not found; nothing to execute", job_id)
        return
    try:
        job.status = "RUNNING"
        await db.commit()
        tasks = [_run_on_server(s, command, timeout) for s in servers]
        results = await asyncio.gather(*tasks)
        success = sum(1 for r in results if r["status"] == "SUCCESS")
        job.results_json = json.dumps(results, ensure_ascii=False)
        job.success_count = success
        job.total_count = len(servers)
        job.status = "DONE"
    except Exception as e:
        # Record the failure on the job, and also log it with the traceback
        # so it is not visible only through the stored error string.
        logger.exception("Batch SSH job %s failed", job_id)
        job.status = "FAILED"
        job.results_json = json.dumps({"error": str(e)})
    finally:
        job.finished_at = datetime.utcnow()
        await db.commit()
@router.post("/run")
async def run_batch(
    req: BatchSSHRequest,
    background_tasks: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Queue a batch SSH job and schedule its execution as a background task.

    Raises:
        HTTPException 403: the command matches a dangerous pattern and the
            caller's role is not ADMIN/admin.
        HTTPException 404: none of the requested server IDs exist.

    Returns:
        ``{"ok": True, "job_id": ..., "server_count": ...}`` where
        ``server_count`` is the number of servers actually found.
    """
    # Dangerous commands are restricted to admin users.
    if _is_dangerous(req.command) and user.role.value not in ("ADMIN", "admin"):
        raise HTTPException(403, "위험 명령어는 관리자만 실행 가능합니다")

    # Resolve the requested IDs to Server rows.
    lookup = await db.execute(select(Server).where(Server.id.in_(req.server_ids)))
    servers = lookup.scalars().all()
    if not servers:
        raise HTTPException(404, "서버를 찾을 수 없습니다")
    server_count = len(servers)

    # Persist the job row together with an audit trail entry.
    job = BatchSSHJob(
        command=req.command,
        server_ids=req.server_ids,
        total_count=server_count,
        timeout_sec=req.timeout_sec,
        status="QUEUED",
        created_by=user.id,
        created_at=datetime.utcnow(),
    )
    db.add(job)
    audit_entry = AuditLog(
        user_id=user.id,
        action="BATCH_SSH",
        detail=f"배치 SSH: {server_count}개 서버, 명령: {req.command[:100]}",
        created_at=datetime.utcnow(),
    )
    db.add(audit_entry)
    await db.commit()
    # Reload the job so its generated primary key is available.
    await db.refresh(job)

    background_tasks.add_task(
        _execute_batch, job.id, servers, req.command, req.timeout_sec, db
    )
    return {"ok": True, "job_id": job.id, "server_count": server_count}
@router.get("/jobs")
async def list_jobs(
    limit: int = 30,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """List the caller's own batch jobs, newest first.

    Args:
        limit: Maximum number of jobs to return (default 30).

    Returns:
        A list of summary dicts; ``command`` is cut to its first 80 characters.
    """
    stmt = (
        select(BatchSSHJob)
        .where(BatchSSHJob.created_by == user.id)
        .order_by(desc(BatchSSHJob.created_at))
        .limit(limit)
    )
    found = await db.execute(stmt)

    summaries = []
    for job in found.scalars().all():
        summaries.append(
            {
                "id": job.id,
                "command": job.command[:80],
                "status": job.status,
                "success": job.success_count,
                "total": job.total_count,
                "created_at": job.created_at,
                "finished_at": job.finished_at,
            }
        )
    return summaries
@router.get("/jobs/{job_id}")
async def get_job(
    job_id: int,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return one of the caller's batch jobs including its stored results.

    The lookup is restricted to jobs created by the current user, matching
    the job listing and deletion endpoints. A job owned by someone else is
    reported as 404, so the command and server output are not exposed to
    other users.

    Raises:
        HTTPException 404: no job with this id belongs to the caller.

    Returns:
        The job fields plus ``results``: the decoded ``results_json``, or an
        empty list when nothing has been stored yet. For a job that ended as
        FAILED the stored value is a JSON object with an ``error`` key rather
        than a list.
    """
    row = await db.execute(
        select(BatchSSHJob).where(
            BatchSSHJob.id == job_id,
            BatchSSHJob.created_by == user.id,
        )
    )
    job = row.scalar_one_or_none()
    if not job:
        raise HTTPException(404)
    results = json.loads(job.results_json) if job.results_json else []
    return {
        "id": job.id, "command": job.command,
        "status": job.status,
        "success": job.success_count, "total": job.total_count,
        "created_at": job.created_at, "finished_at": job.finished_at,
        "results": results,
    }
@router.delete("/jobs/{job_id}")
async def delete_job(
    job_id: int,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Delete one of the caller's own batch jobs.

    Raises:
        HTTPException 404: no job with this id belongs to the caller.

    Returns:
        ``{"ok": True}`` once the deletion has been committed.
    """
    owned_job_stmt = select(BatchSSHJob).where(
        BatchSSHJob.id == job_id,
        BatchSSHJob.created_by == user.id,
    )
    found = await db.execute(owned_job_stmt)
    target = found.scalar_one_or_none()
    if not target:
        raise HTTPException(status_code=404)

    await db.delete(target)
    await db.commit()
    return {"ok": True}