guardia-itsm/routers/batch.py
DESKTOP-TKLFCPRython 64c27c3509 feat(itsm): G-1~G-12 확장 기능 + 하네스/봇/설치스크립트 구현
G-1: 메신저 Webhook Relay + _send_to_room 실제 httpx 호출 구현
G-2: POST /api/tasks/bulk SR 대량작업 엔드포인트 (최대 100건)
G-3: 라이선스 만료 알림 스케줄러 (매일 09:00 KST)
G-4: 체험판 upgrade_banner 필드 + license.py 배너 로직
G-5: core/auto_rca.py + incidents/problem auto-rca 엔드포인트
G-6: core/deploy_impact.py + vibe impact-analysis 엔드포인트
G-7: core/ticket_classifier.py + SR 생성 시 AI 분류 + ai-suggestion API
G-8: VulnPatchRecord 모델 + vuln_scan 패치추적 4개 엔드포인트
G-9: core/jira_sync.py + gateway Jira/Confluence 연동 엔드포인트
G-10: core/push_notify.py + routers/push.py + PushSubscription 모델
G-11: approvals 다중승인 (위임/서명/기한초과/마감연장)
G-12: alembic.ini + migrations/ + cicd/migrate_to_postgres.sh

하네스: guardia-orchestrator 확장기능 Phase 반영
봇명령어: /sr /status /license /bulk 슬래시 명령어 추가
설치스크립트: setup/ (Ubuntu, CentOS, RHEL, Windows) --test 옵션 포함

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 18:18:52 +09:00

493 lines
18 KiB
Python

"""
배치 작업 관리 API.
엔드포인트:
GET /api/batch/jobs — 배치 작업 목록
POST /api/batch/jobs — 배치 작업 등록
GET /api/batch/jobs/{id} — 배치 작업 상세
PATCH /api/batch/jobs/{id} — 배치 작업 수정
DELETE /api/batch/jobs/{id} — 배치 작업 삭제 (비활성화)
POST /api/batch/jobs/{id}/run — 수동 즉시 실행 (SSH 비동기)
POST /api/batch/jobs/{id}/enable — 활성화
POST /api/batch/jobs/{id}/disable — 비활성화
GET /api/batch/jobs/{id}/runs — 실행 이력 목록
GET /api/batch/runs/{run_id} — 실행 이력 상세
보안: 명령어 위험 패턴 차단, SSH 실행 결과에서 민감정보 제거.
"""
from __future__ import annotations
import asyncio
import logging
from datetime import datetime
from typing import List, Optional
from uuid import uuid4
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query
from fastapi.responses import StreamingResponse
from sqlalchemy import select, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from database import get_db
from models import (
Institution, Server, SRRequest, SRStatus, SRType, Priority,
BatchJob, BatchJobCreate, BatchJobOut, BatchJobStatus,
BatchRun, BatchRunOut, BatchRunResult,
User, UserRole,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/batch", tags=["batch"])
# 위험 명령어 패턴 (SSH 명령어 안전성 검증)
_DANGEROUS_PATTERNS = [
"rm -rf /", "rm -rf /*", "mkfs", "dd if=",
"shutdown", ":(){:|:&};:", "chmod -R 777 /",
"chown -R root /", ">(", "curl.*|.*sh", "wget.*|.*sh",
]
def _validate_command(cmd: str) -> None:
for pattern in _DANGEROUS_PATTERNS:
if pattern.lower() in cmd.lower():
raise HTTPException(
422,
f"위험한 명령어 패턴이 감지되었습니다: '{pattern}'. "
f"보안 정책에 의해 차단되었습니다.",
)
def _new_sr() -> str:
return f"SR-{datetime.now().strftime('%Y%m%d')}-{str(uuid4())[:6].upper()}"
# ── 배치 작업 목록 ────────────────────────────────────────────────────────────
@router.get("/jobs", response_model=List[BatchJobOut])
async def list_jobs(
inst_id: Optional[int] = Query(None),
status: Optional[str] = Query(None),
keyword: Optional[str] = Query(None),
skip: int = 0,
limit: int = 100,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
q = select(BatchJob)
if inst_id:
q = q.where(BatchJob.inst_id == inst_id)
if current_user.role == UserRole.CUSTOMER and current_user.inst_code:
r_i = await db.execute(
select(Institution).where(Institution.inst_code == current_user.inst_code)
)
own = r_i.scalars().first()
q = q.where(BatchJob.inst_id == own.id) if own else q.where(BatchJob.id == -1)
if status:
q = q.where(BatchJob.status == status)
if keyword:
q = q.where(BatchJob.job_name.contains(keyword))
q = q.order_by(BatchJob.job_name).offset(skip).limit(limit)
result = await db.execute(q)
return result.scalars().all()
# ── 배치 작업 등록 ────────────────────────────────────────────────────────────
@router.post("/jobs", response_model=BatchJobOut, status_code=201)
async def create_job(
payload: BatchJobCreate,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
if current_user.role not in (UserRole.ADMIN, UserRole.PM, UserRole.ENGINEER):
raise HTTPException(403, "권한이 없습니다.")
_validate_command(payload.command)
# cron 표현식 기본 검증 (5 필드)
parts = payload.cron_expr.strip().split()
if len(parts) != 5:
raise HTTPException(422, "cron_expr은 5개의 필드여야 합니다 (예: '0 2 * * *').")
# 중복 job_name 확인
dup = (await db.execute(
select(BatchJob).where(BatchJob.job_name == payload.job_name)
)).scalars().first()
if dup:
raise HTTPException(409, f"배치 작업 이름 '{payload.job_name}'이 이미 존재합니다.")
job = BatchJob(**payload.model_dump())
db.add(job)
await db.commit()
await db.refresh(job)
return job
# ── 배치 작업 상세 ────────────────────────────────────────────────────────────
@router.get("/jobs/{job_id}", response_model=BatchJobOut)
async def get_job(
job_id: int,
db: AsyncSession = Depends(get_db),
_u: User = Depends(get_current_user),
):
r = await db.execute(select(BatchJob).where(BatchJob.id == job_id))
job = r.scalars().first()
if not job:
raise HTTPException(404, "배치 작업을 찾을 수 없습니다.")
return job
# ── 배치 작업 수정 ────────────────────────────────────────────────────────────
@router.patch("/jobs/{job_id}", response_model=BatchJobOut)
async def update_job(
job_id: int,
payload: BatchJobCreate,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
if current_user.role not in (UserRole.ADMIN, UserRole.PM, UserRole.ENGINEER):
raise HTTPException(403, "권한이 없습니다.")
r = await db.execute(select(BatchJob).where(BatchJob.id == job_id))
job = r.scalars().first()
if not job:
raise HTTPException(404, "배치 작업을 찾을 수 없습니다.")
if payload.command:
_validate_command(payload.command)
for k, v in payload.model_dump(exclude_unset=True).items():
setattr(job, k, v)
job.updated_at = datetime.now()
await db.commit()
await db.refresh(job)
return job
# ── 활성화/비활성화 ───────────────────────────────────────────────────────────
@router.post("/jobs/{job_id}/enable", response_model=BatchJobOut)
async def enable_job(
job_id: int,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
if current_user.role not in (UserRole.ADMIN, UserRole.PM):
raise HTTPException(403, "ADMIN 또는 PM 권한이 필요합니다.")
r = await db.execute(select(BatchJob).where(BatchJob.id == job_id))
job = r.scalars().first()
if not job:
raise HTTPException(404, "배치 작업을 찾을 수 없습니다.")
job.status = BatchJobStatus.ACTIVE
job.updated_at = datetime.now()
await db.commit()
await db.refresh(job)
# APScheduler에 동적 등록
from core.scheduler import enable_batch_job
enable_batch_job(job.id, job.cron_expr)
return job
@router.post("/jobs/{job_id}/disable", response_model=BatchJobOut)
async def disable_job(
job_id: int,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
if current_user.role not in (UserRole.ADMIN, UserRole.PM):
raise HTTPException(403, "ADMIN 또는 PM 권한이 필요합니다.")
r = await db.execute(select(BatchJob).where(BatchJob.id == job_id))
job = r.scalars().first()
if not job:
raise HTTPException(404, "배치 작업을 찾을 수 없습니다.")
job.status = BatchJobStatus.DISABLED
job.updated_at = datetime.now()
await db.commit()
await db.refresh(job)
# APScheduler에서 동적 제거
from core.scheduler import disable_batch_job
disable_batch_job(job.id)
return job
@router.delete("/jobs/{job_id}", status_code=204)
async def delete_job(
job_id: int,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
if current_user.role not in (UserRole.ADMIN, UserRole.PM):
raise HTTPException(403, "ADMIN 또는 PM 권한이 필요합니다.")
r = await db.execute(select(BatchJob).where(BatchJob.id == job_id))
job = r.scalars().first()
if not job:
raise HTTPException(404, "배치 작업을 찾을 수 없습니다.")
job.status = BatchJobStatus.DISABLED # 소프트 삭제
await db.commit()
# ── 수동 즉시 실행 ────────────────────────────────────────────────────────────
@router.post("/jobs/{job_id}/run", status_code=202)
async def run_job_now(
job_id: int,
background_tasks: BackgroundTasks,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
"""
배치 작업 즉시 실행 (백그라운드 SSH).
202 Accepted 반환 후 비동기 실행.
실행 결과는 GET /api/batch/jobs/{id}/runs 에서 확인.
"""
if current_user.role not in (UserRole.ADMIN, UserRole.PM, UserRole.ENGINEER):
raise HTTPException(403, "권한이 없습니다.")
r = await db.execute(select(BatchJob).where(BatchJob.id == job_id))
job = r.scalars().first()
if not job:
raise HTTPException(404, "배치 작업을 찾을 수 없습니다.")
if job.status != BatchJobStatus.ACTIVE:
raise HTTPException(422, "비활성화된 배치 작업은 실행할 수 없습니다.")
if not job.server_id:
raise HTTPException(422, "배치 작업에 대상 서버가 지정되지 않았습니다.")
# BatchRun 생성 (RUNNING 상태)
run = BatchRun(
job_id = job_id,
result = BatchRunResult.RUNNING,
started_at = datetime.now(),
)
db.add(run)
await db.commit()
await db.refresh(run)
run_id = run.id
background_tasks.add_task(
_execute_batch_run,
job_id = job_id,
run_id = run_id,
triggered_by = current_user.username,
)
return {
"run_id": run_id,
"job_id": job_id,
"status": "RUNNING",
"message": f"배치 실행 시작 (run_id={run_id}). 결과는 GET /api/batch/runs/{run_id} 에서 확인하세요.",
}
# ── 실행 이력 ─────────────────────────────────────────────────────────────────
@router.get("/jobs/{job_id}/runs", response_model=List[BatchRunOut])
async def list_runs(
job_id: int,
skip: int = 0,
limit: int = 50,
db: AsyncSession = Depends(get_db),
_u: User = Depends(get_current_user),
):
result = await db.execute(
select(BatchRun)
.where(BatchRun.job_id == job_id)
.order_by(desc(BatchRun.started_at))
.offset(skip).limit(limit)
)
return result.scalars().all()
@router.get("/runs/{run_id}", response_model=BatchRunOut)
async def get_run(
run_id: int,
db: AsyncSession = Depends(get_db),
_u: User = Depends(get_current_user),
):
r = await db.execute(select(BatchRun).where(BatchRun.id == run_id))
run = r.scalars().first()
if not run:
raise HTTPException(404, "실행 이력을 찾을 수 없습니다.")
return run
# ── 배치 실행 로그 SSE 스트리밍 ────────────────────────────────────────────────
@router.get("/runs/{run_id}/stream")
async def stream_run_log(
run_id: int,
db: AsyncSession = Depends(get_db),
_u: User = Depends(get_current_user),
):
"""
배치 실행 로그 실시간 스트리밍 (Server-Sent Events).
클라이언트에서 EventSource 사용:
const es = new EventSource('/api/batch/runs/{run_id}/stream');
es.onmessage = (e) => {
const d = JSON.parse(e.data);
if (d.type === 'log') console.log(d.line);
if (d.type === 'done') { es.close(); showResult(d); }
if (d.type === 'timeout') es.close();
};
이벤트 타입:
log — {"type":"log", "run_id":N, "line":"..."}
heartbeat — {"type":"heartbeat", "elapsed":N}
done — {"type":"done", "run_id":N, "result":"...", "exit_code":N, ...}
error — {"type":"error", "message":"..."}
timeout — {"type":"timeout", "message":"..."}
"""
# 실행 이력 존재 여부 사전 확인
r = await db.execute(select(BatchRun).where(BatchRun.id == run_id))
run = r.scalars().first()
if not run:
raise HTTPException(404, "실행 이력을 찾을 수 없습니다.")
from core.events import stream_batch_log
return StreamingResponse(
stream_batch_log(run_id),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"X-Accel-Buffering": "no", # nginx SSE 버퍼 비활성화
"Access-Control-Allow-Origin": "*",
},
)
# ── 백그라운드 실행 ────────────────────────────────────────────────────────────
async def _execute_batch_run(job_id: int, run_id: int, triggered_by: str) -> None:
"""
SSH로 배치 명령 실행 후 BatchRun 결과 업데이트.
실패 시 SRRequest 자동 생성 (alert_on_fail=True인 경우).
"""
from database import SessionLocal
from models import BatchJob, BatchRun, Server
try:
async with SessionLocal() as db:
job = (await db.execute(
select(BatchJob).where(BatchJob.id == job_id)
)).scalars().first()
if not job:
return
srv = (await db.execute(
select(Server).where(Server.id == job.server_id)
)).scalars().first()
if not srv:
async with SessionLocal() as db:
run = (await db.execute(
select(BatchRun).where(BatchRun.id == run_id)
)).scalars().first()
if run:
run.result = BatchRunResult.FAILED
run.ended_at = datetime.now()
run.error_msg = "대상 서버를 찾을 수 없습니다."
await db.commit()
return
# SSH 실행
try:
from core.ssh_exec import run_command_on_server
stdout = await asyncio.wait_for(
run_command_on_server(srv, job.command),
timeout=job.timeout_sec,
)
exit_code = 0
result = BatchRunResult.SUCCESS
error_msg = None
except asyncio.TimeoutError:
stdout = ""
exit_code = -1
result = BatchRunResult.TIMEOUT
error_msg = f"타임아웃 ({job.timeout_sec}초)"
except Exception as exc:
stdout = ""
exit_code = -1
result = BatchRunResult.FAILED
error_msg = f"SSH 실행 오류: {str(exc)[:200]}"
# stdout 마지막 100줄 저장
stdout_tail = "\n".join((stdout or "").splitlines()[-100:])
# 자동 SR 생성
sr_id = None
if result in (BatchRunResult.FAILED, BatchRunResult.TIMEOUT) and job.alert_on_fail:
sr_id = _new_sr()
async with SessionLocal() as db:
sr = SRRequest(
sr_id = sr_id,
inst_id = job.inst_id,
sr_type = SRType.LOG,
title = f"[배치 실패] {job.job_name}",
description = (
f"배치 작업 실패\n"
f"작업명: {job.job_name}\n"
f"실행자: {triggered_by}\n"
f"오류: {error_msg or ''}\n"
f"출력(마지막 20줄):\n{chr(10).join((stdout or '').splitlines()[-20:])}"
),
status = SRStatus.RECEIVED,
priority = Priority.HIGH,
requested_by = "batch-scheduler",
target_server= srv.server_name if srv else "",
)
db.add(sr)
await db.commit()
logger.warning(
"배치 실패 SR 자동 생성: job=%s sr_id=%s result=%s",
job.job_name, sr_id, result,
)
# BatchRun 업데이트
async with SessionLocal() as db:
run = (await db.execute(
select(BatchRun).where(BatchRun.id == run_id)
)).scalars().first()
if run:
run.ended_at = datetime.now()
run.result = result
run.exit_code = exit_code
run.stdout_tail = stdout_tail
run.error_msg = error_msg
run.sr_id = sr_id
await db.commit()
# BatchJob 마지막 실행 정보 업데이트
job_obj = (await db.execute(
select(BatchJob).where(BatchJob.id == job_id)
)).scalars().first()
if job_obj:
job_obj.last_run_at = datetime.now()
job_obj.last_result = result
await db.commit()
logger.info(
"배치 실행 완료: job=%s run_id=%d result=%s",
job.job_name, run_id, result,
)
except Exception as exc:
logger.error("배치 실행 중 예외 발생: job_id=%d run_id=%d err=%s", job_id, run_id, exc, exc_info=True)
try:
async with SessionLocal() as db:
run = (await db.execute(
select(BatchRun).where(BatchRun.id == run_id)
)).scalars().first()
if run:
run.result = BatchRunResult.FAILED
run.ended_at = datetime.now()
run.error_msg = f"내부 오류: {str(exc)[:200]}"
await db.commit()
except Exception:
pass