# guardia-itsm/core/orchestrator.py

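"""
B-5: Multi-agent collaboration orchestration engine.

Workflow types:
  SR_TO_DEPLOY   — SR intake → code review → change approval → deployment
  INCIDENT_RESP  — incident detection → assignee assignment → RCA → recovery
  CODE_REVIEW    — code review → vulnerability scan → report generation
  MAINTENANCE    — scheduled inspection → change management → completion report
  CUSTOM         — user-defined steps

Each agent performs its work through API calls.

Note: WORKFLOW_TEMPLATES in this module defines step templates only for
SR_TO_DEPLOY, INCIDENT_RESP and CODE_REVIEW.
"""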
from __future__ import annotations

import asyncio
import json
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional

import httpx
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

logger = logging.getLogger(__name__)

# ── Agent registry (internal API URLs) ───────────────────────────────────────

# Base URL of this FastAPI application itself.
_BASE = "http://localhost:8000"

# Maps agent name -> action name -> endpoint URL. Segments written as
# "{sr_id}", "{review_id}" or "{session_id}" are literal placeholders kept in
# the URL string; nothing in this table substitutes them.
AGENT_ACTIONS: Dict[str, Dict[str, str]] = {
    "sr-manager": {
        "create_sr": _BASE + "/api/tasks",
        "update_status": _BASE + "/api/tasks/{sr_id}/status",
        "assign": _BASE + "/api/assign/{sr_id}",
    },
    "code-reviewer": {
        "quick_scan": _BASE + "/api/code-review/quick-scan",
        "full_review": _BASE + "/api/code-review",
        "get_result": _BASE + "/api/code-review/{review_id}",
    },
    "deploy-engineer": {
        "create_session": _BASE + "/api/vibe",
        "trigger_build": _BASE + "/api/vibe/{session_id}/build",
        "request_approval": _BASE + "/api/vibe/{session_id}/request-approval",
    },
    "sla-guardian": {
        "check_sla": _BASE + "/api/sla/check",
    },
    "anomaly-detector": {
        "detect": _BASE + "/api/anomaly/detect",
        "get_events": _BASE + "/api/anomaly/events",
    },
    "kb-agent": {
        "analyze_sr": _BASE + "/api/kb-agent/analyze/{sr_id}",
        "run_batch": _BASE + "/api/kb-agent/run",
    },
}


# ── Workflow templates ───────────────────────────────────────────────────────

def _sr_to_deploy_steps() -> List[Dict]:
    """Build the step list for the SR -> code review -> deploy workflow.

    Returns:
        A freshly built list of seven step dicts. Each dict has ``order``
        (1-based position), ``agent_name``, ``action`` and ``description``;
        the full code review step additionally carries ``requires_llm: True``.
    """
    # (agent, action, description, needs an LLM) in execution order.
    plan = (
        ("sr-manager", "sr_status_in_progress", "SR 상태 IN_PROGRESS 전환", False),
        ("code-reviewer", "quick_security_scan", "빠른 보안 스캔 (즉시 실행)", False),
        ("code-reviewer", "full_code_review", "전체 코드 리뷰 (Ollama LLM)", True),
        ("deploy-engineer", "create_vibe_session", "배포 세션 생성", False),
        ("deploy-engineer", "trigger_build", "빌드 트리거", False),
        ("sr-manager", "sr_status_completed", "SR 완료 처리", False),
        ("kb-agent", "analyze_and_create_kb", "KB 자동 업데이트", False),
    )

    steps: List[Dict] = []
    for position, (agent, action, description, needs_llm) in enumerate(plan, start=1):
        step = {
            "order": position,
            "agent_name": agent,
            "action": action,
            "description": description,
        }
        # The flag is only present on steps that need it, never set to False.
        if needs_llm:
            step["requires_llm"] = True
        steps.append(step)
    return steps


def _incident_response_steps() -> List[Dict]:
    """Build the step list for the incident response workflow.

    Returns:
        A freshly built list of six step dicts, each with ``order`` (1-based
        position), ``agent_name``, ``action`` and ``description``.
    """
    columns = ("agent_name", "action", "description")
    # One row per step, in execution order.
    rows = [
        ("sr-manager", "create_incident_sr", "인시던트 SR 생성 (CRITICAL 우선순위)"),
        ("sr-manager", "assign_oncall_engineer", "온콜 엔지니어 즉시 배정"),
        ("anomaly-detector", "gather_metrics", "관련 메트릭 수집"),
        ("code-reviewer", "quick_security_scan", "보안 취약점 즉시 스캔"),
        ("sr-manager", "sr_status_completed", "인시던트 해결 완료 처리"),
        ("kb-agent", "analyze_and_create_kb", "인시던트 KB 문서 자동 생성"),
    ]
    return [
        {"order": number, **dict(zip(columns, row))}
        for number, row in enumerate(rows, start=1)
    ]


def _code_review_steps() -> List[Dict]:
    """Build the step list for the code review workflow.

    Returns:
        A freshly built list of three step dicts (quick scan, full review,
        KB update). The full review step carries ``requires_llm: True``.
    """
    quick_scan = dict(
        order=1,
        agent_name="code-reviewer",
        action="quick_security_scan",
        description="빠른 보안 스캔",
    )
    full_review = dict(
        order=2,
        agent_name="code-reviewer",
        action="full_code_review",
        description="전체 코드 리뷰",
    )
    kb_update = dict(
        order=3,
        agent_name="kb-agent",
        action="analyze_and_create_kb",
        description="리뷰 결과 KB 업데이트",
    )
    # Only the full review is flagged as needing an LLM.
    full_review["requires_llm"] = True
    return [quick_scan, full_review, kb_update]


# Step templates keyed by workflow type. Each builder is called once here, at
# import time, so every lookup of a given type returns the same list object.
WORKFLOW_TEMPLATES: Dict[str, List[Dict]] = dict(
    SR_TO_DEPLOY=_sr_to_deploy_steps(),
    INCIDENT_RESP=_incident_response_steps(),
    CODE_REVIEW=_code_review_steps(),
)


# ── Agent action execution ───────────────────────────────────────────────────

async def _execute_action(
    agent_name: str,
    action: str,
    context: Dict,
    timeout: int = 30,
) -> Dict:
    """Run one agent action, calling this application's own HTTP API if needed.

    Three (agent, action) combinations issue a real request:

    * ``sr-manager`` / ``sr_status_*``: PATCH the SR status.
    * ``code-reviewer`` / ``quick_security_scan``: POST a quick scan.
    * ``kb-agent`` / ``analyze_and_create_kb``: POST a KB analysis.

    Every other combination is reported as a simulated success without any
    network traffic.

    Args:
        agent_name: Name of the agent that owns the action.
        action: Action identifier taken from the workflow step.
        context: Workflow context; ``sr_id`` and ``project_dir`` are read.
        timeout: Timeout passed to ``httpx.AsyncClient``.

    Returns:
        A dict with ``success``, ``action``, ``agent`` and ``data``. ``data``
        holds the action outcome, or ``skipped`` with a reason when a required
        context value is missing or the API answered with a non-success
        status, or ``simulated`` when nothing was actually called. On an
        unexpected exception ``success`` is False and ``error`` holds the
        first 100 characters of the message.
    """
    sr_id = context.get("sr_id", "")
    project_dir = context.get("project_dir", "")

    result = {"success": True, "action": action, "agent": agent_name, "data": {}}
    data = result["data"]

    is_status_change = agent_name == "sr-manager" and action.startswith("sr_status_")
    is_quick_scan = agent_name == "code-reviewer" and action == "quick_security_scan"
    is_kb_analysis = agent_name == "kb-agent" and action == "analyze_and_create_kb"

    # Settle every outcome that needs no HTTP call before building a client.
    if is_status_change or is_kb_analysis:
        if not sr_id:
            data["skipped"] = "sr_id 없음"
            return result
    elif is_quick_scan:
        if not project_dir:
            data["skipped"] = "project_dir 없음"
            return result
    else:
        # Any other action: report a simulated success.
        data["simulated"] = True
        data["action"] = action
        return result

    try:
        async with httpx.AsyncClient(timeout=timeout) as client:

            if is_status_change:
                # Any sr_status_* action other than "in_progress" maps to COMPLETED.
                new_status = "IN_PROGRESS" if "in_progress" in action else "COMPLETED"
                resp = await client.patch(
                    f"{_BASE}/api/tasks/{sr_id}/status",
                    json={"status": new_status, "note": f"워크플로우 자동 처리: {new_status}"},
                )
                # Only claim the status changed when the API accepted the
                # request; otherwise record the failure like the other branches.
                if 200 <= resp.status_code < 300:
                    data["status_changed"] = new_status
                else:
                    data["skipped"] = f"상태 변경 실패 {resp.status_code}"

            elif is_quick_scan:
                resp = await client.post(
                    f"{_BASE}/api/code-review/quick-scan",
                    params={"project_dir": project_dir},
                )
                if resp.status_code == 200:
                    payload = resp.json()
                    data["findings"] = len(payload.get("findings", []))
                    data["critical"] = payload.get("critical_count", 0)
                else:
                    data["skipped"] = f"scan 실패 {resp.status_code}"

            else:  # is_kb_analysis
                resp = await client.post(
                    f"{_BASE}/api/kb-agent/analyze/{sr_id}",
                    params={"use_llm": "false"},
                )
                if resp.status_code == 200:
                    payload = resp.json()
                    data["kb_created"] = payload.get("created", False)
                    data["kb_id"] = payload.get("kb_id")
                else:
                    data["skipped"] = "KB 생성 실패"

    except (httpx.ConnectError, httpx.TimeoutException) as e:
        # The call to our own API could not connect or timed out: fall back to
        # simulation mode and keep the step successful.
        logger.debug("에이전트 액션 API 호출 실패 (%s.%s): %s", agent_name, action, e)
        data["simulated"] = True
        data["note"] = "API 미연결 → 시뮬레이션"

    except Exception as e:
        # Boundary handler: report the failure to the caller instead of raising.
        logger.exception("에이전트 액션 오류 (%s.%s): %s", agent_name, action, e)
        result["success"] = False
        result["error"] = str(e)[:100]

    return result


# ── Workflow execution engine ────────────────────────────────────────────────

async def execute_workflow(
    instance_id: int,
    steps: List[Dict],
    context: Dict,
    db_session_factory,
) -> None:
    """Run workflow steps sequentially, persisting progress after each one.

    The instance is marked RUNNING, then each step is marked RUNNING, executed
    through ``_execute_action`` and marked COMPLETED or FAILED. The first
    failed step marks the instance FAILED and stops the run; otherwise the
    instance is marked COMPLETED with the collected results.

    Args:
        instance_id: Primary key of the ``WorkflowInstance`` to run.
        steps: Step definitions with ``order``, ``agent_name`` and ``action``.
        context: Workflow context handed to every action. It is mutated in
            place: a successful step's data is stored under
            ``step_<order>_result`` so later steps can read it.
        db_session_factory: Callable returning an async context manager that
            yields a DB session; a fresh session is opened per update.

    Returns:
        None. If the instance row does not exist, an error is logged and
        nothing is executed.
    """
    from models import WorkflowInstance, WorkflowStep, WorkflowStatus

    async def _fetch_instance(db):
        """Load this workflow's instance row, or None when it is missing."""
        rows = await db.execute(
            select(WorkflowInstance).where(WorkflowInstance.id == instance_id)
        )
        return rows.scalars().first()

    async def _fetch_step(db, step_order):
        """Load the step row with the given order, or None when it is missing."""
        rows = await db.execute(
            select(WorkflowStep).where(
                WorkflowStep.instance_id == instance_id,
                WorkflowStep.step_order == step_order,
            )
        )
        return rows.scalars().first()

    async with db_session_factory() as db:
        instance = await _fetch_instance(db)

        if not instance:
            logger.error("워크플로우 인스턴스 %d 없음", instance_id)
            return

        instance.status = WorkflowStatus.RUNNING
        instance.started_at = datetime.utcnow()
        await db.commit()

    results = []

    for current_step, step_def in enumerate(steps, start=1):
        step_order = step_def["order"]
        agent_name = step_def["agent_name"]
        action = step_def["action"]

        # Mark the step as running before the action starts.
        async with db_session_factory() as db:
            step_row = await _fetch_step(db, step_order)
            if step_row:
                step_row.status = WorkflowStatus.RUNNING
                step_row.started_at = datetime.utcnow()
                await db.commit()

        # Execute the action and measure how long it took (whole seconds).
        start_ts = datetime.utcnow()
        action_result = await _execute_action(agent_name, action, context)
        duration = int((datetime.utcnow() - start_ts).total_seconds())
        succeeded = bool(action_result.get("success"))

        results.append(action_result)

        # Expose this step's data to the following steps.
        if succeeded and action_result.get("data"):
            context[f"step_{step_order}_result"] = action_result["data"]

        # Persist the step outcome and the instance progress in one commit so
        # a failed step and the FAILED instance state cannot diverge.
        async with db_session_factory() as db:
            step_row = await _fetch_step(db, step_order)
            instance = await _fetch_instance(db)

            if step_row:
                # Use the enum like every other status write in this function.
                step_row.status = (
                    WorkflowStatus.COMPLETED if succeeded else WorkflowStatus.FAILED
                )
                step_row.output_json = json.dumps(
                    action_result, ensure_ascii=False, default=str
                )
                step_row.completed_at = datetime.utcnow()
                step_row.duration_sec = duration
                if not succeeded:
                    step_row.error_msg = action_result.get("error", "")

            if instance:
                instance.current_step = current_step
                # Skip the percentage when total_steps is 0 or unset (it would
                # raise), and never report more than 100.
                if instance.total_steps:
                    instance.progress_pct = min(
                        100, int(current_step / instance.total_steps * 100)
                    )
                if not succeeded:
                    instance.status = WorkflowStatus.FAILED
                    instance.error_msg = f"Step {step_order} ({agent_name}.{action}) 실패"
                    instance.completed_at = datetime.utcnow()

            await db.commit()

        # A failed step aborts the rest of the workflow.
        if not succeeded:
            return

        # Short pause so consecutive API calls do not pile up.
        await asyncio.sleep(0.1)

    # All steps succeeded: finalize the instance.
    async with db_session_factory() as db:
        instance = await _fetch_instance(db)
        if instance:
            instance.status = WorkflowStatus.COMPLETED
            instance.progress_pct = 100
            instance.completed_at = datetime.utcnow()
            instance.result_json = json.dumps(
                {"steps": results, "context": context},
                ensure_ascii=False,
                default=str,
            )
            await db.commit()

    logger.info("워크플로우 %d 완료: %d 단계", instance_id, len(steps))


async def create_workflow_instance(
    db: AsyncSession,
    workflow_type: str,
    title: str,
    sr_id: Optional[str] = None,
    project_id: Optional[int] = None,
    context: Optional[Dict] = None,
    triggered_by: str = "system",
) -> "WorkflowInstance":
    """Create a workflow instance row plus one step row per template step.

    Args:
        db: Async session used to add, flush, commit and refresh the rows.
        workflow_type: Key into ``WORKFLOW_TEMPLATES``; an unknown type
            produces an instance with zero steps.
        title: Title stored on the instance.
        sr_id: Optional SR identifier stored on the instance and in the context.
        project_id: Optional project identifier, stored the same way.
        context: Extra context entries; they override ``sr_id`` and
            ``project_id`` in the stored context JSON on key collision.
        triggered_by: Value stored in the instance's ``triggered_by`` field.

    Returns:
        The committed and refreshed ``WorkflowInstance``.
    """
    from models import WorkflowInstance, WorkflowStep

    template = WORKFLOW_TEMPLATES.get(workflow_type, [])

    # Defaults first, then caller-supplied entries on top.
    merged_context = {"sr_id": sr_id or "", "project_id": project_id}
    merged_context.update(context or {})
    context_payload = json.dumps(merged_context, ensure_ascii=False, default=str)

    instance = WorkflowInstance(
        workflow_type=workflow_type,
        status="PENDING",
        title=title,
        sr_id=sr_id,
        project_id=project_id,
        triggered_by=triggered_by,
        context_json=context_payload,
        total_steps=len(template),
        progress_pct=0,
    )
    db.add(instance)
    # Flush before creating steps: the step rows below reference instance.id.
    await db.flush()

    for spec in template:
        step_input = json.dumps(
            {"description": spec.get("description", "")},
            ensure_ascii=False,
        )
        step_row = WorkflowStep(
            instance_id=instance.id,
            step_order=spec["order"],
            agent_name=spec["agent_name"],
            action=spec["action"],
            status="PENDING",
            input_json=step_input,
        )
        db.add(step_row)

    await db.commit()
    await db.refresh(instance)
    return instance