# guardia-itsm/core/llm_client.py

"""
GUARDiA ITSM — 로컬 LLM 클라이언트 (Ollama 래퍼)

온프레미스 보안 정책: 외부 AI/LLM API 완전 금지.
모든 추론은 localhost:11434 (Ollama) 에서 처리한다.

사용 모델:
  - guardia-agent    : GUARDiA 전용 파인튜닝 (권장)
  - llama3.1:8b      : 일반 에이전트 (fallback)
  - codellama:7b     : 코드 생성 에이전트

외부 호출 방지 확인:
  get_llm_client() 는 항상 OllamaClient 반환.
  Claude/OpenAI 등 외부 Provider는 개발·테스트 환경에서만 --dev-mode 플래그로 활성화.
"""
from __future__ import annotations

import json
import logging
import os
from dataclasses import dataclass, field
from typing import Optional

import httpx

logger = logging.getLogger(__name__)

# ── Configuration ────────────────────────────────────────────────────────────

# Root URL of the Ollama HTTP API; overridable through OLLAMA_BASE_URL.
OLLAMA_BASE_URL: str = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
# Model requested by default; overridable through GUARDIA_LLM_MODEL.
DEFAULT_MODEL: str = os.environ.get("GUARDIA_LLM_MODEL", "guardia-agent")
# Model used when the requested one cannot be resolved.
FALLBACK_MODEL: str = "llama3.1:8b"
# Per-request timeout in seconds; overridable through LLM_TIMEOUT_SEC.
REQUEST_TIMEOUT: float = float(os.environ.get("LLM_TIMEOUT_SEC", "120"))

# Enables external APIs in development only (must be False for on-premises
# deployments). True only when GUARDIA_DEV_MODE equals "true", case-insensitively.
# NOTE(review): not referenced anywhere else in the visible part of this module.
_DEV_MODE: bool = os.environ.get("GUARDIA_DEV_MODE", "false").lower() == "true"


# ── 데이터 클래스 ────────────────────────────────────────────────────────────

@dataclass
class LLMResponse:
    """Result of a single LLM call: generated text, model used, token counts."""

    # Text produced by the model.
    content: str
    # Name of the model that served the request.
    model: str
    # Number of prompt tokens reported for the request (0 when unknown).
    tokens_prompt: int = 0
    # Number of generated tokens reported for the request (0 when unknown).
    tokens_completion: int = 0

    @property
    def tokens_total(self) -> int:
        """Return the sum of prompt and completion token counts."""
        prompt_count, completion_count = self.tokens_prompt, self.tokens_completion
        return prompt_count + completion_count


@dataclass
class ModelInfo:
    """Description of one model entry as listed by the Ollama server."""

    # Model name as reported by the server.
    name: str
    # Value of the entry's "size" field (0 when absent).
    size: int = 0
    # Value of the entry's "modified_at" field ("" when absent).
    modified_at: str = ""


# ── Ollama 클라이언트 ────────────────────────────────────────────────────────

class OllamaClient:
    """
    Ollama 로컬 LLM 클라이언트.
    외부 API 호출 없음 — 모든 추론은 localhost:11434.
    """

    def __init__(self, base_url: str = OLLAMA_BASE_URL) -> None:
        self.base_url = base_url.rstrip("/")

    # ── 헬스체크 ─────────────────────────────────────────────────────────────

    async def health_check(self) -> bool:
        """Ollama 서버 응답 여부 확인."""
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                r = await client.get(f"{self.base_url}/api/tags")
                return r.status_code == 200
        except Exception:
            return False

    async def list_models(self) -> list[ModelInfo]:
        """설치된 모델 목록 반환."""
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                r = await client.get(f"{self.base_url}/api/tags")
                r.raise_for_status()
                return [
                    ModelInfo(
                        name=m.get("name", ""),
                        size=m.get("size", 0),
                        modified_at=m.get("modified_at", ""),
                    )
                    for m in r.json().get("models", [])
                ]
        except Exception as exc:
            logger.error("Ollama 모델 목록 조회 실패: %s", exc)
            return []

    async def resolve_model(self, preferred: str) -> str:
        """
        요청 모델이 설치 여부 확인 → 없으면 fallback 모델 반환.
        """
        available = [m.name for m in await self.list_models()]
        if not available:
            logger.warning("Ollama 모델 없음 — fallback: %s", FALLBACK_MODEL)
            return FALLBACK_MODEL
        if preferred in available:
            return preferred
        # 접두사 매칭 (e.g. "llama3.1" → "llama3.1:8b")
        base = preferred.split(":")[0]
        matched = next((m for m in available if m.startswith(base)), None)
        if matched:
            logger.info("모델 '%s' → '%s' 으로 대체", preferred, matched)
            return matched
        logger.warning("모델 '%s' 미설치 — fallback: %s", preferred, FALLBACK_MODEL)
        return FALLBACK_MODEL

    # ── 추론 메서드 ───────────────────────────────────────────────────────────

    async def chat(
        self,
        messages: list[dict],
        model: str = DEFAULT_MODEL,
        temperature: float = 0.2,
        timeout: float = REQUEST_TIMEOUT,
    ) -> LLMResponse:
        """
        채팅 완성 (messages 리스트 형식).
        messages: [{"role": "system"|"user"|"assistant", "content": "..."}]
        """
        resolved = await self.resolve_model(model)
        payload = {
            "model": resolved,
            "messages": messages,
            "stream": False,
            "options": {
                "temperature": temperature,
                "num_predict": 2048,
            },
        }
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(
                f"{self.base_url}/api/chat",
                json=payload,
            )
            resp.raise_for_status()
            data = resp.json()

        content = data.get("message", {}).get("content", "")
        return LLMResponse(
            content=content,
            model=resolved,
            tokens_prompt=data.get("prompt_eval_count", 0),
            tokens_completion=data.get("eval_count", 0),
        )

    async def generate(
        self,
        prompt: str,
        model: str = DEFAULT_MODEL,
        system: Optional[str] = None,
        temperature: float = 0.2,
        timeout: float = REQUEST_TIMEOUT,
    ) -> LLMResponse:
        """단일 프롬프트 → 응답."""
        messages: list[dict] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        return await self.chat(messages=messages, model=model, temperature=temperature, timeout=timeout)

    async def json_generate(
        self,
        prompt: str,
        model: str = DEFAULT_MODEL,
        system: Optional[str] = None,
        timeout: float = REQUEST_TIMEOUT,
    ) -> dict:
        """
        JSON 응답 전용 메서드.
        모델 응답에서 JSON 블록을 추출하여 dict로 반환.
        실패 시 빈 dict 반환 (예외 미발생).
        """
        full_system = (system or "") + (
            "\n\n반드시 순수 JSON만 응답하세요. 코드블록(```)이나 설명 없이 JSON 객체만 출력하세요."
        )
        resp = await self.generate(
            prompt=prompt,
            model=model,
            system=full_system,
            temperature=0.1,
            timeout=timeout,
        )
        raw = resp.content.strip()

        # 코드블록 제거
        if raw.startswith("```"):
            lines = raw.split("\n")
            raw = "\n".join(
                l for l in lines
                if not l.strip().startswith("```")
            ).strip()

        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            # 첫 번째 { ... } 블록 추출 시도
            start = raw.find("{")
            end = raw.rfind("}")
            if start != -1 and end != -1:
                try:
                    return json.loads(raw[start : end + 1])
                except json.JSONDecodeError:
                    pass
            logger.warning("LLM JSON 파싱 실패 (model=%s): %s", model, raw[:200])
            return {}

    async def pull_model(self, model: str) -> bool:
        """모델 다운로드 (비동기 스트리밍)."""
        try:
            async with httpx.AsyncClient(timeout=600.0) as client:
                async with client.stream(
                    "POST", f"{self.base_url}/api/pull",
                    json={"name": model},
                ) as resp:
                    async for line in resp.aiter_lines():
                        if line:
                            data = json.loads(line)
                            status = data.get("status", "")
                            logger.info("모델 다운로드 [%s]: %s", model, status)
                            if status == "success":
                                return True
            return False
        except Exception as exc:
            logger.error("모델 다운로드 실패 [%s]: %s", model, exc)
            return False

    async def fine_tune(self, dataset_path: str, model_name: str) -> bool:
        """
        Ollama 커스텀 모델 파인튜닝 실행.

        tb_agent_task(COMPLETED) 데이터를 JSONL로 내보내고
        Modelfile을 생성하여 ollama create 명령으로 새 모델을 생성한다.

        보안: 외부 API 호출 없음 — Ollama localhost 전용.

        Args:
            dataset_path: JSONL 파인튜닝 데이터셋 경로
                          (예: /opt/guardia/finetune/guardia-agent-v2.jsonl)
            model_name:   생성할 모델 이름 (예: guardia-agent-v2)

        Returns:
            True: 파인튜닝 성공, False: 실패
        """
        import asyncio
        import os
        from pathlib import Path

        dataset = Path(dataset_path)
        if not dataset.exists():
            logger.error("[fine_tune] 데이터셋 파일 없음: %s", dataset_path)
            return False

        # Modelfile 생성
        modelfile_path = dataset.parent / f"Modelfile.{model_name}"
        base_model = DEFAULT_MODEL or "llama3.1:8b"
        modelfile_content = (
            f"FROM {base_model}\n"
            f"TRAIN {dataset_path}\n"
            f"PARAMETER temperature 0.1\n"
            f'SYSTEM "당신은 GUARDiA ITSM 전문 AI 에이전트입니다. 보안 규칙을 항상 준수하세요."\n'
        )
        modelfile_path.write_text(modelfile_content, encoding="utf-8")

        logger.info("[fine_tune] 파인튜닝 시작: model=%s dataset=%s", model_name, dataset_path)

        try:
            proc = await asyncio.create_subprocess_exec(
                "ollama", "create", model_name,
                "-f", str(modelfile_path),
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, stderr = await asyncio.wait_for(
                proc.communicate(),
                timeout=3600,  # 1시간
            )

            if proc.returncode == 0:
                logger.info("[fine_tune] 파인튜닝 완료: model=%s", model_name)
                return True
            else:
                err = stderr.decode("utf-8", errors="replace")[:500]
                logger.error("[fine_tune] 파인튜닝 실패: model=%s err=%s", model_name, err)
                return False

        except asyncio.TimeoutError:
            logger.error("[fine_tune] 파인튜닝 타임아웃 (1시간 초과): model=%s", model_name)
            if "proc" in dir():
                proc.kill()
            return False
        except Exception as exc:
            logger.error("[fine_tune] 파인튜닝 예외: model=%s err=%s", model_name, exc)
            return False

    async def export_finetune_dataset(self, output_path: str, limit: int = 1000) -> int:
        """
        tb_agent_task(COMPLETED) 데이터를 Ollama 파인튜닝용 JSONL로 내보낸다.

        Args:
            output_path: 저장할 JSONL 파일 경로
            limit:       최대 레코드 수

        Returns:
            내보낸 레코드 수
        """
        import json
        from pathlib import Path

        try:
            from database import SessionLocal
            from models import AgentTask, AgentTaskStatus
            from sqlalchemy import select

            async with SessionLocal() as db:
                tasks = (await db.execute(
                    select(AgentTask).where(
                        AgentTask.status == AgentTaskStatus.COMPLETED,
                        AgentTask.tokens_used > 0,
                    ).limit(limit)
                )).scalars().all()

            out_path = Path(output_path)
            out_path.parent.mkdir(parents=True, exist_ok=True)

            count = 0
            with out_path.open("w", encoding="utf-8") as f:
                for task in tasks:
                    if not task.input_data or not task.output_data:
                        continue
                    record = {
                        "prompt": json.dumps(task.input_data, ensure_ascii=False),
                        "response": json.dumps(task.output_data, ensure_ascii=False),
                    }
                    f.write(json.dumps(record, ensure_ascii=False) + "\n")
                    count += 1

            logger.info("[export_finetune] %d건 내보내기 완료: %s", count, output_path)
            return count

        except Exception as exc:
            logger.error("[export_finetune] 내보내기 실패: %s", exc)
            return 0


# ── 싱글턴 ──────────────────────────────────────────────────────────────────

# Lazily created module-wide client instance.
_client: Optional[OllamaClient] = None


def get_llm_client() -> OllamaClient:
    """Return the shared ``OllamaClient``, creating it on first use.

    The instance is built with ``OLLAMA_BASE_URL`` and cached in the
    module-level ``_client`` variable; later calls return the same object.
    """
    global _client
    if _client is not None:
        return _client

    _client = OllamaClient(base_url=OLLAMA_BASE_URL)
    logger.info("LLM 클라이언트 초기화: %s (모델: %s)", OLLAMA_BASE_URL, DEFAULT_MODEL)
    return _client