guardia-itsm/routers/auto_finetune.py

"""미래 준비 — LoRA 자동 파인튜닝 파이프라인 관리."""
from __future__ import annotations
import logging
from datetime import datetime
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy import select, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user, require_admin_role
from database import get_db
from models import User, AutoFinetuneJob

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Every endpoint declared in this file is mounted under /api/finetune.
router = APIRouter(prefix="/api/finetune", tags=["미래준비-파인튜닝"])

# Minimum number of collected training samples required to run fine-tuning.
# With fewer samples the background runner does not train and records an
# "insufficient data" message on the job instead.
MIN_DATASET_SIZE = 50


class FinetuneRequest(BaseModel):
    """Request body accepted by the ``POST /api/finetune/start`` endpoint."""
    # Base model name; stored on the job row when the job is created.
    model_name: str = "llama3"
    # Number of training epochs; stored on the job row when the job is created.
    epochs: int = 3
    # Free-form note. NOTE(review): not read anywhere in this file — confirm
    # whether it is meant to be persisted.
    note: str = ""


def _parse_last_loss(output: str) -> float | None:
    """Return the last loss value reported in the trainer output, or ``None``.

    Matching is case-insensitive and accepts the usual spellings such as
    ``loss=0.12``, ``loss: 0.12`` and ``'loss': 0.12``. A token following
    "loss" that is not a valid float is ignored.
    """
    import re

    last = None
    pattern = r"loss['\"]?\s*[:=]?\s*([^\s,;)}\]]+)"
    for match in re.finditer(pattern, output, re.IGNORECASE):
        try:
            last = float(match.group(1))
        except ValueError:
            continue
    return last


async def _run_finetune(job_id: int) -> None:
    """Run one LoRA fine-tuning job end to end and record its outcome.

    Steps:
        1. Mark the job ``running`` and stamp ``started_at``.
        2. Collect up to 500 service requests whose status is "완료". A
           failure while collecting is logged and treated as an empty dataset.
        3. If at least ``MIN_DATASET_SIZE`` samples were found, dump them to a
           temporary JSONL file and run the external training script with a
           one-hour timeout.
        4. Persist status, dataset size, loss, error message and
           ``finished_at`` on the job row.
        5. On success, post a completion message to the messenger webhook
           (best effort).

    Final status values:
        ``success`` — the training script exited with code 0.
        ``skipped`` — fewer than ``MIN_DATASET_SIZE`` samples were available.
        ``failed``  — the script exited non-zero, timed out, or raised.

    Args:
        job_id: Primary key of the ``AutoFinetuneJob`` row to execute. If no
            such row exists the function returns without doing anything.
    """
    import asyncio
    import contextlib
    import json
    import os
    import tempfile

    # Kept as a function-local import, as in the original code.
    from database import AsyncSessionLocal

    timeout_seconds = 3600

    async with AsyncSessionLocal() as db:
        job = await db.get(AutoFinetuneJob, job_id)
        if not job:
            return
        # Copy what is needed later while the instance is still attached:
        # once the session commits and closes, the ORM object may be expired
        # and reading its attributes can raise.
        model_name = job.model_name
        epochs = job.epochs
        job.status = "running"
        job.started_at = datetime.utcnow()
        await db.commit()

    # Collect the service-request history used as training data.
    try:
        from models import ServiceRequest
        async with AsyncSessionLocal() as db:
            rows = await db.execute(
                select(ServiceRequest).where(ServiceRequest.status == "완료").limit(500)
            )
            samples = rows.scalars().all()
    except Exception:
        # Best effort: handled as "no data" below, but leave a trace so a
        # database problem is not mistaken for a genuinely empty dataset.
        logger.exception("SR 이력 데이터 수집 실패")
        samples = []

    dataset_size = len(samples)
    success = False
    skipped = False
    loss = None
    error_msg = None

    if dataset_size < MIN_DATASET_SIZE:
        skipped = True
        error_msg = f"데이터 부족: {dataset_size}개 (최소 {MIN_DATASET_SIZE}개 필요). 다음 달 재시도 예정."
        logger.warning(error_msg)
    else:
        proc = None
        dataset_path = None
        try:
            dataset_file = tempfile.NamedTemporaryFile(
                mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
            )
            dataset_path = dataset_file.name
            with dataset_file:
                # The query above is already limited to 500 rows.
                for sr in samples:
                    dataset_file.write(json.dumps({
                        "prompt": getattr(sr, "description", ""),
                        "completion": getattr(sr, "resolution", ""),
                    }, ensure_ascii=False) + "\n")

            proc = await asyncio.create_subprocess_exec(
                "python3", "/opt/guardia/scripts/lora_finetune.py",
                "--model", model_name,
                "--dataset", dataset_path,
                "--epochs", str(epochs),
                "--output", f"/opt/guardia/models/{model_name}-finetuned",
                stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT,
            )
            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout_seconds)
            output = stdout.decode("utf-8", "replace")
            success = proc.returncode == 0
            loss = _parse_last_loss(output)
            if not success:
                # Keep only a short tail so the stored message stays small.
                # NOTE(review): the size of the error_msg column is not
                # visible here — confirm 200 characters fits.
                tail = output.strip()[-200:]
                error_msg = f"종료 코드 {proc.returncode}: {tail}"
                logger.error("파인튜닝 실패: %s", error_msg)
        except asyncio.TimeoutError:
            # str(TimeoutError()) is empty, so spell the reason out.
            error_msg = f"파인튜닝 시간 초과 ({timeout_seconds}초)"
            logger.error("파인튜닝 실패: %s", error_msg)
        except Exception as e:
            error_msg = str(e) or type(e).__name__
            logger.error("파인튜닝 실패: %s", e)
        finally:
            if proc is not None and proc.returncode is None:
                # communicate() was interrupted (timeout or cancellation);
                # do not leave the trainer running as an orphan.
                with contextlib.suppress(ProcessLookupError):
                    proc.kill()
                await proc.wait()
            if dataset_path is not None:
                # Always remove the temporary dataset, on failure paths too.
                with contextlib.suppress(OSError):
                    os.unlink(dataset_path)

    if success:
        status = "success"
    elif skipped:
        status = "skipped"
    else:
        status = "failed"

    async with AsyncSessionLocal() as db:
        job = await db.get(AutoFinetuneJob, job_id)
        if job:
            job.status = status
            job.dataset_size = dataset_size
            job.loss = loss
            job.error_msg = error_msg
            job.finished_at = datetime.utcnow()
            await db.commit()

    if success:
        # The trainer may not have printed any loss line; formatting None
        # with ".4f" would raise and the notification would never be sent.
        loss_text = f"{loss:.4f}" if loss is not None else "n/a"
        try:
            import httpx
            async with httpx.AsyncClient(timeout=5) as c:
                await c.post("http://127.0.0.1:9001/api/messenger/webhook", json={
                    "event": "finetune_complete",
                    "room": "ops",
                    "message": f"🧠 LoRA 파인튜닝 완료: {model_name} (loss={loss_text})",
                })
        except Exception:
            # The notification is best effort; never fail the job because of it.
            logger.warning("파인튜닝 완료 알림 전송 실패", exc_info=True)


@router.post("/start")
async def start_finetune(
    req: FinetuneRequest,
    background_tasks: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(require_admin_role),
):
    """Create a LoRA fine-tuning job and schedule it to run in the background.

    Args:
        req: Requested model name and epoch count.
        background_tasks: FastAPI task queue that will run ``_run_finetune``.
        db: Database session used to insert the job row.
        user: Resolved through the ``require_admin_role`` dependency; not
            otherwise used in the body.

    Returns:
        A dict with ``ok``, the new ``job_id`` and a status ``message``.

    Raises:
        HTTPException: 422 if ``epochs`` is less than 1, or if ``model_name``
            is blank, starts with ``/`` or ``-``, or contains a ``..`` path
            segment.
    """
    if req.epochs < 1:
        raise HTTPException(422, "epochs는 1 이상이어야 합니다")

    # The background runner interpolates model_name into the output directory
    # path and passes it as a command-line argument, so reject values that
    # could escape the models directory or be mistaken for an option.
    name = req.model_name
    if (
        not name.strip()
        or name.startswith(("/", "-"))
        or ".." in name.split("/")
    ):
        raise HTTPException(422, "유효하지 않은 model_name")

    job = AutoFinetuneJob(
        model_name=req.model_name, epochs=req.epochs,
        status="pending", created_at=datetime.utcnow(),
    )
    db.add(job)
    await db.commit()
    # Reload the row so the generated primary key is available.
    await db.refresh(job)
    background_tasks.add_task(_run_finetune, job.id)
    return {"ok": True, "job_id": job.id, "message": "파인튜닝 작업 시작됨 (백그라운드)"}


def _job_to_dict(job: AutoFinetuneJob) -> dict:
    """Serialize a job row into the dict shape returned by the job endpoints."""
    return {
        "id": job.id,
        "model_name": job.model_name,
        "status": job.status,
        "dataset_size": job.dataset_size,
        "loss": job.loss,
        "epochs": job.epochs,
        "started_at": job.started_at,
        "finished_at": job.finished_at,
        "error_msg": job.error_msg,
    }


@router.get("/jobs")
async def list_jobs(
    limit: int = 20,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the most recently created fine-tuning jobs, newest first.

    Args:
        limit: Maximum number of jobs to return (default 20).
        db: Database session.
        user: Resolved through the ``get_current_user`` dependency; not
            otherwise used in the body.

    Returns:
        A list of job dicts ordered by ``created_at`` descending.

    Raises:
        HTTPException: 422 if ``limit`` is negative.
    """
    # A negative LIMIT is a database error on some backends and means
    # "no limit" on others; reject it instead of passing it through.
    if limit < 0:
        raise HTTPException(422, "limit은 0 이상이어야 합니다")
    rows = await db.execute(
        select(AutoFinetuneJob).order_by(desc(AutoFinetuneJob.created_at)).limit(limit)
    )
    return [_job_to_dict(j) for j in rows.scalars().all()]


@router.get("/jobs/{job_id}")
async def get_job(
    job_id: int,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return a single fine-tuning job by primary key.

    Args:
        job_id: Primary key of the job to fetch.
        db: Database session.
        user: Resolved through the ``get_current_user`` dependency; not
            otherwise used in the body.

    Returns:
        The job serialized as a dict.

    Raises:
        HTTPException: 404 if no job with ``job_id`` exists.
    """
    job = await db.get(AutoFinetuneJob, job_id)
    if not job:
        raise HTTPException(404, "작업 없음")
    return _job_to_dict(job)