# zioinfo-mail/workspace/guardia-itsm/routers/kb.py

"""
Knowledge Base router — on-premise 기술 문서 검색 + SR 자동 추천.

외부 API 없이 순수 Python 키워드 매칭으로 RAG 유사 기능 구현:
  1. 쿼리/문서 토크나이징
  2. 키워드 히트 수 + 위치 가중치 기반 스코어링
  3. SR 설명 + 작업 로그 에러를 합산해 자동 추천
"""
import re
from typing import List, Optional

from fastapi import APIRouter, Depends, Query
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from core.auth import get_current_user
from database import get_db
from models import KBDocument, KBDocumentOut, KBSearchResult, SRRequest, User, WorkLog

# Router collecting every knowledge-base endpoint; all paths are prefixed with /api/kb.
router = APIRouter(prefix="/api/kb", tags=["kb"])

# ── Stopwords shared by Korean and English text ──────────────────────────────
# Tokens listed here are dropped by _tokenize(): Korean particles, conjunctions
# and common verbs, plus English articles, prepositions, conjunctions and
# forms of "be".
_STOPWORDS = {
    "이", "가", "을", "를", "의", "에", "에서", "와", "과", "도", "는", "은",
    "이다", "있다", "하다", "되다", "않다", "그", "및", "또는", "또", "등",
    "the", "a", "an", "is", "are", "was", "were", "in", "on", "at", "to",
    "for", "of", "and", "or", "not", "with", "by", "from", "be",
}

# Tokens shorter than this many characters are discarded by _tokenize().
_MIN_TOKEN_LEN = 2


def _tokenize(text: str) -> list[str]:
    """Split *text* into lowercase keyword tokens.

    The text is lowercased and split on runs of whitespace and ASCII
    punctuation.  Underscores are not separators, so identifiers such as
    ``max_connections`` stay intact; hyphens are, so ``ORA-00942`` yields
    ``ora`` and ``00942``.

    Args:
        text: Free-form query or document text; may be empty or ``None``.

    Returns:
        Tokens in their original order (duplicates kept), excluding those
        shorter than ``_MIN_TOKEN_LEN`` characters or listed in
        ``_STOPWORDS``.  An empty list for empty input.
    """
    if not text:
        return []
    # "?" is a separator just like "!", so a question such as "timeout?"
    # yields the bare keyword "timeout" instead of the unmatchable "timeout?".
    pieces = re.split(r'[\s,;:.(){}\[\]<>/\\|&!?@#$%^*+=~`\-\'\"]+', text.lower())
    return [p for p in pieces if len(p) >= _MIN_TOKEN_LEN and p not in _STOPWORDS]


def _score_doc(query_tokens: list[str], doc: KBDocument) -> tuple[float, list[str]]:
    """Rate how well *doc* matches the query tokens.

    Every token is tested as a substring of each lowercased document field,
    and each (token, field) hit adds that field's weight:

    * ``title`` / ``tags``: 2.0
    * ``symptoms`` / ``cause``: 1.5
    * ``solution`` / ``commands``: 1.0

    The sum is divided by ``2.0 * len(query_tokens)`` and capped at 1.0, so
    a token found in several fields can saturate the score.

    Args:
        query_tokens: Search tokens; a repeated token is counted each time.
        doc: Document whose text fields are searched; falsy fields are
            treated as empty strings.

    Returns:
        ``(score, matched)``: ``score`` is rounded to four decimals and lies
        between 0.0 and 1.0; ``matched`` is the sorted list of distinct
        tokens that hit at least one field.  ``(0.0, [])`` for an empty
        token list.
    """
    if not query_tokens:
        return 0.0, []

    # (field text, weight) pairs, most significant fields first.
    field_weights = (
        (doc.title, 2.0),
        (doc.tags, 2.0),
        (doc.symptoms, 1.5),
        (doc.cause, 1.5),
        (doc.solution, 1.0),
        (doc.commands, 1.0),
    )

    total = 0.0
    hit_tokens: set[str] = set()
    for raw_text, weight in field_weights:
        haystack = (raw_text or "").lower()
        hits = [tok for tok in query_tokens if tok in haystack]
        total += weight * len(hits)
        hit_tokens.update(hits)

    # Reference ceiling: every token hitting exactly one weight-2.0 field.
    ceiling = max(2.0 * len(query_tokens), 1.0)
    score = min(total / ceiling, 1.0)
    return round(score, 4), sorted(hit_tokens)


async def _search_docs(
    db: AsyncSession,
    query: str,
    sr_type_hint: Optional[str] = None,
    limit: int = 5,
) -> list[dict]:
    """Rank all KB documents against *query* and return the best matches.

    Shared by the search endpoint and the SR recommendation endpoint.

    Args:
        db: Session used to load the ``KBDocument`` rows.
        query: Free-form text; tokenized with ``_tokenize``.
        sr_type_hint: When given, documents whose ``sr_type`` equals it get
            a 0.15 bonus (the score stays capped at 1.0).
        limit: Maximum number of hits returned.

    Returns:
        Up to ``limit`` dicts with keys ``"doc"``, ``"score"`` and
        ``"matched"``, highest score first.  Empty when the query yields no
        tokens; documents scoring 0 are left out.
    """
    query_tokens = _tokenize(query)
    if not query_tokens:
        return []

    rows = await db.execute(select(KBDocument))

    ranked = []
    for candidate in rows.scalars().all():
        relevance, keywords = _score_doc(query_tokens, candidate)
        if relevance > 0:
            # Boost documents filed under the same SR type as the hint.
            if sr_type_hint and candidate.sr_type == sr_type_hint:
                relevance = min(relevance + 0.15, 1.0)
            ranked.append({"doc": candidate, "score": relevance, "matched": keywords})

    # Stable descending sort: equally scored documents keep their load order.
    return sorted(ranked, key=lambda hit: hit["score"], reverse=True)[:limit]


# ── Endpoints ──────────────────────────────────────────────────────────────────

@router.get("", response_model=List[KBSearchResult])
async def search_kb(
    q: str = Query(..., min_length=2, description="검색 키워드"),
    limit: int = Query(5, ge=1, le=20),
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """Keyword search: return the KB documents most relevant to ``q``.

    Args:
        q: Search keywords (at least two characters).
        limit: Maximum number of results, between 1 and 20 (default 5).
        db: Database session injected via ``get_db``.
        _u: Resolved through ``get_current_user``; the value itself is unused.

    Returns:
        ``KBSearchResult`` items in the order produced by ``_search_docs``.
    """

    def to_result(hit: dict) -> KBSearchResult:
        # Convert one internal hit dict into the response schema.
        return KBSearchResult(
            doc=KBDocumentOut.model_validate(hit["doc"]),
            score=hit["score"],
            matched_keywords=hit["matched"],
        )

    return [to_result(hit) for hit in await _search_docs(db, q, limit=limit)]


@router.get("/suggest/{sr_id}", response_model=List[KBSearchResult])
async def suggest_for_sr(
    sr_id: str,
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """Recommend up to three KB documents for a service request (SR).

    The search text is the SR's title and description plus every error or
    warning line found in the ``result`` of its ten most recent work logs.
    The SR's ``sr_type`` is passed along as a ranking hint.

    Args:
        sr_id: Identifier of the SR to find recommendations for.
        db: Database session injected via ``get_db``.
        _u: Resolved through ``get_current_user``; the value itself is unused.

    Returns:
        Up to three ``KBSearchResult`` items in the order produced by
        ``_search_docs``; an empty list when no SR has the given ``sr_id``.
    """
    sr_rows = await db.execute(select(SRRequest).where(SRRequest.sr_id == sr_id))
    sr = sr_rows.scalars().first()
    if not sr:
        return []

    # Substrings (compared case-insensitively) that mark a log line as a problem.
    problem_markers = ("ERROR", "WARN", "ORA-", "EXCEPTION", "FAILED")

    # Start from the SR's own text, then append the problem lines of each log.
    fragments = [sr.title or "", sr.description or ""]

    recent_logs = await db.execute(
        select(WorkLog)
        .where(WorkLog.sr_id == sr_id)
        .order_by(WorkLog.created_at.desc())
        .limit(10)
    )
    for entry in recent_logs.scalars().all():
        for log_line in (entry.result or "").splitlines():
            uppercased = log_line.upper()
            if any(marker in uppercased for marker in problem_markers):
                fragments.append(log_line)

    hits = await _search_docs(
        db, " ".join(fragments), sr_type_hint=sr.sr_type, limit=3
    )

    suggestions = []
    for hit in hits:
        suggestions.append(
            KBSearchResult(
                doc=KBDocumentOut.model_validate(hit["doc"]),
                score=hit["score"],
                matched_keywords=hit["matched"],
            )
        )
    return suggestions


@router.get("/list", response_model=List[KBDocumentOut])
async def list_kb(
    category: Optional[str] = Query(None),
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """List KB documents ordered by category, then by document ID.

    Args:
        category: Optional exact-match category filter; when omitted or
            empty, every document is returned.
        db: Database session injected via ``get_db``.
        _u: Resolved through ``get_current_user``; the value itself is unused.

    Returns:
        The matching ``KBDocument`` rows, serialized through
        ``KBDocumentOut`` by the route's response model.
    """
    ordering = (KBDocument.category, KBDocument.doc_id)
    stmt = select(KBDocument).order_by(*ordering)
    if category:
        stmt = stmt.where(KBDocument.category == category)

    rows = await db.execute(stmt)
    documents = rows.scalars()
    return documents.all()