guardia-itsm/routers/upstage_ocr.py
2026-06-02 18:48:18 +09:00

473 lines
17 KiB
Python

"""
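Upstage Document AI OCR engine.
Integrates the Upstage APIs (Document Parse, Information Extraction, Document QA)
to convert PDF and image documents into structured data.
Endpoints:
POST /api/ocr/config — save the API key (AES-256-GCM encryption is planned; it is still a TODO in save_ocr_config)
GET /api/ocr/config — read the settings (key masked)
POST /api/ocr/parse — parse a document → structured JSON
POST /api/ocr/extract — extract information → key-value pairs (schema-based)
POST /api/ocr/qa — document QA → natural-language answer
POST /api/ocr/batch — batch-process multiple files
GET /api/ocr/history — OCR processing history
GET /api/ocr/usage — API usage summary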
"""
from __future__ import annotations
import json
import logging
import re
from datetime import datetime, date
from pathlib import Path
from typing import Optional
import httpx
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
from sqlalchemy import select, func, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user, require_admin_role
from database import get_db
from models import User, UpstageOCRConfig, OCRHistory
# Module logger and the router that mounts every endpoint below under /api/ocr.
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/ocr", tags=["Upstage OCR"])
# Base URL that the endpoint paths used in this module are appended to.
# NOTE(review): confirm this base path against the current Upstage API reference.
UPSTAGE_BASE = "https://api.upstage.ai/v1/document-ai"
# Upload size ceiling enforced by the endpoints (20 MB).
MAX_FILE_SIZE = 20 * 1024 * 1024  # 20MB
# Accepted upload extensions (lower-case, including the dot) mapped to the
# MIME type that is sent along with the file.
SUPPORTED_MIME = {
    ".pdf": "application/pdf",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".tiff": "image/tiff",
    ".tif": "image/tiff",
    ".bmp": "image/bmp",
    ".heic": "image/heic",
    ".webp": "image/webp",
}
# Sensitive-data masking patterns: (regex, replacement) pairs that
# _mask_sensitive applies one after another.
SENSITIVE_PATTERNS = [
    (r'\d{6}-[1-4]\d{6}', '######-#######'),  # resident registration number
    (r'(?<!\d)\d{4}[-\s]\d{4}[-\s]\d{4}[-\s]\d{4}', '****-****-****-****'),  # card number
    (r'(?<!\w)\d{3}-\d{4}-\d{4}(?!\w)', '***-****-****'),  # phone number
]
class OCRConfigCreate(BaseModel):
    """Request body used to save a tenant's Upstage OCR configuration."""

    # Required Upstage API key; must be at least 10 characters long.
    api_key: str = Field(default=..., min_length=10)
    # Model identifier; the description lists the two expected values.
    model: str = Field(
        default="document-parse",
        description="document-parse | document-parse-ocr",
    )
    # Daily page quota (the description reads "daily page limit"); must be >= 1.
    daily_limit: int = Field(default=1000, ge=1, description="일일 페이지 한도")
class ExtractRequest(BaseModel):
    """Body model that carries an extraction schema.

    Not referenced by the code visible in this file; the /extract endpoint
    receives its schema as a form field instead.
    """

    # Required mapping; per its description: {field name: explanation}.
    # NOTE(review): `schema` is also the name of a pydantic BaseModel
    # attribute - confirm the installed pydantic version accepts this field
    # name without an error or warning.
    schema: dict = Field(default=..., description="추출 스키마 {필드명: 설명}")
class QARequest(BaseModel):
    """Body model that carries a question about a document.

    Not referenced by the code visible in this file; the /qa endpoint
    receives its question as a form field instead.
    """

    # Required question text, between 3 and 500 characters.
    question: str = Field(default=..., min_length=3, max_length=500)
def _get_mime(filename: str) -> str:
    """Return the MIME type that matches the extension of ``filename``.

    The extension is lower-cased before it is looked up in ``SUPPORTED_MIME``.

    Raises:
        HTTPException: 400 when the extension has no entry in
            ``SUPPORTED_MIME``; the message lists the supported extensions.
    """
    extension = Path(filename).suffix.lower()
    mime_type = SUPPORTED_MIME.get(extension)
    if mime_type:
        return mime_type
    supported = ", ".join(SUPPORTED_MIME)
    raise HTTPException(400, f"지원하지 않는 파일 형식: {extension}. 지원: {supported}")
def _mask_sensitive(text: str) -> str:
    """Return ``text`` with every ``SENSITIVE_PATTERNS`` match replaced by its mask.

    The patterns are applied one after another, each on the output of the
    previous substitution.
    """
    masked = text
    for regex, mask in SENSITIVE_PATTERNS:
        masked = re.compile(regex).sub(mask, masked)
    return masked
async def _get_config(user: User, db: AsyncSession) -> UpstageOCRConfig:
    """Load the active Upstage configuration row of the caller's tenant.

    Raises:
        HTTPException: 404 when the tenant has no active configuration row.
    """
    stmt = select(UpstageOCRConfig).where(
        UpstageOCRConfig.tenant_id == user.tenant_id,
        # Column comparison that builds SQL, not a Python truth test.
        UpstageOCRConfig.is_active == True,  # noqa: E712
    )
    config = (await db.execute(stmt)).scalar_one_or_none()
    if config:
        return config
    raise HTTPException(404, "Upstage API Key 설정 필요. POST /api/ocr/config 에서 설정하세요.")
async def _check_limit(cfg: UpstageOCRConfig, db: AsyncSession) -> None:
    """Reject the request when the tenant has used up today's page quota.

    Sums ``OCRHistory.pages`` over the tenant's ``SUCCESS`` rows created since
    the start of the current UTC day and compares the total with
    ``cfg.daily_limit``.

    Raises:
        HTTPException: 429 when the pages used today reach or exceed the limit.
    """
    # History rows are stamped with datetime.utcnow() when they are saved, so
    # the day boundary must be UTC midnight too. A local-time midnight
    # (date.today()) is shifted against the stored timestamps by the server's
    # UTC offset, which lets usage go uncounted for part of the day.
    today_start = datetime.combine(datetime.utcnow().date(), datetime.min.time())
    used_row = await db.execute(
        select(func.sum(OCRHistory.pages)).where(
            OCRHistory.tenant_id == cfg.tenant_id,
            OCRHistory.created_at >= today_start,
            OCRHistory.status == "SUCCESS",
        )
    )
    # SUM over zero rows yields NULL/None; treat that as no usage.
    used = used_row.scalar() or 0
    if used >= cfg.daily_limit:
        raise HTTPException(429, f"일일 페이지 한도 초과: {used}/{cfg.daily_limit}. 내일 다시 시도하세요.")
async def _save_history(
    db: AsyncSession, tenant_id: int, user_id: int, filename: str,
    file_size: int, ocr_type: str, schema_used: Optional[str],
    result: dict, pages: int, status: str = "SUCCESS",
) -> int:
    """Insert one ``OCRHistory`` row, commit it and return its id.

    Only the ``content``, ``result``, ``answer``, ``usage`` and ``error`` keys
    of ``result`` are serialized. The JSON text is cut at 5000 characters, so a
    long payload may be stored as truncated (no longer parseable) JSON.
    ``tokens_used`` is read from ``result["usage"]["tokens"]`` when ``usage``
    is a dict and is 0 otherwise.
    """
    stored_keys = ("content", "result", "answer", "usage", "error")
    payload = {key: value for key, value in result.items() if key in stored_keys}
    serialized = json.dumps(payload, ensure_ascii=False)[:5000]

    usage = result.get("usage")
    tokens = usage.get("tokens", 0) if isinstance(usage, dict) else 0

    record = OCRHistory(
        tenant_id=tenant_id,
        filename=filename,
        file_size=file_size,
        ocr_type=ocr_type,
        schema_used=schema_used,
        result_json=serialized,
        pages=pages,
        tokens_used=tokens,
        status=status,
        created_by=user_id,
        created_at=datetime.utcnow(),
    )
    db.add(record)
    await db.commit()
    # Reload the row so the database-assigned id is available.
    await db.refresh(record)
    return record.id
# ── Endpoints ────────────────────────────────────────────────────────────────
@router.post("/config")
async def save_ocr_config(
    req: OCRConfigCreate,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(require_admin_role),
):
    """Validate and store the tenant's Upstage API key and OCR settings.

    The key is probed against the Upstage models endpoint first. A 401 answer
    rejects the request; a network failure is logged and the key is saved
    anyway (best effort). The tenant's existing row is updated in place, or a
    new active row is created.

    NOTE: the key is currently persisted exactly as received in
    ``api_key_enc``; the AES-256-GCM encryption is still a TODO.
    NOTE(review): updating an existing row does not touch ``is_active`` -
    confirm whether re-saving should reactivate a deactivated configuration.

    Returns:
        ``{"ok": True, "model": ..., "daily_limit": ...}``.

    Raises:
        HTTPException: 400 when Upstage answers the probe with 401.
    """
    # Probe the API key.
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            r = await client.get(
                "https://api.upstage.ai/v1/models",
                headers={"Authorization": f"Bearer {req.api_key}"},
            )
    except httpx.RequestError as exc:
        # Deliberate best effort: an unreachable Upstage must not block saving,
        # but the skipped validation should be visible in the logs.
        logger.warning(
            "Upstage API key validation skipped for tenant %s: %s: %s",
            user.tenant_id, type(exc).__name__, exc,
        )
    else:
        if r.status_code == 401:
            raise HTTPException(400, "유효하지 않은 Upstage API Key")

    row = await db.execute(
        select(UpstageOCRConfig).where(UpstageOCRConfig.tenant_id == user.tenant_id)
    )
    cfg = row.scalar_one_or_none()
    if cfg:
        cfg.api_key_enc = req.api_key  # TODO: AES-256-GCM encryption
        cfg.model = req.model
        cfg.daily_limit = req.daily_limit
    else:
        cfg = UpstageOCRConfig(
            tenant_id=user.tenant_id,
            api_key_enc=req.api_key,  # TODO: AES-256-GCM encryption
            model=req.model,
            daily_limit=req.daily_limit,
            is_active=True,
            created_at=datetime.utcnow(),
        )
        db.add(cfg)
    await db.commit()
    return {"ok": True, "model": req.model, "daily_limit": req.daily_limit}
@router.get("/config")
async def get_ocr_config(
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the tenant's OCR settings with the API key masked.

    Returns:
        ``{"configured": False}`` when the tenant has no configuration row,
        otherwise the masked key, model, daily limit and active flag.
    """
    row = await db.execute(
        select(UpstageOCRConfig).where(UpstageOCRConfig.tenant_id == user.tenant_id)
    )
    cfg = row.scalar_one_or_none()
    if not cfg:
        return {"configured": False}
    key = cfg.api_key_enc or ""
    # The mask reveals 10 characters (6-char prefix + 4-char suffix). Only do
    # that when at least half of the key stays hidden; previously an 11-char
    # key exposed 10 of its 11 characters.
    if len(key) >= 20:
        masked_key = f"{key[:6]}{'*' * (len(key) - 10)}{key[-4:]}"
    else:
        masked_key = "***"
    return {
        "configured": True,
        "api_key": masked_key,
        "model": cfg.model,
        "daily_limit": cfg.daily_limit,
        "is_active": cfg.is_active,
    }
@router.post("/parse")
async def parse_document(
    file: UploadFile = File(...),
    model: str = Form("document-parse"),
    output_formats: str = Form('["text", "html", "markdown"]'),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Parse an uploaded document with Upstage and return the structured JSON.

    The upload is size-checked, the tenant's configuration and daily quota are
    verified, and the file is posted to the document-digitization endpoint.
    Sensitive values in the returned text/markdown/html are masked, one
    history row is written, and the (masked) Upstage payload is returned with
    ``history_id`` and ``filename`` added. An Upstage failure is returned as a
    payload with an ``error`` key and recorded with status ``FAILED``.

    Raises:
        HTTPException: 413 for an oversized file, 400 for an unsupported
            extension, 404/429 from the configuration and quota checks,
            503 when Upstage cannot be reached.
    """
    file_bytes = await file.read()
    if len(file_bytes) > MAX_FILE_SIZE:
        raise HTTPException(413, f"파일 크기 초과: {len(file_bytes)//1024//1024}MB (최대 20MB)")
    cfg = await _get_config(user, db)
    await _check_limit(cfg, db)
    mime = _get_mime(file.filename or "document.pdf")
    try:
        async with httpx.AsyncClient(timeout=120) as client:
            r = await client.post(
                f"{UPSTAGE_BASE}/document-digitization",
                headers={"Authorization": f"Bearer {cfg.api_key_enc}"},
                files={"document": (file.filename, file_bytes, mime)},
                data={
                    # NOTE(review): the form default is non-empty, so cfg.model
                    # only applies when the client sends an empty value.
                    "model": model or cfg.model,
                    "ocr": "auto",
                    "output_formats": output_formats,
                },
            )
    except httpx.RequestError as e:
        raise HTTPException(503, f"Upstage API 연결 실패: {e}") from e

    result = None
    if r.status_code == 200:
        try:
            result = r.json()
        except ValueError:
            result = None  # non-JSON body; reported as an error below
    if not isinstance(result, dict):
        # Non-200 answer, or a 200 whose body is not a JSON object.
        result = {"error": r.text[:500], "status_code": r.status_code}

    usage = result.get("usage")
    pages = usage.get("pages", 1) if isinstance(usage, dict) else 1
    status = "SUCCESS" if "error" not in result else "FAILED"

    # Mask sensitive data in the top-level content and, when the payload also
    # carries an `elements` list with per-element content (see the Upstage
    # Document Parse reference), in each element as well - otherwise the same
    # text would reach the client unmasked.
    content_blocks = [result.get("content")]
    elements = result.get("elements")
    if isinstance(elements, list):
        content_blocks.extend(el.get("content") for el in elements if isinstance(el, dict))
    for block in content_blocks:
        if not isinstance(block, dict):
            continue
        for fmt in ("text", "markdown", "html"):
            if isinstance(block.get(fmt), str):
                block[fmt] = _mask_sensitive(block[fmt])

    hist_id = await _save_history(
        db, user.tenant_id, user.id, file.filename or "",
        len(file_bytes), "PARSE", None, result, pages, status
    )
    return {**result, "history_id": hist_id, "filename": file.filename}
@router.post("/extract")
async def extract_information(
    file: UploadFile = File(...),
    schema: str = Form(..., description='JSON 문자열: {"필드명": "설명"}'),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Extract schema-driven key/value data from an uploaded document.

    ``schema`` is a JSON object string ({"field name": "explanation"}). The
    file is posted to the information-extraction endpoint, extracted values
    are masked, one history row is written, and the Upstage payload is
    returned together with ``simplified`` ({field: value}), ``history_id`` and
    ``filename``. An Upstage failure is returned as a payload with an
    ``error`` key and recorded with status ``FAILED``.

    Raises:
        HTTPException: 413 for an oversized file, 400 for an invalid schema or
            unsupported extension, 404/429 from the configuration and quota
            checks, 503 when Upstage cannot be reached.
    """
    file_bytes = await file.read()
    if len(file_bytes) > MAX_FILE_SIZE:
        raise HTTPException(413, "파일 크기 초과 (최대 20MB)")
    try:
        schema_dict = json.loads(schema)
    except json.JSONDecodeError as e:
        raise HTTPException(400, "schema는 유효한 JSON이어야 합니다") from e
    # Valid JSON that is not an object (e.g. "[1]" or "3") is not a schema.
    if not isinstance(schema_dict, dict):
        raise HTTPException(400, "schema는 JSON 객체여야 합니다")
    cfg = await _get_config(user, db)
    await _check_limit(cfg, db)
    mime = _get_mime(file.filename or "document.pdf")
    try:
        async with httpx.AsyncClient(timeout=120) as client:
            r = await client.post(
                f"{UPSTAGE_BASE}/information-extraction",
                headers={"Authorization": f"Bearer {cfg.api_key_enc}"},
                files={"document": (file.filename, file_bytes, mime)},
                data={"schema": json.dumps(schema_dict, ensure_ascii=False)},
            )
    except httpx.RequestError as e:
        raise HTTPException(503, f"Upstage API 연결 실패: {e}") from e

    result = None
    if r.status_code == 200:
        try:
            result = r.json()
        except ValueError:
            result = None  # non-JSON body; reported as an error below
    if not isinstance(result, dict):
        # Non-200 answer, or a 200 whose body is not a JSON object.
        result = {"error": r.text[:500]}

    usage = result.get("usage")
    pages = usage.get("pages", 1) if isinstance(usage, dict) else 1
    status = "SUCCESS" if "error" not in result else "FAILED"

    # Mask sensitive data in the extracted values.
    extracted = result.get("result")
    if isinstance(extracted, dict):
        for key, field_data in list(extracted.items()):
            if isinstance(field_data, dict):
                value = field_data.get("value")
                # Leave a missing/null value alone instead of turning it into
                # the literal string "None".
                if value is not None:
                    field_data["value"] = _mask_sensitive(str(value))
            elif isinstance(field_data, str):
                # Bare string fields are returned via `simplified` too, so
                # they need the same masking.
                extracted[key] = _mask_sensitive(field_data)

    hist_id = await _save_history(
        db, user.tenant_id, user.id, file.filename or "",
        len(file_bytes), "EXTRACT", json.dumps(schema_dict, ensure_ascii=False)[:500],
        result, pages, status
    )

    # Flattened {field: value} view returned alongside the full payload.
    simplified = {}
    if isinstance(extracted, dict):
        simplified = {
            k: v.get("value", "") if isinstance(v, dict) else v
            for k, v in extracted.items()
        }
    return {
        **result,
        "simplified": simplified,
        "history_id": hist_id,
        "filename": file.filename,
    }
@router.post("/qa")
async def document_qa(
    file: UploadFile = File(...),
    question: str = Form(..., min_length=3, max_length=500),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Ask a natural-language question about an uploaded document.

    The file and question are posted to the document-qa endpoint. A string
    ``answer`` in the payload is masked, one history row (counted as one page)
    is written, and the payload is returned with ``question`` and
    ``history_id`` added. An Upstage failure is returned as a payload with an
    ``error`` key and recorded with status ``FAILED``.

    Raises:
        HTTPException: 413 for an oversized file, 400 for an unsupported
            extension, 404/429 from the configuration and quota checks,
            503 when Upstage cannot be reached.
    """
    file_bytes = await file.read()
    if len(file_bytes) > MAX_FILE_SIZE:
        raise HTTPException(413, "파일 크기 초과 (최대 20MB)")
    cfg = await _get_config(user, db)
    # Every successful QA call is recorded as one page of usage, so it must be
    # subject to the daily quota like /parse and /extract.
    await _check_limit(cfg, db)
    mime = _get_mime(file.filename or "document.pdf")
    try:
        async with httpx.AsyncClient(timeout=120) as client:
            r = await client.post(
                f"{UPSTAGE_BASE}/document-qa",
                headers={"Authorization": f"Bearer {cfg.api_key_enc}"},
                files={"document": (file.filename, file_bytes, mime)},
                data={"question": question},
            )
    except httpx.RequestError as e:
        raise HTTPException(503, f"Upstage API 연결 실패: {e}") from e

    result = None
    if r.status_code == 200:
        try:
            result = r.json()
        except ValueError:
            result = None  # non-JSON body; reported as an error below
    if not isinstance(result, dict):
        # Non-200 answer, or a 200 whose body is not a JSON object.
        result = {"error": r.text[:500]}

    # The answer may quote the document, so mask it before it is stored in the
    # history and returned to the client.
    if isinstance(result.get("answer"), str):
        result["answer"] = _mask_sensitive(result["answer"])

    hist_id = await _save_history(
        db, user.tenant_id, user.id, file.filename or "",
        len(file_bytes), "QA", question, result, 1,
        "SUCCESS" if "error" not in result else "FAILED"
    )
    return {**result, "question": question, "history_id": hist_id}
def _mask_batch_result(result: dict) -> None:
    """Mask sensitive values in-place in one Upstage payload of a batch."""
    # Parse payloads: top-level content plus, when present, per-element content.
    content_blocks = [result.get("content")]
    elements = result.get("elements")
    if isinstance(elements, list):
        content_blocks.extend(el.get("content") for el in elements if isinstance(el, dict))
    for block in content_blocks:
        if not isinstance(block, dict):
            continue
        for fmt in ("text", "markdown", "html"):
            if isinstance(block.get(fmt), str):
                block[fmt] = _mask_sensitive(block[fmt])
    # Extract payloads: {field: {"value": ...}} or {field: "text"}.
    extracted = result.get("result")
    if isinstance(extracted, dict):
        for key, field_data in list(extracted.items()):
            if isinstance(field_data, dict):
                if isinstance(field_data.get("value"), str):
                    field_data["value"] = _mask_sensitive(field_data["value"])
            elif isinstance(field_data, str):
                extracted[key] = _mask_sensitive(field_data)


@router.post("/batch")
async def batch_parse(
    files: list[UploadFile] = File(...),
    mode: str = Form("parse", description="parse | extract"),
    schema: Optional[str] = Form(None),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Process up to 10 uploaded files in one request.

    Each file is sent to the information-extraction endpoint when ``mode`` is
    ``"extract"`` and a ``schema`` is given, and to the document-digitization
    endpoint (text output only) otherwise. A failure of one file is reported
    in that file's entry and does not stop the batch. Like the single-file
    endpoints, the daily quota is checked up front, results are masked, and
    one history row is written per processed file.

    Returns:
        ``{"batch_count": n, "results": [...]}`` where each entry holds
        ``filename`` and either ``error`` or ``result`` plus ``history_id``.

    Raises:
        HTTPException: 400 for more than 10 files, 404/429 from the
            configuration and quota checks.
    """
    if len(files) > 10:
        raise HTTPException(400, "배치 최대 10개 파일")
    cfg = await _get_config(user, db)
    # Batch calls consume pages too; without this check the daily limit could
    # be bypassed through this endpoint.
    await _check_limit(cfg, db)

    extract_mode = mode == "extract" and bool(schema)
    if extract_mode:
        endpoint, form_data = "information-extraction", {"schema": schema}
    else:
        endpoint = "document-digitization"
        form_data = {"model": cfg.model, "ocr": "auto", "output_formats": '["text"]'}

    results = []
    # One client for the whole batch instead of one per file.
    async with httpx.AsyncClient(timeout=120) as client:
        for file in files:
            try:
                file_bytes = await file.read()
                if len(file_bytes) > MAX_FILE_SIZE:
                    results.append({"filename": file.filename, "error": "파일 크기 초과"})
                    continue
                mime = _get_mime(file.filename or "doc")
                r = await client.post(
                    f"{UPSTAGE_BASE}/{endpoint}",
                    headers={"Authorization": f"Bearer {cfg.api_key_enc}"},
                    files={"document": (file.filename, file_bytes, mime)},
                    data=form_data,
                )
                result = r.json() if r.status_code == 200 else {"error": r.text[:200]}
            except Exception as e:
                # Deliberately broad: one bad file must not abort the batch.
                results.append({"filename": file.filename, "error": str(e)[:100]})
                continue

            if not isinstance(result, dict):
                # A 200 whose body is not a JSON object.
                result = {"error": r.text[:200]}
            _mask_batch_result(result)

            # Record usage so /history and /usage reflect batch work. This
            # sits outside the per-file try so that a database failure
            # surfaces instead of being reported as a file error.
            usage = result.get("usage")
            pages = usage.get("pages", 1) if isinstance(usage, dict) else 1
            status = "SUCCESS" if "error" not in result else "FAILED"
            hist_id = await _save_history(
                db, user.tenant_id, user.id, file.filename or "",
                len(file_bytes), "EXTRACT" if extract_mode else "PARSE",
                schema[:500] if extract_mode else None,
                result, pages, status
            )
            results.append({"filename": file.filename, "result": result, "history_id": hist_id})
    return {"batch_count": len(files), "results": results}
@router.get("/history")
async def get_ocr_history(
    limit: int = 50,
    ocr_type: Optional[str] = None,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """List the tenant's OCR history, newest first.

    Args:
        limit: Maximum number of rows; clamped to the range 1..500.
        ocr_type: Optional filter on the OCR type; compared upper-cased.

    Returns:
        A list of dicts with id, filename, type, pages, status, linked_to,
        linked_id and created_at.
    """
    # Keep a negative or oversized client value away from the database.
    limit = max(1, min(limit, 500))
    q = select(OCRHistory).where(OCRHistory.tenant_id == user.tenant_id)
    if ocr_type:
        q = q.where(OCRHistory.ocr_type == ocr_type.upper())
    q = q.order_by(desc(OCRHistory.created_at)).limit(limit)
    rows = await db.execute(q)
    return [
        {
            "id": h.id, "filename": h.filename,
            "type": h.ocr_type, "pages": h.pages,
            "status": h.status, "linked_to": h.linked_to,
            "linked_id": h.linked_id,
            "created_at": h.created_at,
        }
        for h in rows.scalars().all()
    ]
@router.get("/usage")
async def get_usage(
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the tenant's page usage for today and this month.

    Day and month boundaries are UTC, matching the ``datetime.utcnow()``
    timestamps written on history rows. Page totals count ``SUCCESS`` rows
    only; ``total_documents`` counts every history row. A daily limit of 1000
    is reported when the tenant has no configuration row.
    """
    cfg_row = await db.execute(
        select(UpstageOCRConfig).where(UpstageOCRConfig.tenant_id == user.tenant_id)
    )
    cfg = cfg_row.scalar_one_or_none()
    daily_limit = cfg.daily_limit if cfg else 1000

    # Use UTC boundaries: a local-time midnight (date.today()) would be
    # shifted against the stored UTC timestamps by the server's UTC offset.
    # Zeroing microseconds makes the boundaries exact.
    today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    month_start = today_start.replace(day=1)

    today_pages = (await db.execute(
        select(func.sum(OCRHistory.pages)).where(
            OCRHistory.tenant_id == user.tenant_id,
            OCRHistory.created_at >= today_start,
            OCRHistory.status == "SUCCESS",
        )
    )).scalar() or 0
    total_docs = (await db.execute(
        select(func.count(OCRHistory.id)).where(OCRHistory.tenant_id == user.tenant_id)
    )).scalar() or 0
    month_pages = (await db.execute(
        select(func.sum(OCRHistory.pages)).where(
            OCRHistory.tenant_id == user.tenant_id,
            OCRHistory.created_at >= month_start,
            # Same filter as today_pages: failed calls are stored with a
            # placeholder page count and must not inflate the monthly total.
            OCRHistory.status == "SUCCESS",
        )
    )).scalar() or 0
    return {
        "today_pages": today_pages,
        "daily_limit": daily_limit,
        "remaining_today": max(0, daily_limit - today_pages),
        "month_pages": month_pages,
        "total_documents": total_docs,
        "model": cfg.model if cfg else None,
    }