473 lines
17 KiB
Python
473 lines
17 KiB
Python
"""
|
|
Upstage Document AI OCR engine.

Integrates the Upstage APIs (Document Parse, Information Extraction,
Document QA) to convert PDF and image documents into structured data.

Endpoints:
    POST /api/ocr/config  - store the API key (AES-256-GCM encryption is
                            planned; it is still a TODO in save_ocr_config)
    GET  /api/ocr/config  - read the configuration (key masked)
    POST /api/ocr/parse   - document parsing -> structured JSON
    POST /api/ocr/extract - information extraction -> key-value (schema based)
    POST /api/ocr/qa      - document QA -> natural-language answer
    POST /api/ocr/batch   - multi-file batch processing
    GET  /api/ocr/history - OCR processing history
    GET  /api/ocr/usage   - API usage summary
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from datetime import datetime, date
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel, Field
|
|
from sqlalchemy import select, func, desc
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from core.auth import get_current_user, require_admin_role
|
|
from database import get_db
|
|
from models import User, UpstageOCRConfig, OCRHistory
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Every route declared below is mounted under /api/ocr and tagged "Upstage OCR".
router = APIRouter(prefix="/api/ocr", tags=["Upstage OCR"])
|
|
|
|
# Base URL that the document-ai endpoint paths are appended to.
UPSTAGE_BASE = "https://api.upstage.ai/v1/document-ai"

# Upper bound for a single uploaded file, in bytes (20 MB).
MAX_FILE_SIZE = 20 * 1024 * 1024

# Accepted file extensions (lower-case, with the leading dot) and the MIME
# type sent for each. Key order is significant: it is the order in which the
# extensions are listed in the "unsupported file type" error message.
SUPPORTED_MIME = {
    ".pdf": "application/pdf",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".tiff": "image/tiff",
    ".tif": "image/tiff",
    ".bmp": "image/bmp",
    ".heic": "image/heic",
    ".webp": "image/webp",
}

# (regex, replacement) pairs used to mask sensitive data in OCR output.
SENSITIVE_PATTERNS = [
    # Resident registration number: 6 digits, hyphen, 7 digits starting 1-4.
    (r'\d{6}-[1-4]\d{6}', '######-#######'),
    # Card number: four groups of 4 digits separated by hyphen or whitespace.
    (r'(?<!\d)\d{4}[-\s]\d{4}[-\s]\d{4}[-\s]\d{4}', '****-****-****-****'),
    # Phone number in 3-4-4 digit form.
    (r'(?<!\w)\d{3}-\d{4}-\d{4}(?!\w)', '***-****-****'),
]
|
|
|
|
|
|
class OCRConfigCreate(BaseModel):
    """Request body for storing a tenant's Upstage OCR configuration."""

    # Upstage API key; required, at least 10 characters long.
    api_key: str = Field(..., min_length=10)
    # Model identifier; defaults to "document-parse".
    model: str = Field(
        "document-parse",
        description="document-parse | document-parse-ocr",
    )
    # Daily page quota; must be at least 1, defaults to 1000.
    daily_limit: int = Field(1000, ge=1, description="일일 페이지 한도")
|
|
|
|
|
|
class ExtractRequest(BaseModel):
    """Extraction request carrying the schema that drives the extraction."""

    # Required mapping describing the fields to extract.
    # NOTE(review): the name `schema` may shadow an attribute that pydantic's
    # BaseModel already defines — confirm against the installed pydantic
    # version before relying on this model.
    schema: dict = Field(..., description="추출 스키마 {필드명: 설명}")
|
|
|
|
|
|
class QARequest(BaseModel):
    """Document-QA request carrying the question to ask."""

    # Required question text, between 3 and 500 characters.
    question: str = Field(..., min_length=3, max_length=500)
|
|
|
|
|
|
def _get_mime(filename: str) -> str:
    """Return the MIME type registered for the extension of ``filename``.

    The suffix is lower-cased before it is looked up in ``SUPPORTED_MIME``.

    Raises:
        HTTPException: 400 when the extension has no (non-empty) entry in
            ``SUPPORTED_MIME``; the message lists the supported extensions.
    """
    suffix = Path(filename).suffix.lower()
    mime_type = SUPPORTED_MIME.get(suffix)
    if mime_type:
        return mime_type

    supported = ', '.join(SUPPORTED_MIME.keys())
    raise HTTPException(400, f"지원하지 않는 파일 형식: {suffix}. 지원: {supported}")
|
|
|
|
|
|
def _mask_sensitive(text: str) -> str:
    """Mask sensitive data in ``text``.

    Every (regex, replacement) pair in ``SENSITIVE_PATTERNS`` is applied in
    list order, each one to the output of the previous substitution.
    """
    masked = text
    for regex, placeholder in SENSITIVE_PATTERNS:
        masked = re.compile(regex).sub(placeholder, masked)
    return masked
|
|
|
|
|
|
async def _get_config(user: User, db: AsyncSession) -> UpstageOCRConfig:
    """Load the active Upstage configuration of the user's tenant.

    Args:
        user: Caller whose ``tenant_id`` selects the configuration.
        db: Session used to run the lookup.

    Returns:
        The row matching the tenant with ``is_active`` set.

    Raises:
        HTTPException: 404 when no such row exists.
    """
    active_for_tenant = select(UpstageOCRConfig).where(
        UpstageOCRConfig.tenant_id == user.tenant_id,
        # SQL expression, not a Python comparison — keep the `== True` form.
        UpstageOCRConfig.is_active == True,  # noqa: E712
    )
    found = (await db.execute(active_for_tenant)).scalar_one_or_none()
    if found:
        return found
    raise HTTPException(404, "Upstage API Key 설정 필요. POST /api/ocr/config 에서 설정하세요.")
|
|
|
|
|
|
async def _check_limit(cfg: UpstageOCRConfig, db: AsyncSession) -> None:
    """Reject the request when the tenant has used up today's page quota.

    Sums ``OCRHistory.pages`` over the tenant's SUCCESS rows created since the
    start of the current UTC day and compares the total with
    ``cfg.daily_limit``.

    Args:
        cfg: Tenant configuration providing ``tenant_id`` and ``daily_limit``.
        db: Session used to run the aggregate query.

    Raises:
        HTTPException: 429 when the pages used today reach the limit.
    """
    # History rows are stamped with naive UTC (datetime.utcnow() in
    # _save_history), so the day boundary has to be UTC midnight as well.
    # A local-time date.today() shifts the window on any non-UTC server.
    today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    used_row = await db.execute(
        select(func.sum(OCRHistory.pages)).where(
            OCRHistory.tenant_id == cfg.tenant_id,
            OCRHistory.created_at >= today_start,
            OCRHistory.status == "SUCCESS",
        )
    )
    # SUM over zero rows yields NULL/None; treat that as no usage.
    used = used_row.scalar() or 0
    if used >= cfg.daily_limit:
        raise HTTPException(429, f"일일 페이지 한도 초과: {used}/{cfg.daily_limit}. 내일 다시 시도하세요.")
|
|
|
|
|
|
async def _save_history(
    db: AsyncSession, tenant_id: int, user_id: int, filename: str,
    file_size: int, ocr_type: str, schema_used: Optional[str],
    result: dict, pages: int, status: str = "SUCCESS",
) -> int:
    """Insert one OCRHistory row, commit it and return its id.

    Only the "content", "result", "answer", "usage" and "error" keys of
    ``result`` are stored. They are serialised to JSON and cut to the first
    5000 characters, so a long payload may be stored as truncated JSON text.
    ``tokens_used`` comes from ``result["usage"]["tokens"]`` when ``usage`` is
    a dict, otherwise it is 0. ``created_at`` is set to the current naive UTC
    time.
    """
    persisted_keys = ("content", "result", "answer", "usage", "error")
    trimmed = {}
    for key, value in result.items():
        if key in persisted_keys:
            trimmed[key] = value
    serialized = json.dumps(trimmed, ensure_ascii=False)[:5000]

    usage = result.get("usage")
    tokens = usage.get("tokens", 0) if isinstance(usage, dict) else 0

    entry = OCRHistory(
        tenant_id=tenant_id,
        filename=filename,
        file_size=file_size,
        ocr_type=ocr_type,
        schema_used=schema_used,
        result_json=serialized,
        pages=pages,
        tokens_used=tokens,
        status=status,
        created_by=user_id,
        created_at=datetime.utcnow(),
    )
    db.add(entry)
    await db.commit()
    # Reload the row after the commit before reading its id.
    await db.refresh(entry)
    return entry.id
|
|
|
|
|
|
# ── Endpoints ────────────────────────────────────────────────────────────────
|
|
|
|
@router.post("/config")
async def save_ocr_config(
    req: OCRConfigCreate,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(require_admin_role),
):
    """Create or update the Upstage configuration of the caller's tenant.

    The key is first probed against the Upstage models endpoint; a 401 answer
    rejects it. A network failure during the probe does not block saving, but
    it is logged instead of being silently ignored.

    NOTE: the key is currently written to ``api_key_enc`` unchanged.
    AES-256-GCM encryption is still a TODO.

    NOTE(review): updating an existing row does not touch ``is_active``, so a
    deactivated configuration stays inactive after this call — confirm that is
    intended.

    Returns:
        ``{"ok": True, "model": ..., "daily_limit": ...}`` echoing the request.

    Raises:
        HTTPException: 400 when Upstage answers the probe with 401.
    """
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            probe = await client.get(
                "https://api.upstage.ai/v1/models",
                headers={"Authorization": f"Bearer {req.api_key}"},
            )
    except httpx.RequestError as exc:
        # Best effort: an unreachable Upstage must not prevent saving the key.
        # Only the exception type is logged so no request detail leaks.
        logger.warning(
            "Upstage API key probe skipped for tenant %s: %s",
            user.tenant_id, type(exc).__name__,
        )
    else:
        if probe.status_code == 401:
            raise HTTPException(400, "유효하지 않은 Upstage API Key")

    row = await db.execute(
        select(UpstageOCRConfig).where(UpstageOCRConfig.tenant_id == user.tenant_id)
    )
    cfg = row.scalar_one_or_none()
    if cfg:
        cfg.api_key_enc = req.api_key  # TODO: AES-256-GCM encryption
        cfg.model = req.model
        cfg.daily_limit = req.daily_limit
    else:
        cfg = UpstageOCRConfig(
            tenant_id=user.tenant_id,
            api_key_enc=req.api_key,  # TODO: AES-256-GCM encryption
            model=req.model,
            daily_limit=req.daily_limit,
            is_active=True,
            created_at=datetime.utcnow(),
        )
        db.add(cfg)
    await db.commit()
    return {"ok": True, "model": req.model, "daily_limit": req.daily_limit}
|
|
|
|
|
|
@router.get("/config")
async def get_ocr_config(
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the tenant's OCR configuration with the API key masked.

    Returns ``{"configured": False}`` when the tenant has no row. Keys longer
    than 10 characters keep their first 6 and last 4 characters, with the
    middle replaced by asterisks; shorter keys are shown as "***".
    """
    lookup = select(UpstageOCRConfig).where(
        UpstageOCRConfig.tenant_id == user.tenant_id
    )
    cfg = (await db.execute(lookup)).scalar_one_or_none()
    if not cfg:
        return {"configured": False}

    raw_key = cfg.api_key_enc or ""
    if len(raw_key) > 10:
        hidden = '*' * (len(raw_key) - 10)
        masked_key = raw_key[:6] + hidden + raw_key[-4:]
    else:
        masked_key = "***"

    return {
        "configured": True,
        "api_key": masked_key,
        "model": cfg.model,
        "daily_limit": cfg.daily_limit,
        "is_active": cfg.is_active,
    }
|
|
|
|
|
|
@router.post("/parse")
async def parse_document(
    file: UploadFile = File(...),
    model: str = Form("document-parse"),
    output_formats: str = Form('["text", "html", "markdown"]'),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Parse an uploaded document through Upstage document digitization.

    Steps: size check, configuration lookup, daily-limit check, MIME lookup,
    upstream call, masking of the "text"/"markdown"/"html" entries of
    ``content``, and history write. A non-200 upstream answer is not raised;
    it is returned as ``{"error": ..., "status_code": ...}`` and recorded with
    status FAILED.

    NOTE(review): ``model`` defaults to "document-parse", so ``cfg.model`` is
    only used when the client sends an empty value — confirm this is intended.

    Raises:
        HTTPException: 413 for files over MAX_FILE_SIZE, 503 when the Upstage
            request fails at the transport level, plus whatever _get_config,
            _check_limit and _get_mime raise.
    """
    payload = await file.read()
    size = len(payload)
    if size > MAX_FILE_SIZE:
        raise HTTPException(413, f"파일 크기 초과: {size//1024//1024}MB (최대 20MB)")

    cfg = await _get_config(user, db)
    await _check_limit(cfg, db)
    mime = _get_mime(file.filename or "document.pdf")

    form_fields = {
        "model": model or cfg.model,
        "ocr": "auto",
        "output_formats": output_formats,
    }
    try:
        async with httpx.AsyncClient(timeout=120) as client:
            r = await client.post(
                f"{UPSTAGE_BASE}/document-digitization",
                headers={"Authorization": f"Bearer {cfg.api_key_enc}"},
                files={"document": (file.filename, payload, mime)},
                data=form_fields,
            )
            if r.status_code == 200:
                result = r.json()
            else:
                result = {"error": r.text[:500], "status_code": r.status_code}
    except httpx.RequestError as e:
        raise HTTPException(503, f"Upstage API 연결 실패: {e}")

    # Page count reported by Upstage; 1 when no usage block is present.
    usage = result.get("usage")
    pages = usage.get("pages", 1) if isinstance(usage, dict) else 1
    status = "FAILED" if "error" in result else "SUCCESS"

    # Mask sensitive data in each rendered format that is present.
    content = result.get("content")
    if isinstance(content, dict):
        for fmt in ("text", "markdown", "html"):
            if fmt in content:
                content[fmt] = _mask_sensitive(str(content[fmt]))

    hist_id = await _save_history(
        db, user.tenant_id, user.id, file.filename or "",
        size, "PARSE", None, result, pages, status,
    )

    return {**result, "history_id": hist_id, "filename": file.filename}
|
|
|
|
|
|
@router.post("/extract")
async def extract_information(
    file: UploadFile = File(...),
    schema: str = Form(..., description='JSON 문자열: {"필드명": "설명"}'),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Extract key-value information from a document, driven by a schema.

    ``schema`` is a JSON string that must decode to an object. A non-200
    upstream answer is returned as ``{"error": ...}`` and recorded with status
    FAILED. The response additionally carries ``simplified``, a flat
    ``{field: value}`` view of the extraction result.

    Raises:
        HTTPException: 413 for files over MAX_FILE_SIZE, 400 when ``schema``
            is not valid JSON or not a JSON object, 503 when the Upstage
            request fails at the transport level, plus whatever _get_config,
            _check_limit and _get_mime raise.
    """
    file_bytes = await file.read()
    if len(file_bytes) > MAX_FILE_SIZE:
        raise HTTPException(413, "파일 크기 초과 (최대 20MB)")

    try:
        schema_dict = json.loads(schema)
    except json.JSONDecodeError:
        raise HTTPException(400, "schema는 유효한 JSON이어야 합니다") from None
    # Valid JSON is not enough: a list or a bare string is not a schema.
    if not isinstance(schema_dict, dict):
        raise HTTPException(400, "schema는 JSON 객체여야 합니다")
    schema_json = json.dumps(schema_dict, ensure_ascii=False)

    cfg = await _get_config(user, db)
    await _check_limit(cfg, db)
    mime = _get_mime(file.filename or "document.pdf")

    try:
        async with httpx.AsyncClient(timeout=120) as client:
            r = await client.post(
                f"{UPSTAGE_BASE}/information-extraction",
                headers={"Authorization": f"Bearer {cfg.api_key_enc}"},
                files={"document": (file.filename, file_bytes, mime)},
                data={"schema": schema_json},
            )
            result = r.json() if r.status_code == 200 else {"error": r.text[:500]}
    except httpx.RequestError as e:
        raise HTTPException(503, f"Upstage API 연결 실패: {e}") from e

    # Page count reported by Upstage; 1 when no usage block is present.
    usage = result.get("usage")
    pages = usage.get("pages", 1) if isinstance(usage, dict) else 1
    status = "SUCCESS" if "error" not in result else "FAILED"

    # Mask sensitive data in the extracted values, then build the flat view.
    extracted = result.get("result")
    simplified = {}
    if isinstance(extracted, dict):
        for field_data in extracted.values():
            if isinstance(field_data, dict) and "value" in field_data:
                field_data["value"] = _mask_sensitive(str(field_data["value"]))
        simplified = {
            k: v.get("value", "") if isinstance(v, dict) else v
            for k, v in extracted.items()
        }

    hist_id = await _save_history(
        db, user.tenant_id, user.id, file.filename or "",
        len(file_bytes), "EXTRACT", schema_json[:500],
        result, pages, status,
    )

    return {
        **result,
        "simplified": simplified,
        "history_id": hist_id,
        "filename": file.filename,
    }
|
|
|
|
|
|
@router.post("/qa")
async def document_qa(
    file: UploadFile = File(...),
    question: str = Form(..., min_length=3, max_length=500),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Ask a natural-language question about an uploaded document.

    The daily page limit is enforced as in the other OCR endpoints, because
    each QA call is recorded as one page. A string ``answer`` in the upstream
    response is masked before it is stored or returned. A non-200 upstream
    answer is returned as ``{"error": ...}`` and recorded with status FAILED.

    Raises:
        HTTPException: 413 for files over MAX_FILE_SIZE, 503 when the Upstage
            request fails at the transport level, plus whatever _get_config,
            _check_limit and _get_mime raise.
    """
    file_bytes = await file.read()
    if len(file_bytes) > MAX_FILE_SIZE:
        raise HTTPException(413, "파일 크기 초과 (최대 20MB)")

    cfg = await _get_config(user, db)
    # Every QA call is saved as one SUCCESS page, so it must respect the quota.
    await _check_limit(cfg, db)
    mime = _get_mime(file.filename or "document.pdf")

    try:
        async with httpx.AsyncClient(timeout=120) as client:
            r = await client.post(
                f"{UPSTAGE_BASE}/document-qa",
                headers={"Authorization": f"Bearer {cfg.api_key_enc}"},
                files={"document": (file.filename, file_bytes, mime)},
                data={"question": question},
            )
            result = r.json() if r.status_code == 200 else {"error": r.text[:500]}
    except httpx.RequestError as e:
        raise HTTPException(503, f"Upstage API 연결 실패: {e}") from e

    # Mask sensitive data in the answer, as is done for parse/extract output.
    # Assumes the answer text is under the "answer" key (the key that
    # _save_history persists) — TODO confirm against the Upstage response.
    answer = result.get("answer")
    if isinstance(answer, str):
        result["answer"] = _mask_sensitive(answer)

    hist_id = await _save_history(
        db, user.tenant_id, user.id, file.filename or "",
        len(file_bytes), "QA", question, result, 1,
        "SUCCESS" if "error" not in result else "FAILED",
    )

    return {**result, "question": question, "history_id": hist_id}
|
|
|
|
|
|
def _mask_batch_result(result: dict) -> None:
    """Mask sensitive data in a parse or extract response, in place."""
    content = result.get("content")
    if isinstance(content, dict):
        for fmt in ("text", "markdown", "html"):
            if fmt in content:
                content[fmt] = _mask_sensitive(str(content[fmt]))
    extracted = result.get("result")
    if isinstance(extracted, dict):
        for field_data in extracted.values():
            if isinstance(field_data, dict) and "value" in field_data:
                field_data["value"] = _mask_sensitive(str(field_data["value"]))


@router.post("/batch")
async def batch_parse(
    files: list[UploadFile] = File(...),
    mode: str = Form("parse", description="parse | extract"),
    schema: Optional[str] = Form(None),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Process up to 10 files in one request.

    With ``mode == "extract"`` and a non-empty ``schema`` each file goes to
    information extraction; any other combination falls back to document
    parsing with text output. A failure on one file is reported in that
    file's entry and does not abort the batch.

    The batch follows the same rules as the single-file endpoints: the daily
    limit is checked once before processing starts, sensitive data is masked,
    and every upstream call is written to the history so it counts as usage.

    Returns:
        ``{"batch_count": <number of files>, "results": [...]}`` where each
        entry holds ``filename`` plus either ``result`` or ``error``.

    Raises:
        HTTPException: 400 for more than 10 files or an unparsable ``schema``
            in extract mode, plus whatever _get_config and _check_limit raise.
    """
    if len(files) > 10:
        raise HTTPException(400, "배치 최대 10개 파일")

    use_extract = mode == "extract" and bool(schema)
    if use_extract:
        # Fail once up front instead of once per file at the upstream API.
        try:
            json.loads(schema)
        except json.JSONDecodeError:
            raise HTTPException(400, "schema는 유효한 JSON이어야 합니다") from None

    cfg = await _get_config(user, db)
    await _check_limit(cfg, db)

    # Everything that is identical for every file is prepared once.
    headers = {"Authorization": f"Bearer {cfg.api_key_enc}"}
    if use_extract:
        endpoint = f"{UPSTAGE_BASE}/information-extraction"
        form = {"schema": schema}
        ocr_type, schema_used = "EXTRACT", schema[:500]
    else:
        endpoint = f"{UPSTAGE_BASE}/document-digitization"
        form = {"model": cfg.model, "ocr": "auto", "output_formats": '["text"]'}
        ocr_type, schema_used = "PARSE", None

    results = []
    async with httpx.AsyncClient(timeout=120) as client:
        for file in files:
            try:
                file_bytes = await file.read()
                if len(file_bytes) > MAX_FILE_SIZE:
                    results.append({"filename": file.filename, "error": "파일 크기 초과"})
                    continue

                mime = _get_mime(file.filename or "doc")
                r = await client.post(
                    endpoint,
                    headers=headers,
                    files={"document": (file.filename, file_bytes, mime)},
                    data=form,
                )
                result = r.json() if r.status_code == 200 else {"error": r.text[:200]}

                # Masking and bookkeeping only understand dict payloads.
                record = result if isinstance(result, dict) else {}
                _mask_batch_result(record)
                usage = record.get("usage")
                pages = usage.get("pages", 1) if isinstance(usage, dict) else 1
                await _save_history(
                    db, user.tenant_id, user.id, file.filename or "",
                    len(file_bytes), ocr_type, schema_used, record, pages,
                    "SUCCESS" if "error" not in record else "FAILED",
                )
                results.append({"filename": file.filename, "result": result})
            except Exception as e:
                # Per-file boundary: report the failure and keep going, but
                # leave a trace in the log instead of swallowing it.
                logger.exception("Batch OCR failed for file %r", file.filename)
                results.append({"filename": file.filename, "error": str(e)[:100]})

    return {"batch_count": len(files), "results": results}
|
|
|
|
|
|
@router.get("/history")
async def get_ocr_history(
    limit: int = 50,
    ocr_type: Optional[str] = None,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """List the tenant's OCR history, newest first.

    Args:
        limit: Maximum number of rows to return (default 50); must not be
            negative.
        ocr_type: Optional filter; upper-cased before it is compared with
            ``OCRHistory.ocr_type``.

    Returns:
        A list of dicts with id, filename, type, pages, status, linked_to,
        linked_id and created_at.

    Raises:
        HTTPException: 400 when ``limit`` is negative.
    """
    # A negative value would reach SQL LIMIT unchanged, which depending on the
    # backend is either a database error or an unbounded result set.
    if limit < 0:
        raise HTTPException(400, "limit은 0 이상이어야 합니다")

    q = select(OCRHistory).where(OCRHistory.tenant_id == user.tenant_id)
    if ocr_type:
        q = q.where(OCRHistory.ocr_type == ocr_type.upper())
    q = q.order_by(desc(OCRHistory.created_at)).limit(limit)

    rows = await db.execute(q)
    return [
        {
            "id": h.id,
            "filename": h.filename,
            "type": h.ocr_type,
            "pages": h.pages,
            "status": h.status,
            "linked_to": h.linked_to,
            "linked_id": h.linked_id,
            "created_at": h.created_at,
        }
        for h in rows.scalars().all()
    ]
|
|
|
|
|
|
@router.get("/usage")
async def get_usage(
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Summarise the tenant's OCR usage.

    Both time windows start at UTC boundaries because history rows are stamped
    with naive UTC (``datetime.utcnow()``). Page totals count SUCCESS rows
    only, since failed calls are recorded with a placeholder page count.
    ``daily_limit`` falls back to 1000 when the tenant has no configuration.

    Returns:
        A dict with today_pages, daily_limit, remaining_today, month_pages,
        total_documents and model.
    """
    cfg_row = await db.execute(
        select(UpstageOCRConfig).where(UpstageOCRConfig.tenant_id == user.tenant_id)
    )
    cfg = cfg_row.scalar_one_or_none()
    daily_limit = cfg.daily_limit if cfg else 1000

    # One clock reading for both windows; microseconds are cleared so rows
    # created in the first instant of the day or month are not dropped.
    today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    month_start = today_start.replace(day=1)

    today_pages = (await db.execute(
        select(func.sum(OCRHistory.pages)).where(
            OCRHistory.tenant_id == user.tenant_id,
            OCRHistory.created_at >= today_start,
            OCRHistory.status == "SUCCESS",
        )
    )).scalar() or 0

    total_docs = (await db.execute(
        select(func.count(OCRHistory.id)).where(OCRHistory.tenant_id == user.tenant_id)
    )).scalar() or 0

    month_pages = (await db.execute(
        select(func.sum(OCRHistory.pages)).where(
            OCRHistory.tenant_id == user.tenant_id,
            OCRHistory.created_at >= month_start,
            # Same filter as the daily total so the two figures are comparable.
            OCRHistory.status == "SUCCESS",
        )
    )).scalar() or 0

    return {
        "today_pages": today_pages,
        "daily_limit": daily_limit,
        "remaining_today": max(0, daily_limit - today_pages),
        "month_pages": month_pages,
        "total_documents": total_docs,
        "model": cfg.model if cfg else None,
    }
|