""" 스크랩핑 봇 라우터 - ScrapingTarget CRUD (스크랩 대상 등록) - 즉시/스케줄 스크랩 실행 - 결과 관리: DRAFT → PUBLISHED(메신저 알림) / DELETED / 원복 """ from __future__ import annotations import asyncio from datetime import datetime from typing import Any, Dict, List, Optional from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query from pydantic import BaseModel from sqlalchemy import select, func, desc from sqlalchemy.ext.asyncio import AsyncSession from core.auth import get_current_user from core.scraping_engine import scrape as _scrape from database import get_db, SessionLocal from models import ( ScrapingTarget, ScrapingResult, ScrapingTargetOut, ScrapingTargetCreate, ScrapingResultOut, User, ) router = APIRouter(prefix="/api/scraping", tags=["scraping"]) # ── ScrapingTarget CRUD ─────────────────────────────────────────────────────── @router.post("/targets", response_model=ScrapingTargetOut) async def create_target( body: ScrapingTargetCreate, db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): """스크랩 대상 URL 등록.""" target = ScrapingTarget( name=body.name, url=body.url, selector=body.selector, schedule=body.schedule, is_active=body.is_active, note=body.note, created_by=current_user.username, ) db.add(target) await db.commit() await db.refresh(target) if body.schedule and body.is_active: _register_scrape_cron(target.id, target.url, body.schedule, body.selector, current_user.username) return target @router.get("/targets", response_model=List[ScrapingTargetOut]) async def list_targets( is_active: Optional[bool] = Query(None), db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): q = select(ScrapingTarget).order_by(desc(ScrapingTarget.created_at)) if is_active is not None: q = q.where(ScrapingTarget.is_active == is_active) result = await db.execute(q) return result.scalars().all() @router.get("/targets/{target_id}", response_model=ScrapingTargetOut) async def get_target( target_id: int, db: AsyncSession = 
Depends(get_db), current_user: User = Depends(get_current_user), ): t = await db.get(ScrapingTarget, target_id) if not t: raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.") return t @router.put("/targets/{target_id}", response_model=ScrapingTargetOut) async def update_target( target_id: int, body: ScrapingTargetCreate, db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): t = await db.get(ScrapingTarget, target_id) if not t: raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.") for k, v in body.model_dump().items(): setattr(t, k, v) await db.commit() await db.refresh(t) return t @router.delete("/targets/{target_id}") async def delete_target( target_id: int, db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): t = await db.get(ScrapingTarget, target_id) if not t: raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.") await db.delete(t) await db.commit() return {"deleted": target_id} # ── 스크랩 실행 ────────────────────────────────────────────────────────────── class RunRequest(BaseModel): url: str selector: Optional[str] = None target_id: Optional[int] = None messenger_room: str = "ops" @router.post("/run", response_model=ScrapingResultOut) async def run_scrape( body: RunRequest, bg: BackgroundTasks, db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): """URL 즉시 스크랩 → DRAFT 저장.""" eng_result = await _scrape(body.url, body.selector) rec = ScrapingResult( target_id=body.target_id, title=eng_result.title or body.url, content=eng_result.content, plain_text=eng_result.plain_text, url=body.url, source_html=eng_result.source_html, status="FAILED" if eng_result.error else "DRAFT", meta=eng_result.meta, error_msg=eng_result.error, messenger_room=body.messenger_room, scraped_by=current_user.username, ) db.add(rec) if body.target_id: t = await db.get(ScrapingTarget, body.target_id) if t: t.last_scraped = datetime.now() await db.commit() await db.refresh(rec) return rec # ── 결과 조회 
@router.get("/results", response_model=List[ScrapingResultOut])
async def list_results(
    status: Optional[str] = Query(None),
    target_id: Optional[int] = Query(None),
    page: int = Query(1, ge=1),
    size: int = Query(20, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return one page of scrape results, most recently scraped first.

    ``status`` and ``target_id`` are optional filters; ``page`` is 1-based and
    ``size`` is the page length.
    """
    q = select(ScrapingResult).order_by(desc(ScrapingResult.scraped_at))
    if status:
        q = q.where(ScrapingResult.status == status)
    # Compare against None so that an explicit id of 0 is still a filter.
    if target_id is not None:
        q = q.where(ScrapingResult.target_id == target_id)
    q = q.offset((page - 1) * size).limit(size)
    return (await db.execute(q)).scalars().all()


@router.get("/results/{result_id}", response_model=ScrapingResultOut)
async def get_result(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return a single scrape result; responds 404 when it does not exist."""
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    return r


@router.get("/results/{result_id}/html")
async def get_result_html(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return the stored source HTML of a result (for restore preview)."""
    r = await db.get(ScrapingResult, result_id)
    if not r:
        # Same detail message as the other result endpoints.
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    return {"id": r.id, "url": r.url, "source_html": r.source_html or ""}


# ── State transition: publish ─────────────────────────────────────────────────

class PublishRequest(BaseModel):
    """Request body for publishing a result."""

    room: str = "ops"
    custom_message: Optional[str] = None


@router.post("/results/{result_id}/publish")
async def publish_result(
    result_id: int,
    body: PublishRequest,
    bg: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Move a result DRAFT -> PUBLISHED and queue a messenger notification.

    Responds 404 when the result does not exist and 400 when it is not in
    DRAFT state (already published, failed, deleted, ...).
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    if r.status == "PUBLISHED":
        raise HTTPException(400, "이미 게시된 결과입니다.")
    if r.status == "FAILED":
        raise HTTPException(400, "실패한 스크랩은 게시할 수 없습니다.")
    if r.status != "DRAFT":
        # A DELETED result must be restored first; publishing it directly
        # would leave deleted_at set on a published record.
        raise HTTPException(400, "DRAFT 상태의 결과만 게시할 수 있습니다.")
    r.status = "PUBLISHED"
    r.published_at = datetime.now()
    r.published_by = current_user.username
    r.messenger_room = body.room
    await db.commit()
    await db.refresh(r)
    # The notification is sent after the response, as a background task.
    bg.add_task(
        _notify_publish,
        r.id,
        r.title,
        r.url,
        r.plain_text,
        body.room,
        body.custom_message,
        current_user.username,
    )
    return {"id": r.id, "status": "PUBLISHED", "published_at": r.published_at.isoformat()}


# ── State transition: delete ──────────────────────────────────────────────────

@router.delete("/results/{result_id}")
async def delete_result(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Soft-delete a result: set status DELETED and keep the row.

    Responds 404 when the result does not exist and 400 when it is already
    deleted.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    if r.status == "DELETED":
        raise HTTPException(400, "이미 삭제된 결과입니다.")
    r.status = "DELETED"
    r.deleted_at = datetime.now()
    await db.commit()
    return {"id": r.id, "status": "DELETED"}


# ── State transition: restore ─────────────────────────────────────────────────

@router.post("/results/{result_id}/restore")
async def restore_result(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Restore a soft-deleted result: DELETED -> DRAFT.

    Responds 404 when the result does not exist and 400 when it is not
    currently deleted.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    if r.status != "DELETED":
        raise HTTPException(400, "삭제된 결과만 원복할 수 있습니다.")
    r.status = "DRAFT"
    r.deleted_at = None
    await db.commit()
    return {"id": r.id, "status": "DRAFT", "restored_at": datetime.now().isoformat()}


# ── Statistics ────────────────────────────────────────────────────────────────

@router.get("/stats")
async def scraping_stats(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return result counts per status plus the number of targets.

    Keys: ``draft``, ``published``, ``deleted``, ``failed`` and ``targets``.
    A status with no rows is reported as 0.
    """
    # One grouped query instead of one COUNT per status.
    rows = (await db.execute(
        select(ScrapingResult.status, func.count()).group_by(ScrapingResult.status)
    )).all()
    per_status = {status: cnt for status, cnt in rows}
    stats = {
        status.lower(): per_status.get(status, 0)
        for status in ("DRAFT", "PUBLISHED", "DELETED", "FAILED")
    }
    stats["targets"] = (await db.execute(
        select(func.count()).select_from(ScrapingTarget)
    )).scalar_one()
    return stats


# ── Internal helpers ──────────────────────────────────────────────────────────

async def _notify_publish(
    result_id: int,
    title: str,
    url: str,
    plain_text: Optional[str],
    room: str,
    custom_msg: Optional[str],
    publisher: str,
) -> None:
    """Send the messenger webhook for a published result (best effort).

    When ``custom_msg`` is given it is sent as the summary; otherwise a
    message is built from the title, URL, the first 300 characters of
    ``plain_text``, the publisher and the result id. Delivery failures are
    logged and never raised.
    """
    import logging
    import os

    import httpx

    base = os.getenv("ITSM_BASE_URL", "http://127.0.0.1:9001")
    text = plain_text or ""
    summary = text[:300].replace("\n", " ")
    msg = custom_msg or (
        f"[스크랩 게시] {title}\n"
        f"URL: {url}\n"
        f"요약: {summary}{'...' if len(text) > 300 else ''}\n"
        f"게시자: {publisher}\n"
        f"결과 ID: #{result_id}"
    )
    payload = {
        "event": "scrap_published",
        "room": room,
        "title": title,
        "summary": msg,
        "result_id": result_id,
    }
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            resp = await client.post(f"{base}/api/messenger/webhook", json=payload)
            # Surface non-2xx answers in the log as well.
            resp.raise_for_status()
    except Exception as e:
        # Publishing already succeeded; a failed notification must not
        # break anything, but it should not disappear silently either.
        logging.getLogger(__name__).warning(
            "publish notification failed for result %s: %s", result_id, e
        )


def _register_scrape_cron(
    target_id: int,
    url: str,
    schedule: str,
    selector: Optional[str],
    actor: str,
) -> None:
    """Register (or replace) the cron scrape job of a target in the scheduler.

    ``schedule`` is a whitespace-separated cron expression; its first five
    fields are used as minute, hour, day, month and day-of-week. Expressions
    with fewer than five fields are rejected with a warning. Registration is
    best effort: errors are logged, not raised.
    """
    import logging

    log = logging.getLogger(__name__)
    try:
        from core.scheduler import scheduler

        parts = schedule.split()
        if len(parts) < 5:
            log.warning(
                "invalid scrape schedule %r for target %s: expected 5 cron fields",
                schedule,
                target_id,
            )
            return
        minute, hour, day, month, dow = parts[:5]
        job_id = f"scrape_target_{target_id}"
        scheduler.add_job(
            _run_scrape_background,
            trigger="cron",
            id=job_id,
            # Re-registering the same target replaces its previous job.
            replace_existing=True,
            minute=minute,
            hour=hour,
            day=day,
            month=month,
            day_of_week=dow,
            args=[target_id, url, selector, actor],
        )
    except Exception as e:
        log.warning("scrape cron 등록 실패: %s", e)


def _run_scrape_background(
    target_id: int,
    url: str,
    selector: Optional[str],
    actor: str,
) -> None:
    """Scrape *url* and store the result; synchronous entry point for the cron job.

    The result is saved as DRAFT (or FAILED when the engine reports an error)
    and the target's ``last_scraped`` is updated when the target still exists.
    """

    async def _inner() -> None:
        eng = await _scrape(url, selector)
        async with SessionLocal() as db:
            rec = ScrapingResult(
                target_id=target_id,
                title=eng.title or url,
                content=eng.content,
                plain_text=eng.plain_text,
                url=url,
                source_html=eng.source_html,
                status="FAILED" if eng.error else "DRAFT",
                meta=eng.meta,
                error_msg=eng.error,
                scraped_by=actor,
            )
            db.add(rec)
            t = await db.get(ScrapingTarget, target_id)
            if t:
                t.last_scraped = datetime.now()
            await db.commit()

    # asyncio.run creates a fresh event loop and always closes it, even when
    # the coroutine raises.
    asyncio.run(_inner())