[하네스]
- agents/scraping-bot.md: 스크랩 봇 에이전트
- skills/scraping-orchestrator/SKILL.md: E2E 워크플로우
[ITSM Backend]
- models.py: ScrapingTarget + ScrapingResult ORM + Pydantic 스키마
- core/scraping_engine.py: BeautifulSoup 기반 비동기 스크래퍼
- routers/scraping.py: 13개 API (타겟 CRUD + run + 게시/삭제/원복 + 통계)
- routers/messenger.py: !scrap 봇 명령어 6종 + scrap_published 이벤트
- main.py: scraping 라우터 등록
[Manager UI]
- ScrapingManager.tsx: 결과 목록/상세/게시/삭제/원복 + 타겟 관리
- Sidebar.tsx: 🕷️ 스크랩핑 봇 메뉴 추가
- App.tsx: /scraping 라우트 추가
[테스트 결과 - 전체 통과]
- T1 타겟 등록 OK
- T2 즉시 스크랩: zioinfo.co.kr → DRAFT
- T3 결과 목록 조회 OK
- T4 게시: DRAFT → PUBLISHED + 메신저 알림
- T5/T6/T7 두번째 스크랩 → 삭제 → 원복 OK
- T8 통계: draft:1, published:1, deleted:0
- T9 !scrap list 봇 명령어 OK
- T10 !scrap status 봇 명령어 OK
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
379 lines
13 KiB
Python
379 lines
13 KiB
Python
"""
|
|
Scraping bot router.

- ScrapingTarget CRUD (register scraping targets)
- Immediate / scheduled scrape execution
- Result management: DRAFT → PUBLISHED (messenger notification) / DELETED / restore
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from datetime import datetime
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import select, func, desc
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from core.auth import get_current_user
|
|
from core.scraping_engine import scrape as _scrape
|
|
from database import get_db, SessionLocal
|
|
from models import (
|
|
ScrapingTarget, ScrapingResult,
|
|
ScrapingTargetOut, ScrapingTargetCreate,
|
|
ScrapingResultOut, User,
|
|
)
|
|
|
|
router = APIRouter(prefix="/api/scraping", tags=["scraping"])
|
|
|
|
|
|
# ── ScrapingTarget CRUD ───────────────────────────────────────────────────────
|
|
|
|
@router.post("/targets", response_model=ScrapingTargetOut)
async def create_target(
    body: ScrapingTargetCreate,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Register a scraping target URL.

    Persists a new ``ScrapingTarget`` built from the request body and stamps
    it with the calling user's username as ``created_by``. When the request
    carries a schedule and the target is active, a cron job is registered
    for it through ``_register_scrape_cron``.

    Returns:
        The refreshed ``ScrapingTarget`` row.
    """
    owner = current_user.username
    new_target = ScrapingTarget(
        name=body.name,
        url=body.url,
        selector=body.selector,
        schedule=body.schedule,
        is_active=body.is_active,
        note=body.note,
        created_by=owner,
    )
    db.add(new_target)
    await db.commit()
    # Reload the row so attributes such as ``id`` are available below.
    await db.refresh(new_target)

    needs_cron = body.schedule and body.is_active
    if needs_cron:
        _register_scrape_cron(
            new_target.id,
            new_target.url,
            body.schedule,
            body.selector,
            owner,
        )
    return new_target
|
|
|
|
|
|
@router.get("/targets", response_model=List[ScrapingTargetOut])
async def list_targets(
    is_active: Optional[bool] = Query(None),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List scraping targets, newest first.

    Args:
        is_active: When given, only targets whose ``is_active`` flag equals
            this value are returned; when omitted, no filter is applied.

    Returns:
        All matching ``ScrapingTarget`` rows ordered by ``created_at``
        descending.
    """
    stmt = select(ScrapingTarget)
    if is_active is not None:
        stmt = stmt.where(ScrapingTarget.is_active == is_active)
    stmt = stmt.order_by(desc(ScrapingTarget.created_at))

    rows = await db.execute(stmt)
    return rows.scalars().all()
|
|
|
|
|
|
@router.get("/targets/{target_id}", response_model=ScrapingTargetOut)
async def get_target(
    target_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Fetch a single scraping target by primary key.

    Raises:
        HTTPException: 404 when no target with ``target_id`` exists.
    """
    target = await db.get(ScrapingTarget, target_id)
    if target:
        return target
    raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.")
|
|
|
|
|
|
@router.put("/targets/{target_id}", response_model=ScrapingTargetOut)
async def update_target(
    target_id: int,
    body: ScrapingTargetCreate,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Overwrite a scraping target and keep its cron job in sync.

    Every field of the request body is copied onto the stored target. After
    the commit, the scheduler job ``scrape_target_{id}`` is dropped and, if
    the updated target still has a schedule and is active, registered again
    with the new settings. Previously an update never reached the scheduler,
    so a changed or disabled schedule kept firing with stale arguments.

    Raises:
        HTTPException: 404 when no target with ``target_id`` exists.
    """
    t = await db.get(ScrapingTarget, target_id)
    if not t:
        raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.")
    for k, v in body.model_dump().items():
        setattr(t, k, v)
    await db.commit()
    await db.refresh(t)

    # Drop the previous job first so a removed/disabled/invalid schedule does
    # not leave the old cron running. Best-effort: scheduler problems must
    # not fail the update itself.
    try:
        from core.scheduler import scheduler

        job_id = f"scrape_target_{t.id}"
        if scheduler.get_job(job_id) is not None:
            scheduler.remove_job(job_id)
    except Exception as e:
        import logging

        logging.getLogger(__name__).warning("scrape cron 해제 실패: %s", e)

    if t.schedule and t.is_active:
        _register_scrape_cron(
            t.id, t.url, t.schedule, t.selector, current_user.username,
        )
    return t
|
|
|
|
|
|
@router.delete("/targets/{target_id}")
async def delete_target(
    target_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Delete a scraping target and unschedule its cron job.

    The scheduler job ``scrape_target_{id}`` (created when a target is
    registered with a schedule) is removed after the row is deleted;
    otherwise it would keep scraping on behalf of a target that no longer
    exists.

    Returns:
        ``{"deleted": target_id}``.

    Raises:
        HTTPException: 404 when no target with ``target_id`` exists.
    """
    t = await db.get(ScrapingTarget, target_id)
    if not t:
        raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.")
    await db.delete(t)
    await db.commit()

    # Best-effort cleanup: a scheduler problem must not fail the deletion.
    try:
        from core.scheduler import scheduler

        job_id = f"scrape_target_{target_id}"
        if scheduler.get_job(job_id) is not None:
            scheduler.remove_job(job_id)
    except Exception as e:
        import logging

        logging.getLogger(__name__).warning("scrape cron 해제 실패: %s", e)

    return {"deleted": target_id}
|
|
|
|
|
|
# ── 스크랩 실행 ──────────────────────────────────────────────────────────────
|
|
|
|
class RunRequest(BaseModel):
    """Request body for an immediate scrape (``POST /run``)."""

    # Page to scrape.
    url: str
    # Optional selector forwarded to the scraping engine.
    selector: Optional[str] = None
    # Optional ScrapingTarget id the result is attached to.
    target_id: Optional[int] = None
    # Messenger room recorded on the stored result.
    messenger_room: str = "ops"
|
|
|
|
|
|
@router.post("/run", response_model=ScrapingResultOut)
async def run_scrape(
    body: RunRequest,
    bg: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Scrape a URL immediately and store the outcome.

    The result is saved as ``DRAFT``, or as ``FAILED`` when the engine
    reports an error. When ``target_id`` is supplied, the target must exist;
    its ``last_scraped`` timestamp is updated in the same transaction.

    ``bg`` is not used by this handler; it stays in the signature so the
    endpoint's interface is unchanged.

    Raises:
        HTTPException: 404 when ``target_id`` is given but no such target
            exists. Previously the result was stored with a dangling
            ``target_id``.
    """
    # NOTE(review): body.url is fetched server-side as supplied by the client;
    # confirm the scraping engine restricts schemes/hosts (SSRF).
    eng_result = await _scrape(body.url, body.selector)

    # The lookup happens after the scrape so no DB work is started while the
    # (potentially slow) remote fetch is in flight.
    target = None
    if body.target_id is not None:
        target = await db.get(ScrapingTarget, body.target_id)
        if target is None:
            raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.")

    rec = ScrapingResult(
        target_id=body.target_id,
        title=eng_result.title or body.url,  # fall back to the URL as title
        content=eng_result.content,
        plain_text=eng_result.plain_text,
        url=body.url,
        source_html=eng_result.source_html,
        status="FAILED" if eng_result.error else "DRAFT",
        meta=eng_result.meta,
        error_msg=eng_result.error,
        messenger_room=body.messenger_room,
        scraped_by=current_user.username,
    )
    db.add(rec)

    if target is not None:
        target.last_scraped = datetime.now()

    await db.commit()
    await db.refresh(rec)
    return rec
|
|
|
|
|
|
# ── 결과 조회 ─────────────────────────────────────────────────────────────────
|
|
|
|
@router.get("/results", response_model=List[ScrapingResultOut])
async def list_results(
    status: Optional[str] = Query(None),
    target_id: Optional[int] = Query(None),
    page: int = Query(1, ge=1),
    size: int = Query(20, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return one page of scraping results, most recently scraped first.

    Args:
        status: Optional exact-match filter on the result status; an empty
            value means no filter.
        target_id: Optional filter on the owning target.
        page: 1-based page number.
        size: Page size (1-100).

    Returns:
        The ``ScrapingResult`` rows of the requested page.
    """
    q = select(ScrapingResult).order_by(desc(ScrapingResult.scraped_at))
    if status:
        q = q.where(ScrapingResult.status == status)
    if target_id is not None:
        q = q.where(ScrapingResult.target_id == target_id)

    # The former COUNT(*) query was removed: its result was never returned or
    # otherwise used, so it only cost an extra round-trip per request.
    q = q.offset((page - 1) * size).limit(size)
    return (await db.execute(q)).scalars().all()
|
|
|
|
|
|
@router.get("/results/{result_id}", response_model=ScrapingResultOut)
async def get_result(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Fetch a single scraping result by primary key.

    Raises:
        HTTPException: 404 when no result with ``result_id`` exists.
    """
    found = await db.get(ScrapingResult, result_id)
    if found:
        return found
    raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
|
|
|
|
|
|
@router.get("/results/{result_id}/html")
async def get_result_html(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return the stored source HTML of a result (used for restore preview).

    Returns:
        ``{"id", "url", "source_html"}``; ``source_html`` is an empty string
        when nothing was stored.

    Raises:
        HTTPException: 404 when no result with ``result_id`` exists. The
            detail message now matches the other result endpoints instead of
            being empty.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    return {"id": r.id, "url": r.url, "source_html": r.source_html or ""}
|
|
|
|
|
|
# ── 상태 전환: 게시 ───────────────────────────────────────────────────────────
|
|
|
|
class PublishRequest(BaseModel):
    """Request body for publishing a scraping result."""

    # Messenger room the publish notification is sent to.
    room: str = "ops"
    # When set, replaces the auto-generated notification text.
    custom_message: Optional[str] = None
|
|
|
|
|
|
@router.post("/results/{result_id}/publish")
async def publish_result(
    result_id: int,
    body: PublishRequest,
    bg: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Publish a DRAFT result and notify the messenger room.

    Only ``DRAFT`` results may be published. Previously a ``DELETED`` result
    (or any unexpected status) slipped through and became ``PUBLISHED`` while
    still carrying its ``deleted_at`` timestamp.

    The messenger notification is sent from a background task after the
    response is returned.

    Returns:
        ``{"id", "status": "PUBLISHED", "published_at"}`` with
        ``published_at`` as an ISO-8601 string.

    Raises:
        HTTPException: 404 when the result does not exist; 400 when it is
            not in ``DRAFT`` state.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    if r.status == "PUBLISHED":
        raise HTTPException(400, "이미 게시된 결과입니다.")
    if r.status == "FAILED":
        raise HTTPException(400, "실패한 스크랩은 게시할 수 없습니다.")
    if r.status != "DRAFT":
        # Covers DELETED (must be restored first) and any unknown status.
        raise HTTPException(400, "DRAFT 상태의 결과만 게시할 수 있습니다.")

    r.status = "PUBLISHED"
    r.published_at = datetime.now()
    r.published_by = current_user.username
    r.messenger_room = body.room
    await db.commit()
    await db.refresh(r)

    bg.add_task(
        _notify_publish,
        r.id, r.title, r.url, r.plain_text,
        body.room, body.custom_message, current_user.username,
    )
    return {
        "id": r.id,
        "status": "PUBLISHED",
        "published_at": r.published_at.isoformat(),
    }
|
|
|
|
|
|
# ── 상태 전환: 삭제 ───────────────────────────────────────────────────────────
|
|
|
|
@router.delete("/results/{result_id}")
async def delete_result(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Soft-delete a result: mark it ``DELETED`` and keep the stored data.

    Returns:
        ``{"id": result_id, "status": "DELETED"}``.

    Raises:
        HTTPException: 404 when the result does not exist; 400 when it is
            already deleted.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    if r.status == "DELETED":
        raise HTTPException(400, "이미 삭제된 결과입니다.")
    r.status = "DELETED"
    r.deleted_at = datetime.now()
    await db.commit()
    # Use the path parameter rather than ``r.id``: the instance is not
    # refreshed after commit, and reading an expired attribute on an
    # AsyncSession would trigger implicit IO (depends on expire_on_commit).
    return {"id": result_id, "status": "DELETED"}
|
|
|
|
|
|
# ── 상태 전환: 원복 ───────────────────────────────────────────────────────────
|
|
|
|
@router.post("/results/{result_id}/restore")
async def restore_result(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Restore a soft-deleted result: ``DELETED`` → ``DRAFT``.

    Clears ``deleted_at`` and sets the status back to ``DRAFT``.

    Returns:
        ``{"id", "status": "DRAFT", "restored_at"}`` where ``restored_at`` is
        the current time as an ISO-8601 string (it is not persisted here).

    Raises:
        HTTPException: 404 when the result does not exist; 400 when it is
            not currently ``DELETED``.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    if r.status != "DELETED":
        raise HTTPException(400, "삭제된 결과만 원복할 수 있습니다.")
    r.status = "DRAFT"
    r.deleted_at = None
    await db.commit()
    # Use the path parameter rather than ``r.id``: the instance is not
    # refreshed after commit, and reading an expired attribute on an
    # AsyncSession would trigger implicit IO (depends on expire_on_commit).
    return {
        "id": result_id,
        "status": "DRAFT",
        "restored_at": datetime.now().isoformat(),
    }
|
|
|
|
|
|
# ── 통계 ─────────────────────────────────────────────────────────────────────
|
|
|
|
@router.get("/stats")
async def scraping_stats(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return result counts per status plus the total number of targets.

    Returns:
        A dict with the keys ``draft``, ``published``, ``deleted``,
        ``failed`` (0 when no row has that status) and ``targets``.
    """
    statuses = ("DRAFT", "PUBLISHED", "DELETED", "FAILED")

    # One GROUP BY query instead of one COUNT query per status.
    rows = (await db.execute(
        select(ScrapingResult.status, func.count())
        .where(ScrapingResult.status.in_(statuses))
        .group_by(ScrapingResult.status)
    )).all()
    counts = {row_status: cnt for row_status, cnt in rows}

    # Statuses with no rows are absent from the GROUP BY output; report 0.
    stats = {name.lower(): counts.get(name, 0) for name in statuses}
    stats["targets"] = (await db.execute(
        select(func.count()).select_from(ScrapingTarget)
    )).scalar_one()
    return stats
|
|
|
|
|
|
# ── 내부 헬퍼 ────────────────────────────────────────────────────────────────
|
|
|
|
async def _notify_publish(
    result_id: int, title: str, url: str,
    plain_text: str, room: str, custom_msg: Optional[str], publisher: str,
) -> None:
    """Send the ``scrap_published`` event to the messenger webhook.

    The webhook base URL is read from ``ITSM_BASE_URL`` (default
    ``http://127.0.0.1:9001``). Delivery is best-effort: failures, including
    non-2xx responses, are logged as warnings and never raised, since this
    runs as a background task after the publish has already been committed.

    Args:
        result_id: Id of the published result.
        title: Result title.
        url: Scraped URL.
        plain_text: Plain-text body; its first 300 characters form the summary.
        room: Messenger room to notify.
        custom_msg: When truthy, used verbatim instead of the generated text.
        publisher: Username of the publishing user.
    """
    import logging
    import os

    import httpx

    base = os.getenv("ITSM_BASE_URL", "http://127.0.0.1:9001")
    text = plain_text or ""
    summary = text[:300].replace("\n", " ")
    ellipsis = "..." if len(text) > 300 else ""
    msg = custom_msg or (
        f"[스크랩 게시] {title}\n"
        f"URL: {url}\n"
        f"요약: {summary}{ellipsis}\n"
        f"게시자: {publisher}\n"
        f"결과 ID: #{result_id}"
    )
    payload = {
        "event": "scrap_published",
        "room": room,
        "title": title,
        "summary": msg,
        "result_id": result_id,
    }
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            response = await client.post(
                f"{base}/api/messenger/webhook", json=payload,
            )
            # Surface HTTP-level failures too, not only transport errors.
            response.raise_for_status()
    except Exception as e:
        # Deliberately swallowed (best-effort), but no longer silent.
        logging.getLogger(__name__).warning(
            "스크랩 게시 알림 전송 실패 (result_id=%s): %s", result_id, e,
        )
|
|
|
|
|
|
def _register_scrape_cron(
    target_id: int, url: str, schedule: str,
    selector: Optional[str], actor: str,
) -> None:
    """Register (or replace) the cron job that scrapes a target.

    ``schedule`` is a whitespace-separated cron expression; its first five
    fields are used as minute, hour, day, month and day-of-week. The job id
    is ``scrape_target_{target_id}`` and an existing job with that id is
    replaced.

    Never raises: a malformed schedule or any scheduler error is logged as a
    warning. Previously a schedule with fewer than five fields was dropped
    silently.

    Args:
        target_id: Id of the target the job belongs to.
        url: URL passed to the background scrape.
        schedule: Cron expression with at least five fields.
        selector: Optional selector passed to the background scrape.
        actor: Username passed to the background scrape.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Validate before touching the scheduler so bad input is reported as such.
    parts = (schedule or "").split()
    if len(parts) < 5:
        logger.warning(
            "scrape cron 등록 건너뜀 (target_id=%s): 잘못된 스케줄 %r",
            target_id, schedule,
        )
        return
    minute, hour, day, month, dow = parts[:5]

    try:
        from core.scheduler import scheduler

        scheduler.add_job(
            _run_scrape_background,
            trigger="cron",
            id=f"scrape_target_{target_id}",
            replace_existing=True,
            minute=minute, hour=hour, day=day, month=month, day_of_week=dow,
            args=[target_id, url, selector, actor],
        )
    except Exception as e:
        logger.warning("scrape cron 등록 실패: %s", e)
|
|
|
|
|
|
def _run_scrape_background(
    target_id: int, url: str, selector: Optional[str], actor: str,
) -> None:
    """Synchronous cron entry point: scrape a target and store the result.

    Scrapes ``url``, stores a ``ScrapingResult`` (``FAILED`` when the engine
    reports an error, otherwise ``DRAFT``) and, if the target still exists,
    updates its ``last_scraped`` timestamp.

    The coroutine is driven with ``asyncio.run``, which creates a fresh event
    loop and always closes it. The previous manual loop was not closed when
    the coroutine raised.

    Args:
        target_id: Id of the target being scraped.
        url: URL to scrape.
        selector: Optional selector forwarded to the scraping engine.
        actor: Username recorded as ``scraped_by``.
    """
    async def _inner() -> None:
        from database import SessionLocal

        eng = await _scrape(url, selector)
        async with SessionLocal() as db:
            rec = ScrapingResult(
                target_id=target_id, title=eng.title or url,
                content=eng.content, plain_text=eng.plain_text,
                url=url, source_html=eng.source_html,
                status="FAILED" if eng.error else "DRAFT",
                meta=eng.meta, error_msg=eng.error, scraped_by=actor,
            )
            db.add(rec)
            t = await db.get(ScrapingTarget, target_id)
            if t:
                t.last_scraped = datetime.now()
            await db.commit()

    # NOTE(review): this runs on its own event loop; confirm the async engine
    # behind SessionLocal is safe to use from a loop other than the app's.
    asyncio.run(_inner())
|