# guardia-itsm/core/scraping_engine.py

"""
Scraping Engine — BeautifulSoup 기반 웹 스크랩핑
"""
from __future__ import annotations

import re
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

import httpx

# bs4 is an optional dependency: remember whether the import worked so that
# scrape() can report a readable error instead of the module failing to load.
try:
    from bs4 import BeautifulSoup
except ImportError:
    _BS4_OK = False
else:
    _BS4_OK = True


# Default request headers passed to the HTTP client: a desktop Chrome
# User-Agent, Korean-first language preference and an HTML-first Accept list.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/124.0 Safari/537.36",
        )
    ),
    "Accept-Language": "ko-KR,ko;q=0.9,en;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}


class ScrapingResult:
    """Mutable record of a single scrape attempt.

    All text fields start empty and the collections start empty; ``error``
    stays ``None`` until a failure is recorded. ``__slots__`` fixes the set
    of attributes, so assigning an unknown attribute raises AttributeError.
    """

    __slots__ = (
        "url",
        "title",
        "content",
        "plain_text",
        "source_html",
        "meta",
        "links",
        "images",
        "scraped_at",
        "error",
    )

    def __init__(self):
        # Creation time as a naive ISO-8601 string.
        self.scraped_at = datetime.now().isoformat()
        self.error: Optional[str] = None

        # Textual fields.
        self.url = ""
        self.title = ""
        self.content = ""
        self.plain_text = ""
        self.source_html = ""

        # Collections get a fresh object per instance.
        self.meta: Dict[str, str] = {}
        self.links: List[str] = []
        self.images: List[str] = []

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict with large fields truncated.

        ``plain_text`` is cut to 5000 characters (for DB storage),
        ``source_html`` to 500000 characters, ``links`` to 50 entries and
        ``images`` to 20 entries; every other field is passed through as-is.
        Keys appear in the same order as ``__slots__``.
        """
        caps = {
            "plain_text": 5000,
            "source_html": 500000,
            "links": 50,
            "images": 20,
        }
        snapshot: Dict[str, Any] = {}
        for attr in ScrapingResult.__slots__:
            value = getattr(self, attr)
            cap = caps.get(attr)
            snapshot[attr] = value if cap is None else value[:cap]
        return snapshot


async def scrape(
    url: str,
    selector: Optional[str] = None,
    timeout: int = 30,
) -> ScrapingResult:
    """Fetch ``url`` and extract its title, meta tags, body, links and images.

    Fetch failures, a malformed selector and a selector that matches nothing
    are reported through ``result.error`` instead of being raised.

    Args:
        url: Page to download; redirects are followed.
        selector: Optional CSS selector. When given, only the first matching
            element is used as the body; otherwise a list of common content
            containers is tried in priority order.
        timeout: Value passed to ``httpx.AsyncClient(timeout=...)``.

    Returns:
        A ``ScrapingResult``. ``error`` is ``None`` when nothing went wrong.
    """
    result = ScrapingResult()
    result.url = url

    if not _BS4_OK:
        result.error = "bs4 미설치. pip install beautifulsoup4 lxml"
        return result

    try:
        async with httpx.AsyncClient(
            timeout=timeout,
            headers=HEADERS,
            follow_redirects=True,
        ) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            html = resp.text
            # Relative hrefs must be resolved against the document that was
            # actually served, i.e. the final URL after any redirects.
            page_url = str(resp.url)
    except httpx.TimeoutException:
        result.error = f"타임아웃 ({timeout}초)"
        return result
    except Exception as e:
        # Best-effort API: any other fetch failure becomes a short message.
        result.error = str(e)[:200]
        return result

    result.source_html = html

    soup = BeautifulSoup(html, "lxml" if _lxml_ok() else "html.parser")

    # Title: prefer og:title, then <title>, then fall back to the host name.
    title_tag = soup.find("title")
    og_title = soup.find("meta", property="og:title")
    result.title = (
        (og_title.get("content", "") if og_title else "")
        or (title_tag.get_text(strip=True) if title_tag else "")
        or urlparse(url).netloc
    )

    # Meta tags keyed by name (or property); values capped at 300 characters.
    for m in soup.find_all("meta"):
        name = m.get("name") or m.get("property", "")
        content = m.get("content", "")
        if name and content:
            result.meta[name] = content[:300]

    # Body: explicit selector, or automatic extraction.
    if selector:
        try:
            target = soup.select_one(selector)
        except Exception as exc:
            # A malformed selector makes select_one raise; report it like the
            # other failures instead of letting it escape to the caller.
            target = None
            result.error = f"셀렉터 '{selector}' 구문 오류: {str(exc)[:200]}"
        else:
            if not target:
                result.error = f"셀렉터 '{selector}' 미매칭"
        if target:
            result.content = str(target)
            result.plain_text = target.get_text(separator="\n", strip=True)
    else:
        # Automatic priority: article > main > *content* id/class >
        # *article* id > body. The first candidate with more than 100
        # characters of text wins.
        for tag in ("article", "main", '[id*="content"]', '[class*="content"]',
                    '[id*="article"]', "body"):
            node = soup.select_one(tag)
            if node and len(node.get_text(strip=True)) > 100:
                result.content = str(node)[:200000]
                result.plain_text = _clean_text(node.get_text(separator="\n", strip=True))
                break
        else:
            # No candidate was long enough: fall back to the whole document.
            result.plain_text = _clean_text(soup.get_text(separator="\n", strip=True))
            result.content = html[:200000]

    # Links: first 100 anchors, made absolute, http(s) only.
    for a in soup.find_all("a", href=True, limit=100):
        href = urljoin(page_url, a["href"])
        if href.startswith("http"):
            result.links.append(href)

    # Images: first 30 <img> tags, made absolute, http(s) only.
    for img in soup.find_all("img", src=True, limit=30):
        src = urljoin(page_url, img["src"])
        if src.startswith("http"):
            result.images.append(src)

    return result


def _clean_text(text: str) -> str:
    """Normalize whitespace in extracted text and cap its length.

    Runs of three or more newlines collapse to one blank line, runs of two or
    more spaces/tabs collapse to a single space, surrounding whitespace is
    stripped, and the result is cut to at most 10000 characters.
    """
    substitutions = (
        (r"\n{3,}", "\n\n"),
        (r"[ \t]{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    stripped = text.strip()
    return stripped[:10000]


def _lxml_ok() -> bool:
    """Return True when the ``lxml`` package can be imported, else False."""
    try:
        import lxml  # noqa
    except ImportError:
        return False
    return True