# knowledge-inbox/core/web.py

"""URL fetching and HTML-to-text extraction."""

import logging
import re
from html.parser import HTMLParser

import httpx


class _TextExtractor(HTMLParser):
    _SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}

    def __init__(self) -> None:
        super().__init__()
        self._buf: list[str] = []
        self._skip = 0

    def handle_starttag(self, tag: str, attrs: list) -> None:
        if tag in self._SKIP_TAGS:
            self._skip += 1

    def handle_endtag(self, tag: str) -> None:
        if tag in self._SKIP_TAGS and self._skip:
            self._skip -= 1

    def handle_data(self, data: str) -> None:
        if not self._skip:
            text = data.strip()
            if text:
                self._buf.append(text)

    def get_text(self) -> str:
        return " ".join(self._buf)


def _html_to_text(html: str) -> str:
    """Reduce an HTML document to the text gathered by ``_TextExtractor``.

    Any run of three or more whitespace characters in the extracted text is
    collapsed to exactly two spaces.
    """
    extractor = _TextExtractor()
    extractor.feed(html)
    extracted = extractor.get_text()
    return re.sub(r"\s{3,}", "  ", extracted)


_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}


def _paragraphize(text: str) -> str:
    """Restore paragraph breaks lost during HTML stripping."""
    # Collapse runs of spaces but preserve intentional double-spaces as breaks
    text = re.sub(r" {3,}", "  ", text)
    # Insert paragraph break after sentence-ending punctuation + space
    text = re.sub(r'([.!?])\s{2,}', r'\1\n\n', text)
    return text.strip()


def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

    This is a best-effort helper: any failure while requesting the page,
    checking its status, or extracting text is logged as a warning and
    reported to the caller as an empty string instead of an exception.

    Args:
        url: The URL to fetch.
        max_chars: Maximum characters to return.

    Returns:
        Extracted plain text with paragraph breaks, or empty string on failure.
    """
    try:
        response = httpx.get(
            url, timeout=15, follow_redirects=True, headers=_HEADERS
        )
        response.raise_for_status()
        return _paragraphize(_html_to_text(response.text))[:max_chars]
    except Exception as exc:
        # Deliberately broad so callers never have to handle fetch errors,
        # but the cause is logged so failures can be told apart from pages
        # that are genuinely empty.
        logging.getLogger(__name__).warning(
            "Failed to fetch page text from %s: %s", url, exc
        )
        return ""