"""URL fetching and HTML-to-text extraction."""

import logging
import re
from html.parser import HTMLParser

import httpx

_logger = logging.getLogger(__name__)


class _TextExtractor(HTMLParser):
    """Accumulate the text of an HTML document, skipping non-content elements.

    Text found inside any element listed in ``_SKIP_TAGS`` is ignored.
    Nesting of those elements is tracked with a depth counter, so text is
    collected only while the counter is zero.
    """

    _SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}

    def __init__(self) -> None:
        super().__init__()
        self._buf: list[str] = []  # stripped, non-empty text fragments
        self._skip = 0  # how many skipped elements are currently open
        self._body_seen = False  # True once the first <body> tag was handled

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Enter a skipped element, or leave the head when ``<body>`` opens."""
        if tag == "body" and not self._body_seen:
            # HTML allows ``</head>`` to be omitted; the ``<body>`` start tag
            # then closes the head implicitly. Without this reset the counter
            # would stay positive and every piece of body text would be lost.
            self._body_seen = True
            self._skip = 0
        if tag in self._SKIP_TAGS:
            self._skip += 1

    def handle_endtag(self, tag: str) -> None:
        """Leave a skipped element; the counter never drops below zero."""
        if tag in self._SKIP_TAGS and self._skip:
            self._skip -= 1

    def handle_data(self, data: str) -> None:
        """Store ``data`` (stripped) unless inside a skipped element."""
        if self._skip:
            return
        text = data.strip()
        if text:
            self._buf.append(text)

    def get_text(self) -> str:
        """Return all collected fragments joined by single spaces."""
        return " ".join(self._buf)


def _html_to_text(html: str) -> str:
    """Convert an HTML string to plain text.

    Args:
        html: The HTML markup to convert.

    Returns:
        The collected text fragments joined by single spaces, with every run
        of three or more whitespace characters replaced by one space.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() may hold back trailing input it cannot classify yet (for example
    # text ending in something that looks like an unfinished "&..." character
    # reference). close() forces that remainder through handle_data().
    parser.close()
    return re.sub(r"\s{3,}", " ", parser.get_text())


_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}


def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

    Args:
        url: The URL to fetch.
        max_chars: Maximum characters to return.

    Returns:
        Extracted plain text, or empty string on failure.
    """
    try:
        response = httpx.get(
            url,
            timeout=15,
            follow_redirects=True,
            headers=_HEADERS,
        )
        response.raise_for_status()
        return _html_to_text(response.text)[:max_chars]
    except Exception:
        # Deliberately broad: this function is best-effort and its contract is
        # to return "" on any failure. The cause is logged at DEBUG level so
        # that failures remain diagnosable instead of vanishing silently.
        _logger.debug("Failed to fetch page text from %s", url, exc_info=True)
        return ""