- youtube.py: fetch real title via YouTube oEmbed API instead of falling back to video ID
- youtube.py: paragraphize transcript text by grouping sentences (4 per para)
- enricher.py: increase max_tokens 1024→2048 to prevent summary truncation
- web.py: restore paragraph breaks after HTML stripping
69 lines
1.9 KiB
Python
69 lines
1.9 KiB
Python
"""URL fetching and HTML-to-text extraction."""
|
|
|
|
import re
|
|
from html.parser import HTMLParser
|
|
|
|
import httpx
|
|
|
|
|
|
class _TextExtractor(HTMLParser):
|
|
_SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
|
|
|
|
def __init__(self) -> None:
|
|
super().__init__()
|
|
self._buf: list[str] = []
|
|
self._skip = 0
|
|
|
|
def handle_starttag(self, tag: str, attrs: list) -> None:
|
|
if tag in self._SKIP_TAGS:
|
|
self._skip += 1
|
|
|
|
def handle_endtag(self, tag: str) -> None:
|
|
if tag in self._SKIP_TAGS and self._skip:
|
|
self._skip -= 1
|
|
|
|
def handle_data(self, data: str) -> None:
|
|
if not self._skip:
|
|
text = data.strip()
|
|
if text:
|
|
self._buf.append(text)
|
|
|
|
def get_text(self) -> str:
|
|
return " ".join(self._buf)
|
|
|
|
|
|
def _html_to_text(html: str) -> str:
    """Convert an HTML document to plain text.

    Args:
        html: The complete HTML markup to convert.

    Returns:
        The text gathered by ``_TextExtractor``, with every run of three or
        more whitespace characters collapsed to a single space.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() can hold back trailing input it considers incomplete (for
    # example text ending in an unterminated "&..." reference). The whole
    # document has been supplied, so close() flushes that remainder instead
    # of silently dropping it.
    parser.close()
    return re.sub(r"\s{3,}", " ", parser.get_text())
|
|
|
|
|
|
# HTTP request headers passed to httpx.get() in fetch_page_text; the
# User-Agent string identifies this client as knowledge-inbox/1.0.
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
|
|
|
|
|
|
def _paragraphize(text: str) -> str:
|
|
"""Restore paragraph breaks lost during HTML stripping."""
|
|
# Collapse runs of spaces but preserve intentional double-spaces as breaks
|
|
text = re.sub(r" {3,}", " ", text)
|
|
# Insert paragraph break after sentence-ending punctuation + space
|
|
text = re.sub(r'([.!?])\s{2,}', r'\1\n\n', text)
|
|
return text.strip()
|
|
|
|
|
|
def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Download a page and return its text content, capped at max_chars.

    The request follows redirects, sends the module's ``_HEADERS`` and uses
    a 15 second timeout. The response body is converted to plain text,
    paragraph breaks are restored, and the result is truncated.

    Args:
        url: Address of the page to download.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The extracted text, or an empty string if the request, the HTTP
        status check or the text extraction raises any exception.
    """
    try:
        response = httpx.get(
            url,
            headers=_HEADERS,
            follow_redirects=True,
            timeout=15,
        )
        response.raise_for_status()
        plain_text = _html_to_text(response.text)
        return _paragraphize(plain_text)[:max_chars]
    except Exception:
        # Best-effort contract: any failure yields an empty result.
        return ""
|