Files
knowledge-inbox/core/web.py
joungmin 826961f2b9 fix: improve title, summary, and content formatting
- youtube.py: fetch real title via YouTube oEmbed API instead of falling back to video ID
- youtube.py: paragraphize transcript text by grouping sentences (4 per para)
- enricher.py: increase max_tokens 1024→2048 to prevent summary truncation
- web.py: restore paragraph breaks after HTML stripping
2026-02-28 09:39:05 +09:00

69 lines
1.9 KiB
Python

"""URL fetching and HTML-to-text extraction."""
import re
from html.parser import HTMLParser
import httpx
class _TextExtractor(HTMLParser):
_SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
def __init__(self) -> None:
super().__init__()
self._buf: list[str] = []
self._skip = 0
def handle_starttag(self, tag: str, attrs: list) -> None:
if tag in self._SKIP_TAGS:
self._skip += 1
def handle_endtag(self, tag: str) -> None:
if tag in self._SKIP_TAGS and self._skip:
self._skip -= 1
def handle_data(self, data: str) -> None:
if not self._skip:
text = data.strip()
if text:
self._buf.append(text)
def get_text(self) -> str:
return " ".join(self._buf)
def _html_to_text(html: str) -> str:
    """Strip the markup from *html* and return its visible text.

    Args:
        html: Raw HTML source.

    Returns:
        The text gathered by ``_TextExtractor``, with every run of three or
        more whitespace characters collapsed to a single space. Shorter runs
        are left untouched.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() may hold back trailing text that could still turn out to be a
    # partial character reference (e.g. a document ending in "R&D");
    # close() forces that buffered remainder through handle_data().
    parser.close()
    return re.sub(r"\s{3,}", " ", parser.get_text())
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
def _paragraphize(text: str) -> str:
"""Restore paragraph breaks lost during HTML stripping."""
# Collapse runs of spaces but preserve intentional double-spaces as breaks
text = re.sub(r" {3,}", " ", text)
# Insert paragraph break after sentence-ending punctuation + space
text = re.sub(r'([.!?])\s{2,}', r'\1\n\n', text)
return text.strip()
def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Download *url* and return its visible text, cut to *max_chars*.

    Args:
        url: Address of the page to download.
        max_chars: The result is sliced to its first ``max_chars`` characters.

    Returns:
        The page's plain text with paragraph breaks, or ``""`` if the
        request, the status check, or the text conversion raises.
    """
    try:
        response = httpx.get(
            url,
            headers=_HEADERS,
            follow_redirects=True,
            timeout=15,
        )
        # Treat 4xx/5xx responses as failures rather than as page content.
        response.raise_for_status()
        plain = _html_to_text(response.text)
        formatted = _paragraphize(plain)
        return formatted[:max_chars]
    except Exception:
        # Best-effort contract: any failure is reported as an empty string.
        return ""