fix: improve title, summary, and content formatting

- youtube.py: fetch real title via YouTube oEmbed API instead of falling back to video ID
- youtube.py: paragraphize transcript text by grouping sentences (4 per para)
- enricher.py: increase max_tokens 1024→2048 to prevent summary truncation
- web.py: restore paragraph breaks after HTML stripping
This commit is contained in:
joungmin
2026-02-28 09:39:05 +09:00
parent 9739daf481
commit 826961f2b9
3 changed files with 47 additions and 20 deletions

View File

@@ -41,6 +41,15 @@ def _html_to_text(html: str) -> str:
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
def _paragraphize(text: str) -> str:
"""Restore paragraph breaks lost during HTML stripping."""
# Collapse runs of spaces but preserve intentional double-spaces as breaks
text = re.sub(r" {3,}", " ", text)
# Insert paragraph break after sentence-ending punctuation + space
text = re.sub(r'([.!?])\s{2,}', r'\1\n\n', text)
return text.strip()
def fetch_page_text(url: str, max_chars: int = 8000) -> str:
"""Fetch a URL and return stripped plain text, truncated to max_chars.
@@ -49,11 +58,11 @@ def fetch_page_text(url: str, max_chars: int = 8000) -> str:
max_chars: Maximum characters to return.
Returns:
Extracted plain text, or empty string on failure.
Extracted plain text with paragraph breaks, or empty string on failure.
"""
try:
r = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
r.raise_for_status()
return _html_to_text(r.text)[:max_chars]
return _paragraphize(_html_to_text(r.text))[:max_chars]
except Exception:
return ""