fix: improve title, summary, and content formatting

- youtube.py: fetch real title via YouTube oEmbed API instead of falling back to video ID
- youtube.py: paragraphize transcript text by grouping sentences (4 per para)
- enricher.py: increase max_tokens 1024→2048 to prevent summary truncation
- web.py: restore paragraph breaks after HTML stripping
2026-02-28 09:39:05 +09:00
parent 9739daf481
commit 826961f2b9
3 changed files with 47 additions and 20 deletions
--- a/core/web.py
+++ b/core/web.py
@@ -41,6 +41,15 @@ def _html_to_text(html: str) -> str:
 _HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}


+def _paragraphize(text: str) -> str:
+    """Restore paragraph breaks: sentence end + 2 or more whitespace becomes a blank line."""
+    # Squeeze runs of 3+ spaces down to exactly two, so a double space still marks a break
+    text = re.sub(r" {3,}", "  ", text)
+    # After '.', '!' or '?', replace 2+ whitespace chars (any kind) with a paragraph break
+    text = re.sub(r'([.!?])\s{2,}', r'\1\n\n', text)
+    return text.strip()
+
+
 def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

@@ -49,11 +58,11 @@ def fetch_page_text(url: str, max_chars: int = 8000) -> str:
        max_chars: Maximum characters to return.

    Returns:
-        Extracted plain text, or empty string on failure.
+        Extracted plain text with paragraph breaks, or empty string on failure.
    """
    try:
        r = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
        r.raise_for_status()
-        return _html_to_text(r.text)[:max_chars]
+        return _paragraphize(_html_to_text(r.text))[:max_chars]
    except Exception:
        return ""