fix: improve title, summary, and content formatting
- youtube.py: fetch the real title via the YouTube oEmbed API instead of falling back to the video ID
- youtube.py: paragraphize transcript text by grouping sentences (4 per paragraph)
- enricher.py: increase max_tokens 1024→2048 to prevent summary truncation
- web.py: restore paragraph breaks after HTML stripping
This commit is contained in:
13
core/web.py
13
core/web.py
@@ -41,6 +41,15 @@ def _html_to_text(html: str) -> str:
|
||||
# Request headers passed to httpx.get() in fetch_page_text(); sets a
# browser-compatible User-Agent that also names this client.
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
|
||||
|
||||
|
||||
def _paragraphize(text: str) -> str:
|
||||
"""Restore paragraph breaks lost during HTML stripping."""
|
||||
# Collapse runs of spaces but preserve intentional double-spaces as breaks
|
||||
text = re.sub(r" {3,}", " ", text)
|
||||
# Insert paragraph break after sentence-ending punctuation + space
|
||||
text = re.sub(r'([.!?])\s{2,}', r'\1\n\n', text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def fetch_page_text(url: str, max_chars: int = 8000) -> str:
|
||||
"""Fetch a URL and return stripped plain text, truncated to max_chars.
|
||||
|
||||
@@ -49,11 +58,11 @@ def fetch_page_text(url: str, max_chars: int = 8000) -> str:
|
||||
max_chars: Maximum characters to return.
|
||||
|
||||
Returns:
|
||||
Extracted plain text, or empty string on failure.
|
||||
Extracted plain text with paragraph breaks, or empty string on failure.
|
||||
"""
|
||||
try:
|
||||
r = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
|
||||
r.raise_for_status()
|
||||
return _html_to_text(r.text)[:max_chars]
|
||||
return _paragraphize(_html_to_text(r.text))[:max_chars]
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
Reference in New Issue
Block a user