From 826961f2b924c65996b85cf5ff23f913803704cc Mon Sep 17 00:00:00 2001 From: joungmin Date: Sat, 28 Feb 2026 09:39:05 +0900 Subject: [PATCH] fix: improve title, summary, and content formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - youtube.py: fetch real title via YouTube oEmbed API instead of falling back to video ID - youtube.py: paragraphize transcript text by grouping sentences (4 per para) - enricher.py: increase max_tokens 1024→2048 to prevent summary truncation - web.py: restore paragraph breaks after HTML stripping --- core/enricher.py | 2 +- core/web.py | 13 ++++++++++-- core/youtube.py | 52 ++++++++++++++++++++++++++++++++---------------- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/core/enricher.py b/core/enricher.py index 2acab5a..66cee2b 100644 --- a/core/enricher.py +++ b/core/enricher.py @@ -64,7 +64,7 @@ def enrich(content_type: str, title: str, url: str, text: str) -> dict: client = _get_client() req = GenericChatRequest( messages=[UserMessage(content=[TextContent(text=prompt)])], - max_tokens=1024, + max_tokens=2048, temperature=0, ) det = ChatDetails( diff --git a/core/web.py b/core/web.py index d99a0d0..dfa950b 100644 --- a/core/web.py +++ b/core/web.py @@ -41,6 +41,15 @@ def _html_to_text(html: str) -> str: _HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"} +def _paragraphize(text: str) -> str: + """Restore paragraph breaks lost during HTML stripping.""" + # Collapse runs of spaces but preserve intentional double-spaces as breaks + text = re.sub(r" {3,}", " ", text) + # Insert paragraph break after sentence-ending punctuation + space + text = re.sub(r'([.!?])\s{2,}', r'\1\n\n', text) + return text.strip() + + def fetch_page_text(url: str, max_chars: int = 8000) -> str: """Fetch a URL and return stripped plain text, truncated to max_chars. @@ -49,11 +58,11 @@ def fetch_page_text(url: str, max_chars: int = 8000) -> str: max_chars: Maximum characters to return. Returns: - Extracted plain text, or empty string on failure. + Extracted plain text with paragraph breaks, or empty string on failure. """ try: r = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS) r.raise_for_status() - return _html_to_text(r.text)[:max_chars] + return _paragraphize(_html_to_text(r.text))[:max_chars] except Exception: return "" diff --git a/core/youtube.py b/core/youtube.py index 0d2f0fe..e70a812 100644 --- a/core/youtube.py +++ b/core/youtube.py @@ -2,27 +2,47 @@ import re +import httpx from youtube_transcript_api import YouTubeTranscriptApi def _extract_video_id(url: str) -> str: - """Extract YouTube video ID from a URL. - - Args: - url: YouTube URL (watch?v= or youtu.be/ formats). - - Returns: - The video ID string. - - Raises: - ValueError: If no video ID can be found in the URL. - """ match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url) if not match: raise ValueError(f"Cannot extract video ID from URL: {url}") return match.group(1) +def _get_video_title(video_id: str) -> str: + """Fetch video title via YouTube oEmbed API (no auth required).""" + try: + r = httpx.get( + "https://www.youtube.com/oembed", + params={"url": f"https://www.youtube.com/watch?v={video_id}", "format": "json"}, + timeout=5, + ) + r.raise_for_status() + return r.json().get("title") or video_id + except Exception: + return video_id + + +def _paragraphize(text: str, sentences_per_para: int = 4) -> str: + """Group flat transcript text into readable paragraphs. + + Splits on Korean/English sentence endings and groups by sentence count. + """ + # Split on sentence-ending punctuation followed by whitespace + parts = re.split(r'(?<=[다요죠습니까요\.!?])\s+', text) + parts = [p.strip() for p in parts if p.strip()] + + paras = [] + for i in range(0, len(parts), sentences_per_para): + paras.append(" ".join(parts[i:i + sentences_per_para])) + + return "\n\n".join(paras) if paras else text + + def get_transcript(url: str) -> dict: """Fetch transcript text for a YouTube video. @@ -31,16 +51,14 @@ def get_transcript(url: str) -> dict: Returns: Dict with keys: video_id, title, text, url. - title falls back to video_id if unavailable. """ video_id = _extract_video_id(url) - fetched = YouTubeTranscriptApi.fetch(video_id, languages=["ko", "en"]) - segments = list(fetched) - text = " ".join(seg.text for seg in segments) + fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"]) + raw_text = " ".join(seg.text for seg in fetched) - # Try to get title from fetched transcript metadata - title = getattr(fetched, "title", None) or video_id + title = _get_video_title(video_id) + text = _paragraphize(raw_text) return { "video_id": video_id,