From 826961f2b924c65996b85cf5ff23f913803704cc Mon Sep 17 00:00:00 2001
From: joungmin <joungmin@joungmins-Mac-mini.local>
Date: Sat, 28 Feb 2026 09:39:05 +0900
Subject: [PATCH] fix: improve title, summary, and content formatting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

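- youtube.py: fetch real title via YouTube oEmbed API; fall back to the video ID only if the lookup fails
- youtube.py: paragraphize transcript text by grouping sentences (4 per para)
- youtube.py: call fetch() on a YouTubeTranscriptApi instance instead of on the class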
- enricher.py: increase max_tokens 1024→2048 to prevent summary truncation
- web.py: restore paragraph breaks after HTML stripping
---
 core/enricher.py |  2 +-
 core/web.py      | 13 ++++++++++--
 core/youtube.py  | 52 ++++++++++++++++++++++++++++++++----------------
 3 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/core/enricher.py b/core/enricher.py
index 2acab5a..66cee2b 100644
--- a/core/enricher.py
+++ b/core/enricher.py
@@ -64,7 +64,7 @@ def enrich(content_type: str, title: str, url: str, text: str) -> dict:
         client = _get_client()
         req = GenericChatRequest(
             messages=[UserMessage(content=[TextContent(text=prompt)])],
-            max_tokens=1024,
+            max_tokens=2048,
             temperature=0,
         )
         det = ChatDetails(
diff --git a/core/web.py b/core/web.py
index d99a0d0..dfa950b 100644
--- a/core/web.py
+++ b/core/web.py
@@ -41,6 +41,15 @@ def _html_to_text(html: str) -> str:
 _HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
 
 
+def _paragraphize(text: str) -> str:
+    """Restore paragraph breaks lost during HTML stripping (blank line after sentence ends)."""
+    # Shrink every run of three or more spaces down to exactly two spaces.
+    text = re.sub(r" {3,}", "  ", text)
+    # Replace two or more whitespace characters that follow ".", "!" or "?" with "\n\n".
+    text = re.sub(r'([.!?])\s{2,}', r'\1\n\n', text)
+    return text.strip()
+
+
 def fetch_page_text(url: str, max_chars: int = 8000) -> str:
     """Fetch a URL and return stripped plain text, truncated to max_chars.
 
@@ -49,11 +58,11 @@ def fetch_page_text(url: str, max_chars: int = 8000) -> str:
         max_chars: Maximum characters to return.
 
     Returns:
-        Extracted plain text, or empty string on failure.
+        Extracted plain text with paragraph breaks, or empty string on failure.
     """
     try:
         r = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
         r.raise_for_status()
-        return _html_to_text(r.text)[:max_chars]
+        return _paragraphize(_html_to_text(r.text))[:max_chars]
     except Exception:
         return ""
diff --git a/core/youtube.py b/core/youtube.py
index 0d2f0fe..e70a812 100644
--- a/core/youtube.py
+++ b/core/youtube.py
@@ -2,27 +2,47 @@
 
 import re
 
+import httpx
 from youtube_transcript_api import YouTubeTranscriptApi
 
 
 def _extract_video_id(url: str) -> str:
-    """Extract YouTube video ID from a URL.
-
-    Args:
-        url: YouTube URL (watch?v= or youtu.be/ formats).
-
-    Returns:
-        The video ID string.
-
-    Raises:
-        ValueError: If no video ID can be found in the URL.
-    """
     match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
     if not match:
         raise ValueError(f"Cannot extract video ID from URL: {url}")
     return match.group(1)
 
 
+def _get_video_title(video_id: str) -> str:
+    """Fetch video title via YouTube oEmbed API (no auth required); fall back to video_id."""
+    try:
+        r = httpx.get(
+            "https://www.youtube.com/oembed",
+            params={"url": f"https://www.youtube.com/watch?v={video_id}", "format": "json"},
+            timeout=5,
+        )
+        r.raise_for_status()
+        return r.json().get("title") or video_id  # missing or empty title -> video_id
+    except Exception:  # best-effort lookup: any failure falls back to the ID
+        return video_id
+
+
+def _paragraphize(text: str, sentences_per_para: int = 4) -> str:
+    """Group flat transcript text into paragraphs of N sentences each.
+
+    A sentence ends at whitespace preceded by ".", "!", "?" or one of the
+    Korean final syllables 다/요/죠/까 (까 also covers the ending 습니까).
+    Values of sentences_per_para below 1 are treated as 1; text without any
+    non-blank content is returned unchanged.
+    """
+    # One syllable per class entry: spelling out 습니까 here would also split
+    # after a bare 습 or 니, which are not sentence endings.
+    parts = [p.strip() for p in re.split(r"(?<=[다요죠까.!?])\s+", text) if p.strip()]
+    step = max(1, sentences_per_para)
+    paras = [" ".join(parts[i:i + step]) for i in range(0, len(parts), step)]
+    return "\n\n".join(paras) if paras else text
+
+
 def get_transcript(url: str) -> dict:
     """Fetch transcript text for a YouTube video.
 
@@ -31,16 +51,14 @@ def get_transcript(url: str) -> dict:
 
     Returns:
         Dict with keys: video_id, title, text, url.
-        title falls back to video_id if unavailable.
     """
     video_id = _extract_video_id(url)
 
-    fetched = YouTubeTranscriptApi.fetch(video_id, languages=["ko", "en"])
-    segments = list(fetched)
-    text = " ".join(seg.text for seg in segments)
+    fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
+    raw_text = " ".join(seg.text for seg in fetched)
 
-    # Try to get title from fetched transcript metadata
-    title = getattr(fetched, "title", None) or video_id
+    title = _get_video_title(video_id)
+    text = _paragraphize(raw_text)
 
     return {
         "video_id": video_id,