fix: improve title, summary, and content formatting
- youtube.py: fetch real title via YouTube oEmbed API instead of falling back to video ID - youtube.py: paragraphize transcript text by grouping sentences (4 per para) - enricher.py: increase max_tokens 1024→2048 to prevent summary truncation - web.py: restore paragraph breaks after HTML stripping
This commit is contained in:
@@ -64,7 +64,7 @@ def enrich(content_type: str, title: str, url: str, text: str) -> dict:
|
|||||||
client = _get_client()
|
client = _get_client()
|
||||||
req = GenericChatRequest(
|
req = GenericChatRequest(
|
||||||
messages=[UserMessage(content=[TextContent(text=prompt)])],
|
messages=[UserMessage(content=[TextContent(text=prompt)])],
|
||||||
max_tokens=1024,
|
max_tokens=2048,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
)
|
)
|
||||||
det = ChatDetails(
|
det = ChatDetails(
|
||||||
|
|||||||
13
core/web.py
13
core/web.py
@@ -41,6 +41,15 @@ def _html_to_text(html: str) -> str:
|
|||||||
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
|
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
|
||||||
|
|
||||||
|
|
||||||
|
def _paragraphize(text: str) -> str:
|
||||||
|
"""Restore paragraph breaks lost during HTML stripping."""
|
||||||
|
# Collapse runs of spaces but preserve intentional double-spaces as breaks
|
||||||
|
text = re.sub(r" {3,}", " ", text)
|
||||||
|
# Insert paragraph break after sentence-ending punctuation + space
|
||||||
|
text = re.sub(r'([.!?])\s{2,}', r'\1\n\n', text)
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

    Args:
        url: Page URL to fetch; redirects are followed.
        max_chars: Maximum characters to return.

    Returns:
        Extracted plain text with paragraph breaks, or empty string on failure.
    """
    try:
        # _HEADERS carries a browser-like User-Agent so naive bot blocking
        # does not reject the request.
        r = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
        r.raise_for_status()
        # Restore paragraph structure BEFORE truncating, so the cut falls
        # on already-formatted text rather than re-formatting a fragment.
        return _paragraphize(_html_to_text(r.text))[:max_chars]
    except Exception:
        # Deliberate best-effort swallow: callers treat "" as
        # "no content available" rather than an error.
        return ""
|
||||||
|
|||||||
@@ -2,27 +2,47 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import httpx
|
||||||
from youtube_transcript_api import YouTubeTranscriptApi
|
from youtube_transcript_api import YouTubeTranscriptApi
|
||||||
|
|
||||||
|
|
||||||
def _extract_video_id(url: str) -> str:
|
def _extract_video_id(url: str) -> str:
|
||||||
"""Extract YouTube video ID from a URL.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
url: YouTube URL (watch?v= or youtu.be/ formats).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The video ID string.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If no video ID can be found in the URL.
|
|
||||||
"""
|
|
||||||
match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
|
match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
|
||||||
if not match:
|
if not match:
|
||||||
raise ValueError(f"Cannot extract video ID from URL: {url}")
|
raise ValueError(f"Cannot extract video ID from URL: {url}")
|
||||||
return match.group(1)
|
return match.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_video_title(video_id: str) -> str:
    """Fetch video title via YouTube oEmbed API (no auth required).

    Best-effort: any network, HTTP, or JSON failure falls back to
    returning the video_id itself, so callers always get a usable
    title string.
    """
    try:
        r = httpx.get(
            "https://www.youtube.com/oembed",
            params={"url": f"https://www.youtube.com/watch?v={video_id}", "format": "json"},
            timeout=5,  # short timeout: a cosmetic lookup must not stall ingestion
        )
        r.raise_for_status()
        # oEmbed can return a null/empty title; fall back to the ID then too.
        return r.json().get("title") or video_id
    except Exception:
        # Deliberate broad swallow: the title is cosmetic and the
        # transcript pipeline should proceed regardless.
        return video_id
|
||||||
|
|
||||||
|
|
||||||
|
def _paragraphize(text: str, sentences_per_para: int = 4) -> str:
|
||||||
|
"""Group flat transcript text into readable paragraphs.
|
||||||
|
|
||||||
|
Splits on Korean/English sentence endings and groups by sentence count.
|
||||||
|
"""
|
||||||
|
# Split on sentence-ending punctuation followed by whitespace
|
||||||
|
parts = re.split(r'(?<=[다요죠습니까요\.!?])\s+', text)
|
||||||
|
parts = [p.strip() for p in parts if p.strip()]
|
||||||
|
|
||||||
|
paras = []
|
||||||
|
for i in range(0, len(parts), sentences_per_para):
|
||||||
|
paras.append(" ".join(parts[i:i + sentences_per_para]))
|
||||||
|
|
||||||
|
return "\n\n".join(paras) if paras else text
|
||||||
|
|
||||||
|
|
||||||
def get_transcript(url: str) -> dict:
|
def get_transcript(url: str) -> dict:
|
||||||
"""Fetch transcript text for a YouTube video.
|
"""Fetch transcript text for a YouTube video.
|
||||||
|
|
||||||
@@ -31,16 +51,14 @@ def get_transcript(url: str) -> dict:
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict with keys: video_id, title, text, url.
|
Dict with keys: video_id, title, text, url.
|
||||||
title falls back to video_id if unavailable.
|
|
||||||
"""
|
"""
|
||||||
video_id = _extract_video_id(url)
|
video_id = _extract_video_id(url)
|
||||||
|
|
||||||
fetched = YouTubeTranscriptApi.fetch(video_id, languages=["ko", "en"])
|
fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
|
||||||
segments = list(fetched)
|
raw_text = " ".join(seg.text for seg in fetched)
|
||||||
text = " ".join(seg.text for seg in segments)
|
|
||||||
|
|
||||||
# Try to get title from fetched transcript metadata
|
title = _get_video_title(video_id)
|
||||||
title = getattr(fetched, "title", None) or video_id
|
text = _paragraphize(raw_text)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"video_id": video_id,
|
"video_id": video_id,
|
||||||
|
|||||||
Reference in New Issue
Block a user