Files
knowledge-inbox/core/youtube.py
joungmin 826961f2b9 fix: improve title, summary, and content formatting
- youtube.py: fetch real title via YouTube oEmbed API instead of falling back to video ID
- youtube.py: paragraphize transcript text by grouping sentences (4 per para)
- enricher.py: increase max_tokens 1024→2048 to prevent summary truncation
- web.py: restore paragraph breaks after HTML stripping
2026-02-28 09:39:05 +09:00

69 lines
1.9 KiB
Python

"""YouTube transcript extraction via youtube-transcript-api."""
import re
import httpx
from youtube_transcript_api import YouTubeTranscriptApi
def _extract_video_id(url: str) -> str:
match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
if not match:
raise ValueError(f"Cannot extract video ID from URL: {url}")
return match.group(1)
def _get_video_title(video_id: str) -> str:
"""Fetch video title via YouTube oEmbed API (no auth required)."""
try:
r = httpx.get(
"https://www.youtube.com/oembed",
params={"url": f"https://www.youtube.com/watch?v={video_id}", "format": "json"},
timeout=5,
)
r.raise_for_status()
return r.json().get("title") or video_id
except Exception:
return video_id
def _paragraphize(text: str, sentences_per_para: int = 4) -> str:
"""Group flat transcript text into readable paragraphs.
Splits on Korean/English sentence endings and groups by sentence count.
"""
# Split on sentence-ending punctuation followed by whitespace
parts = re.split(r'(?<=[다요죠습니까요\.!?])\s+', text)
parts = [p.strip() for p in parts if p.strip()]
paras = []
for i in range(0, len(parts), sentences_per_para):
paras.append(" ".join(parts[i:i + sentences_per_para]))
return "\n\n".join(paras) if paras else text
def get_transcript(url: str) -> dict:
    """Download a YouTube video's transcript and bundle it with metadata.

    The transcript is requested with the language list ``["ko", "en"]``.
    Its segments are joined with single spaces and then regrouped into
    paragraphs. Errors raised while fetching the transcript are not caught
    here and propagate to the caller.

    Args:
        url: YouTube video URL.

    Returns:
        Dict with keys: video_id, title, text, url. ``url`` is the value
        passed in, unmodified.

    Raises:
        ValueError: If no video ID can be extracted from ``url``.
    """
    video_id = _extract_video_id(url)

    segments = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
    flat_text = " ".join([segment.text for segment in segments])

    result = {"video_id": video_id}
    result["title"] = _get_video_title(video_id)
    result["text"] = _paragraphize(flat_text)
    result["url"] = url
    return result