"""YouTube transcript extraction via youtube-transcript-api.""" import re import httpx from youtube_transcript_api import YouTubeTranscriptApi def _extract_video_id(url: str) -> str: match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url) if not match: raise ValueError(f"Cannot extract video ID from URL: {url}") return match.group(1) def _get_video_title(video_id: str) -> str: """Fetch video title via YouTube oEmbed API (no auth required).""" try: r = httpx.get( "https://www.youtube.com/oembed", params={"url": f"https://www.youtube.com/watch?v={video_id}", "format": "json"}, timeout=5, ) r.raise_for_status() return r.json().get("title") or video_id except Exception: return video_id def _paragraphize(text: str, sentences_per_para: int = 4) -> str: """Group flat transcript text into readable paragraphs. Splits on Korean/English sentence endings and groups by sentence count. """ # Split on sentence-ending punctuation followed by whitespace parts = re.split(r'(?<=[다요죠습니까요\.!?])\s+', text) parts = [p.strip() for p in parts if p.strip()] paras = [] for i in range(0, len(parts), sentences_per_para): paras.append(" ".join(parts[i:i + sentences_per_para])) return "\n\n".join(paras) if paras else text def get_transcript(url: str) -> dict: """Fetch transcript text for a YouTube video. Args: url: YouTube video URL. Returns: Dict with keys: video_id, title, text, url. """ video_id = _extract_video_id(url) fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"]) raw_text = " ".join(seg.text for seg in fetched) title = _get_video_title(video_id) text = _paragraphize(raw_text) return { "video_id": video_id, "title": title, "text": text, "url": url, }