# knowledge-inbox/core/youtube.py

"""YouTube transcript extraction via youtube-transcript-api."""

import re

from youtube_transcript_api import YouTubeTranscriptApi


def _extract_video_id(url: str) -> str:
    """Extract YouTube video ID from a URL.

    Args:
        url: YouTube URL (watch?v= or youtu.be/ formats).

    Returns:
        The video ID string.

    Raises:
        ValueError: If no video ID can be found in the URL.
    """
    match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
    if not match:
        raise ValueError(f"Cannot extract video ID from URL: {url}")
    return match.group(1)


def get_transcript(url: str, languages=("ko", "en")) -> dict:
    """Fetch transcript text for a YouTube video.

    Args:
        url: YouTube video URL.
        languages: Language codes passed to the transcript API as the
            ``languages`` argument. Defaults to Korean, then English.

    Returns:
        Dict with keys: video_id, title, text, url.
        ``text`` is the text of every transcript segment joined with single
        spaces. ``title`` falls back to video_id if unavailable.

    Raises:
        ValueError: If no video ID can be extracted from ``url``.

    Exceptions raised by youtube-transcript-api while fetching are not
    caught here and propagate to the caller.
    """
    video_id = _extract_video_id(url)

    # ``fetch`` is an instance method in the youtube-transcript-api 1.x API
    # (the one whose segments expose ``.text``), so the client must be
    # instantiated before calling it.
    fetched = YouTubeTranscriptApi().fetch(video_id, languages=list(languages))
    text = " ".join(segment.text for segment in fetched)

    # Use a title from the fetched transcript if it carries one.
    # NOTE(review): the fetched transcript object does not appear to expose
    # a ``title`` attribute, so this likely always yields video_id — confirm.
    title = getattr(fetched, "title", None) or video_id

    return {
        "video_id": video_id,
        "title": title,
        "text": text,
        "url": url,
    }