tasteby/backend/core/youtube.py

"""YouTube channel scanner + transcript extraction.

Uses YouTube Data API v3 for channel video listing,
youtube-transcript-api for transcript extraction.
"""

from __future__ import annotations

import logging
import os
import re
from datetime import datetime

import httpx
from youtube_transcript_api import YouTubeTranscriptApi

from core.db import conn

logger = logging.getLogger(__name__)


def _api_key() -> str:
    """Return the YouTube Data API key read from ``YOUTUBE_DATA_API_KEY``.

    Raises:
        KeyError: If the environment variable is not set.
    """
    environment = os.environ
    return environment["YOUTUBE_DATA_API_KEY"]


# Recognised URL shapes: ``...?v=ID``, ``youtu.be/ID`` and the
# ``/shorts/ID``, ``/embed/ID`` and ``/live/ID`` paths on youtube.com
# (or youtube-nocookie.com). The ID stops at the first query (``&``/``?``),
# fragment (``#``), path (``/``) or whitespace delimiter.
_VIDEO_ID_RE = re.compile(
    r"(?:v=|youtu\.be/|youtube(?:-nocookie)?\.com/(?:shorts|embed|live)/)"
    r"([^&?#/\s]+)"
)


def extract_video_id(url: str) -> str:
    """Extract the video ID from a YouTube URL.

    Args:
        url: A watch, short-link, shorts, embed or live URL.

    Returns:
        The video ID, without any trailing query string or ``#fragment``.

    Raises:
        ValueError: If no video ID can be located in ``url``.
    """
    match = _VIDEO_ID_RE.search(url)
    if match is None:
        raise ValueError(f"Cannot extract video ID from URL: {url}")
    return match.group(1)


# -- Channel operations -------------------------------------------------------

def add_channel(channel_id: str, channel_name: str) -> str:
    """Insert a row into ``channels`` and return the generated row id.

    The channel URL stored alongside is derived from ``channel_id``; the new
    id is read back through the ``RETURNING ... INTO`` bind variable.
    """
    import oracledb

    insert_sql = """
        INSERT INTO channels (channel_id, channel_name, channel_url)
        VALUES (:cid, :cname, :curl)
        RETURNING id INTO :out_id
    """
    channel_url = f"https://www.youtube.com/channel/{channel_id}"
    with conn() as connection:
        cursor = connection.cursor()
        new_id = cursor.var(oracledb.STRING)
        binds = {
            "cid": channel_id,
            "cname": channel_name,
            "curl": channel_url,
            "out_id": new_id,
        }
        cursor.execute(insert_sql, binds)
        # The bind variable yields a sequence; the first entry is the id.
        returned = new_id.getvalue()
        return returned[0]


def get_active_channels() -> list[dict]:
    """Return every ``channels`` row whose ``is_active`` column equals 1.

    Each entry is a dict with the keys ``id``, ``channel_id`` and
    ``channel_name``.
    """
    keys = ("id", "channel_id", "channel_name")
    query = "SELECT id, channel_id, channel_name FROM channels WHERE is_active = 1"
    with conn() as connection:
        cursor = connection.cursor()
        cursor.execute(query)
        channels = []
        for row in cursor.fetchall():
            channels.append(dict(zip(keys, row)))
        return channels


# -- Video listing via YouTube Data API v3 ------------------------------------

def fetch_channel_videos(
    channel_id: str,
    max_results: int = 50,
    published_after: str | None = None,
) -> list[dict]:
    """Fetch video list from a YouTube channel via Data API v3.

    Pages through the ``search`` endpoint (newest first, videos only) until
    ``max_results`` entries are collected or no further page is offered.

    Args:
        channel_id: YouTube channel id to list videos for.
        max_results: Upper bound on the number of videos returned. A value
            of zero or less returns an empty list without any request.
        published_after: Optional value forwarded as the ``publishedAfter``
            query parameter.

    Returns:
        List of dicts with keys ``video_id``, ``title``, ``published_at``
        and ``url``, at most ``max_results`` long.

    Raises:
        KeyError: If ``YOUTUBE_DATA_API_KEY`` is not set.
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    # Nothing was asked for: skip the network round-trip. This also keeps a
    # negative limit from reaching the API or mis-slicing the result below.
    if max_results <= 0:
        return []

    # NOTE(review): the key travels in the query string, so it is part of
    # the request URL; confirm that error messages/logs containing the URL
    # are acceptable or scrubbed.
    params: dict = {
        "key": _api_key(),
        "channelId": channel_id,
        "part": "snippet",
        "order": "date",
        "maxResults": min(max_results, 50),  # page size is capped at 50
        "type": "video",
    }
    if published_after:
        params["publishedAfter"] = published_after

    videos: list[dict] = []
    next_page = None

    while True:
        if next_page:
            params["pageToken"] = next_page

        r = httpx.get(
            "https://www.googleapis.com/youtube/v3/search",
            params=params,
            timeout=15,
        )
        r.raise_for_status()
        data = r.json()

        for item in data.get("items", []):
            snippet = item["snippet"]
            vid = item["id"]["videoId"]
            videos.append({
                "video_id": vid,
                "title": snippet["title"],
                "published_at": snippet["publishedAt"],
                "url": f"https://www.youtube.com/watch?v={vid}",
            })

        next_page = data.get("nextPageToken")
        if not next_page or len(videos) >= max_results:
            break

    # The last page may overshoot the requested count; trim the excess.
    return videos[:max_results]


# -- Transcript extraction ----------------------------------------------------

def get_transcript(video_id: str) -> str | None:
    """Return the transcript of a video as a single space-joined string.

    The transcript is requested with the language codes ``ko`` and ``en``.
    This is best-effort: any failure is logged as a warning and reported
    to the caller as ``None``.
    """
    try:
        api = YouTubeTranscriptApi()
        segments = api.fetch(video_id, languages=["ko", "en"])
        pieces = [segment.text for segment in segments]
        return " ".join(pieces)
    except Exception as exc:
        logger.warning("Transcript unavailable for %s: %s", video_id, exc)
        return None


# -- DB operations for videos -------------------------------------------------

def save_video(channel_db_id: str, video: dict) -> str | None:
    """Insert a ``videos`` row with status ``'pending'``.

    ``video`` supplies ``video_id``, ``title``, ``url`` and optionally
    ``published_at`` (ISO-8601 text; a ``Z`` suffix is rewritten to
    ``+00:00`` before parsing).

    Returns the new row id, or ``None`` when the failure text mentions the
    ``UQ_VIDEOS_VID`` constraint (treated as "already stored"). Any other
    error propagates unchanged.
    """
    import oracledb

    insert_sql = """
        INSERT INTO videos (channel_id, video_id, title, url, published_at, status)
        VALUES (:ch_id, :vid, :title, :url, :pub_at, 'pending')
        RETURNING id INTO :out_id
    """
    with conn() as connection:
        cursor = connection.cursor()
        new_id = cursor.var(oracledb.STRING)
        try:
            raw_published = video.get("published_at")
            published = (
                datetime.fromisoformat(raw_published.replace("Z", "+00:00"))
                if raw_published
                else None
            )
            binds = {
                "ch_id": channel_db_id,
                "vid": video["video_id"],
                "title": video["title"],
                "url": video["url"],
                "pub_at": published,
                "out_id": new_id,
            }
            cursor.execute(insert_sql, binds)
            return new_id.getvalue()[0]
        except Exception as exc:
            # Duplicates are recognised by the constraint name in the
            # error text (case-insensitive); everything else is re-raised.
            if "UQ_VIDEOS_VID" not in str(exc).upper():
                raise
            return None


def get_pending_videos(limit: int = 10) -> list[dict]:
    """Return up to ``limit`` videos whose status is ``'pending'``.

    Rows are ordered by ``created_at``; each entry is a dict with the keys
    ``id``, ``video_id``, ``title`` and ``url``.
    """
    keys = ("id", "video_id", "title", "url")
    query = """
        SELECT id, video_id, title, url
        FROM videos
        WHERE status = 'pending'
        ORDER BY created_at
        FETCH FIRST :n ROWS ONLY
    """
    with conn() as connection:
        cursor = connection.cursor()
        cursor.execute(query, {"n": limit})
        pending = []
        for row in cursor.fetchall():
            pending.append(dict(zip(keys, row)))
        return pending


def update_video_status(
    video_db_id: str,
    status: str,
    transcript: str | None = None,
    llm_raw: str | None = None,
) -> None:
    """Set a video's status and stamp ``processed_at`` with ``SYSTIMESTAMP``.

    ``transcript`` and ``llm_raw`` are written to ``transcript_text`` and
    ``llm_raw_response`` only when truthy; ``None`` or an empty string
    leaves the corresponding column untouched.
    """
    assignments = ["status = :st", "processed_at = SYSTIMESTAMP"]
    binds: dict = {"st": status, "vid": video_db_id}

    # (SET fragment, bind name, value) for each optional column.
    optional_columns = (
        ("transcript_text = :txt", "txt", transcript),
        ("llm_raw_response = :llm_resp", "llm_resp", llm_raw),
    )
    for fragment, bind_name, value in optional_columns:
        if not value:
            continue
        assignments.append(fragment)
        binds[bind_name] = value

    set_clause = ", ".join(assignments)
    statement = f"UPDATE videos SET {set_clause} WHERE id = :vid"
    with conn() as connection:
        cursor = connection.cursor()
        cursor.execute(statement, binds)


# -- Scan: fetch new videos for all active channels ---------------------------

def scan_all_channels(max_per_channel: int = 50) -> int:
    """Scan all active channels for new videos.

    For every active channel, fetches up to ``max_per_channel`` videos and
    tries to store each one. A failure while scanning one channel is logged
    and does not stop the remaining channels.

    Args:
        max_per_channel: Maximum number of videos fetched per channel.

    Returns:
        Total number of newly stored videos across all channels, including
        videos stored for a channel before its scan failed.
    """
    total_new = 0
    for ch in get_active_channels():
        # Counted separately so the per-channel log line reports this
        # channel's own figure rather than the running total.
        channel_new = 0
        try:
            videos = fetch_channel_videos(ch["channel_id"], max_per_channel)
            for v in videos:
                if save_video(ch["id"], v):
                    channel_new += 1
                    total_new += 1
            logger.info(
                "Channel %s: fetched %d videos, %d new",
                ch["channel_name"], len(videos), channel_new,
            )
        except Exception as e:
            # Best-effort boundary: keep going with the next channel, but
            # record the traceback so the failure can be diagnosed.
            logger.exception(
                "Failed to scan channel %s: %s", ch["channel_name"], e
            )
    return total_new