Initial commit: Tasteby - YouTube restaurant map service
Backend (FastAPI + Oracle ADB), Frontend (Next.js), daemon worker. Features: channel/video/restaurant management, semantic search, Google OAuth, user reviews. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
221
backend/core/youtube.py
Normal file
221
backend/core/youtube.py
Normal file
@@ -0,0 +1,221 @@
|
||||
"""YouTube channel scanner + transcript extraction.
|
||||
|
||||
Uses YouTube Data API v3 for channel video listing,
|
||||
youtube-transcript-api for transcript extraction.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
import httpx
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
from core.db import conn
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _api_key() -> str:
|
||||
return os.environ["YOUTUBE_DATA_API_KEY"]
|
||||
|
||||
|
||||
def extract_video_id(url: str) -> str:
    """Extract the video ID from a YouTube URL.

    Supports watch URLs (``?v=...``), ``youtu.be`` short links, and
    ``/shorts/``, ``/embed/``, ``/live/`` path forms.

    Args:
        url: Any YouTube video URL.

    Returns:
        The raw video ID string.

    Raises:
        ValueError: if no video ID can be found in *url*.
    """
    match = re.search(
        r"(?:v=|youtu\.be/|/shorts/|/embed/|/live/)([^&?/\s]+)", url
    )
    if not match:
        raise ValueError(f"Cannot extract video ID from URL: {url}")
    return match.group(1)
|
||||
|
||||
|
||||
# -- Channel operations -------------------------------------------------------
|
||||
|
||||
def add_channel(channel_id: str, channel_name: str) -> str:
    """Register a YouTube channel. Returns DB row id."""
    import oracledb

    sql = """
        INSERT INTO channels (channel_id, channel_name, channel_url)
        VALUES (:cid, :cname, :curl)
        RETURNING id INTO :out_id
    """
    channel_url = f"https://www.youtube.com/channel/{channel_id}"
    with conn() as c:
        cur = c.cursor()
        # Oracle RETURNING ... INTO needs an output bind variable.
        row_id = cur.var(oracledb.STRING)
        bind = {
            "cid": channel_id,
            "cname": channel_name,
            "curl": channel_url,
            "out_id": row_id,
        }
        cur.execute(sql, bind)
        return row_id.getvalue()[0]
|
||||
|
||||
|
||||
def get_active_channels() -> list[dict]:
    """Return id / channel_id / channel_name dicts for every active channel."""
    keys = ("id", "channel_id", "channel_name")
    with conn() as c:
        cur = c.cursor()
        cur.execute(
            "SELECT id, channel_id, channel_name FROM channels WHERE is_active = 1"
        )
        return [dict(zip(keys, row)) for row in cur.fetchall()]
|
||||
|
||||
|
||||
# -- Video listing via YouTube Data API v3 ------------------------------------
|
||||
|
||||
def fetch_channel_videos(
    channel_id: str,
    max_results: int = 50,
    published_after: str | None = None,
) -> list[dict]:
    """Fetch a channel's recent videos via the Data API v3 search endpoint.

    Args:
        channel_id: YouTube channel ID (the ``UC...`` form).
        max_results: Upper bound on videos returned; pages of at most 50
            are fetched until this bound or the last page is reached.
        published_after: Optional RFC 3339 timestamp filter.

    Returns:
        List of dicts with keys: video_id, title, published_at, url.

    Raises:
        httpx.HTTPStatusError: on a non-2xx API response.
    """
    base_params: dict = {
        "key": _api_key(),
        "channelId": channel_id,
        "part": "snippet",
        "order": "date",
        "type": "video",
    }
    if published_after:
        base_params["publishedAfter"] = published_after

    videos: list[dict] = []
    next_page: str | None = None

    # One Client reuses the TCP/TLS connection across pagination requests.
    with httpx.Client(timeout=15) as client:
        while len(videos) < max_results:
            params = dict(base_params)
            # The API caps page size at 50; never ask for more than we still need.
            params["maxResults"] = min(max_results - len(videos), 50)
            if next_page:
                params["pageToken"] = next_page

            r = client.get(
                "https://www.googleapis.com/youtube/v3/search",
                params=params,
            )
            r.raise_for_status()
            data = r.json()

            for item in data.get("items", []):
                vid = item.get("id", {}).get("videoId")
                if not vid:
                    # Defensive: skip any non-video result the API slips in.
                    continue
                snippet = item["snippet"]
                videos.append({
                    "video_id": vid,
                    "title": snippet["title"],
                    "published_at": snippet["publishedAt"],
                    "url": f"https://www.youtube.com/watch?v={vid}",
                })

            next_page = data.get("nextPageToken")
            if not next_page:
                break

    return videos[:max_results]
|
||||
|
||||
|
||||
# -- Transcript extraction ----------------------------------------------------
|
||||
|
||||
def get_transcript(video_id: str) -> str | None:
    """Fetch transcript text for a video. Returns None if unavailable."""
    try:
        segments = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
        parts = [segment.text for segment in segments]
        return " ".join(parts)
    except Exception as exc:  # best-effort: transcript disabled/missing, or API failure
        logger.warning("Transcript unavailable for %s: %s", video_id, exc)
        return None
|
||||
|
||||
|
||||
# -- DB operations for videos -------------------------------------------------
|
||||
|
||||
def save_video(channel_db_id: str, video: dict) -> str | None:
    """Insert a video row if not exists. Returns row id or None if duplicate.

    Args:
        channel_db_id: DB row id of the owning channel.
        video: Dict with keys ``video_id``, ``title``, ``url`` and optional
            ``published_at`` (RFC 3339 string, e.g. "2024-01-01T00:00:00Z").

    Returns:
        The new row id, or None when the video_id already exists (detected
        via the UQ_VIDEOS_VID unique-constraint name in the DB error).

    Raises:
        ValueError: if ``published_at`` is present but not ISO-8601.
    """
    import oracledb

    sql = """
        INSERT INTO videos (channel_id, video_id, title, url, published_at, status)
        VALUES (:ch_id, :vid, :title, :url, :pub_at, 'pending')
        RETURNING id INTO :out_id
    """
    # Parse the timestamp BEFORE the DB try-block so a malformed date raises
    # a plain ValueError instead of being inspected as a DB duplicate error.
    pub_at = None
    if video.get("published_at"):
        # fromisoformat on older Pythons rejects a trailing "Z"; normalize it.
        pub_at = datetime.fromisoformat(
            video["published_at"].replace("Z", "+00:00")
        )

    with conn() as c:
        cur = c.cursor()
        out_id = cur.var(oracledb.STRING)
        try:
            cur.execute(sql, {
                "ch_id": channel_db_id,
                "vid": video["video_id"],
                "title": video["title"],
                "url": video["url"],
                "pub_at": pub_at,
                "out_id": out_id,
            })
        except Exception as e:
            # Only a violation of the video_id unique constraint means "duplicate".
            if "UQ_VIDEOS_VID" in str(e).upper():
                return None
            raise
        return out_id.getvalue()[0]
|
||||
|
||||
|
||||
def get_pending_videos(limit: int = 10) -> list[dict]:
    """Return up to *limit* of the oldest videos still in 'pending' status."""
    sql = """
        SELECT id, video_id, title, url
        FROM videos
        WHERE status = 'pending'
        ORDER BY created_at
        FETCH FIRST :n ROWS ONLY
    """
    keys = ("id", "video_id", "title", "url")
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"n": limit})
        return [dict(zip(keys, row)) for row in cur.fetchall()]
|
||||
|
||||
|
||||
def update_video_status(
    video_db_id: str,
    status: str,
    transcript: str | None = None,
    llm_raw: str | None = None,
) -> None:
    """Set a video's status and, when provided, its transcript / raw LLM output.

    Only non-empty optional values are written; the SET clause is assembled
    from fixed fragments with all values parameter-bound (no injection risk).
    """
    assignments = ["status = :st", "processed_at = SYSTIMESTAMP"]
    params: dict = {"st": status, "vid": video_db_id}
    optional_columns = [
        (transcript, "transcript_text = :txt", "txt"),
        (llm_raw, "llm_raw_response = :llm_resp", "llm_resp"),
    ]
    for value, clause, bind_name in optional_columns:
        if value:
            assignments.append(clause)
            params[bind_name] = value
    sql = f"UPDATE videos SET {', '.join(assignments)} WHERE id = :vid"
    with conn() as c:
        c.cursor().execute(sql, params)
|
||||
|
||||
|
||||
# -- Scan: fetch new videos for all active channels ---------------------------
|
||||
|
||||
def scan_all_channels(max_per_channel: int = 50) -> int:
    """Scan all active channels for new videos.

    Args:
        max_per_channel: Max videos fetched per channel in one scan.

    Returns:
        Total count of newly inserted videos across all channels.
    """
    total_new = 0
    for ch in get_active_channels():
        try:
            videos = fetch_channel_videos(ch["channel_id"], max_per_channel)
            # Count inserts for THIS channel so the log line is accurate;
            # the original logged the running total across all channels.
            channel_new = sum(1 for v in videos if save_video(ch["id"], v))
            total_new += channel_new
            logger.info(
                "Channel %s: fetched %d videos, %d new",
                ch["channel_name"], len(videos), channel_new,
            )
        except Exception as e:
            # Best-effort per channel: one failing channel must not stop the scan.
            logger.error("Failed to scan channel %s: %s", ch["channel_name"], e)
    return total_new
|
||||
Reference in New Issue
Block a user