Initial commit: Tasteby - YouTube restaurant map service

Backend (FastAPI + Oracle ADB), Frontend (Next.js), daemon worker.
Features: channel/video/restaurant management, semantic search,
Google OAuth, user reviews.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
joungmin
2026-03-06 13:47:19 +09:00
commit 36bec10bd0
54 changed files with 9727 additions and 0 deletions

0
backend/core/__init__.py Normal file
View File

122
backend/core/auth.py Normal file
View File

@@ -0,0 +1,122 @@
"""Authentication helpers — Google OAuth2 + JWT."""
from __future__ import annotations
import os
from datetime import datetime, timedelta, timezone
import jwt
import oracledb
from google.oauth2 import id_token as google_id_token
from google.auth.transport import requests as google_requests
from core.db import conn
# JWT signing configuration.
# NOTE(review): the fallback secret is a development-only placeholder —
# deployments MUST set JWT_SECRET, otherwise tokens are forgeable.
JWT_SECRET = os.environ.get("JWT_SECRET", "tasteby-dev-secret-change-me")
JWT_ALGORITHM = "HS256"
# Access-token lifetime; consumed by create_jwt().
JWT_EXPIRE_DAYS = 7
def verify_google_token(token: str) -> dict:
    """Verify a Google ID token and return user info.

    Returns dict with keys: sub, email, name, picture.
    Raises ValueError on invalid token (bad signature, expired, or —
    when GOOGLE_CLIENT_ID is configured — audience mismatch).
    """
    # Pass our OAuth client ID as the expected audience when configured.
    # Without an audience check, a valid Google ID token issued for ANY
    # application would be accepted here.
    audience = os.environ.get("GOOGLE_CLIENT_ID")
    info = google_id_token.verify_oauth2_token(
        token,
        google_requests.Request(),
        audience=audience,
    )
    return {
        "sub": info["sub"],
        "email": info.get("email"),
        "name": info.get("name"),
        "picture": info.get("picture"),
    }
def find_or_create_user(
    provider: str,
    provider_id: str,
    email: str | None = None,
    nickname: str | None = None,
    avatar_url: str | None = None,
) -> dict:
    """Find existing user or create new one. Returns user dict.

    Lookup key is (provider, provider_id). On a hit, last_login_at is
    refreshed and any non-None optional fields overwrite stored values
    (COALESCE keeps the old value when the argument is None). On a miss,
    a new row is inserted. Either way the whole operation runs on one
    pooled connection and is committed when the `with conn()` block exits.
    """
    # Try to find existing user
    sql_find = """
        SELECT id, provider, provider_id, email, nickname, avatar_url, created_at, last_login_at
        FROM tasteby_users
        WHERE provider = :provider AND provider_id = :provider_id
    """
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql_find, {"provider": provider, "provider_id": provider_id})
        row = cur.fetchone()
        if row:
            # Update last_login and optional fields; COALESCE means a None
            # argument leaves the stored column value untouched.
            sql_update = """
                UPDATE tasteby_users
                SET last_login_at = SYSTIMESTAMP,
                    email = COALESCE(:email, email),
                    nickname = COALESCE(:nickname, nickname),
                    avatar_url = COALESCE(:avatar_url, avatar_url)
                WHERE id = :id
            """
            cur.execute(sql_update, {
                "email": email, "nickname": nickname,
                "avatar_url": avatar_url, "id": row[0],
            })
            # Mirror the COALESCE semantics in the returned dict: prefer the
            # fresh argument, fall back to the previously stored value.
            return {
                "id": row[0],
                "provider": row[1],
                "provider_id": row[2],
                "email": email or row[3],
                "nickname": nickname or row[4],
                "avatar_url": avatar_url or row[5],
            }
        # Create new user
        sql_insert = """
            INSERT INTO tasteby_users (provider, provider_id, email, nickname, avatar_url, last_login_at)
            VALUES (:provider, :provider_id, :email, :nickname, :avatar_url, SYSTIMESTAMP)
            RETURNING id INTO :out_id
        """
        # RETURNING ... INTO needs an output bind variable to receive the id.
        out_id = cur.var(oracledb.STRING)
        cur.execute(sql_insert, {
            "provider": provider,
            "provider_id": provider_id,
            "email": email,
            "nickname": nickname,
            "avatar_url": avatar_url,
            "out_id": out_id,
        })
        # Output binds are array-valued; the single inserted id is element 0.
        new_id = out_id.getvalue()[0]
        return {
            "id": new_id,
            "provider": provider,
            "provider_id": provider_id,
            "email": email,
            "nickname": nickname,
            "avatar_url": avatar_url,
        }
def create_jwt(user: dict) -> str:
    """Build a signed JWT access token for *user*.

    Claims: sub (user id), email, nickname, iat, and exp set
    JWT_EXPIRE_DAYS in the future.
    """
    issued = datetime.now(timezone.utc)
    claims = {
        "sub": user["id"],
        "email": user.get("email"),
        "nickname": user.get("nickname"),
        "exp": issued + timedelta(days=JWT_EXPIRE_DAYS),
        "iat": issued,
    }
    return jwt.encode(claims, JWT_SECRET, algorithm=JWT_ALGORITHM)
def verify_jwt(token: str) -> dict:
    """Decode *token* and return its claims.

    Raises jwt.InvalidTokenError when the signature, expiry, or format
    is invalid.
    """
    payload = jwt.decode(token, JWT_SECRET, algorithms=[JWT_ALGORITHM])
    return payload

44
backend/core/db.py Normal file
View File

@@ -0,0 +1,44 @@
"""Oracle ADB connection pool — shared across all modules."""
from __future__ import annotations
import os
from contextlib import contextmanager
from typing import Generator, Optional
import oracledb
_pool: Optional[oracledb.ConnectionPool] = None
def _get_pool() -> oracledb.ConnectionPool:
    """Return the process-wide connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    # Pool sizing: small fixed pool suited to a single app process.
    opts: dict = {
        "user": os.environ["ORACLE_USER"],
        "password": os.environ["ORACLE_PASSWORD"],
        "dsn": os.environ["ORACLE_DSN"],
        "min": 1,
        "max": 5,
        "increment": 1,
    }
    wallet_dir = os.environ.get("ORACLE_WALLET")
    if wallet_dir:
        # Wallet directory for Autonomous DB TLS configuration.
        opts["config_dir"] = wallet_dir
    _pool = oracledb.create_pool(**opts)
    return _pool
@contextmanager
def conn() -> Generator[oracledb.Connection, None, None]:
    """Yield a pooled connection; commit on success, rollback on error.

    The connection is always returned to the pool, even when the body
    (or the commit itself) raises.
    """
    pool = _get_pool()
    connection = pool.acquire()
    try:
        yield connection
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        pool.release(connection)

128
backend/core/extractor.py Normal file
View File

@@ -0,0 +1,128 @@
"""LLM-based restaurant info extraction from video transcripts.
Uses OCI GenAI (Gemini Flash) to extract structured restaurant data.
"""
from __future__ import annotations
import json
import logging
import os
import re
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
ChatDetails,
GenericChatRequest,
OnDemandServingMode,
TextContent,
UserMessage,
)
logger = logging.getLogger(__name__)
def _get_client() -> GenerativeAiInferenceClient:
    """Create an OCI GenAI inference client from the default config file."""
    # OCI_CHAT_ENDPOINT overrides the shared OCI_GENAI_ENDPOINT when set.
    endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(oci.config.from_file(), service_endpoint=endpoint)
def _llm(prompt: str, max_tokens: int = 4096) -> str:
    """Send *prompt* to the configured OCI chat model; return its text reply.

    temperature=0 keeps extraction output as deterministic as the model allows.
    """
    chat_request = GenericChatRequest(
        messages=[UserMessage(content=[TextContent(text=prompt)])],
        max_tokens=max_tokens,
        temperature=0,
    )
    details = ChatDetails(
        compartment_id=os.environ["OCI_COMPARTMENT_ID"],
        serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
        chat_request=chat_request,
    )
    reply = _get_client().chat(details)
    return reply.data.chat_response.choices[0].message.content[0].text.strip()
def _parse_json(raw: str) -> dict | list:
raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE).strip()
raw = re.sub(r",\s*([}\]])", r"\1", raw)
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
try:
return json.JSONDecoder(strict=False).decode(raw)
except json.JSONDecodeError:
pass
# recover truncated array
if raw.lstrip().startswith("["):
decoder = json.JSONDecoder(strict=False)
items: list = []
idx = raw.index("[") + 1
while idx < len(raw):
while idx < len(raw) and raw[idx] in " \t\n\r,":
idx += 1
if idx >= len(raw) or raw[idx] == "]":
break
try:
obj, end = decoder.raw_decode(raw, idx)
items.append(obj)
idx = end
except json.JSONDecodeError:
break
if items:
return items
raise ValueError(f"JSON parse failed: {raw[:80]!r}")
# Prompt template for restaurant extraction. Written in Korean because the
# source transcripts come from Korean food/mukbang videos; it instructs the
# model to return ONLY a JSON array of restaurant objects ([] when none),
# with null for uncertain fields. Placeholders: {title} = video title,
# {transcript} = (possibly truncated) subtitle text.
_EXTRACT_PROMPT = """\
다음은 유튜브 먹방/맛집 영상의 자막입니다.
이 영상에서 언급된 모든 식당 정보를 추출하세요.
규칙:
- 식당이 없으면 빈 배열 [] 반환
- 각 식당에 대해 아래 필드를 JSON 배열로 반환
- 확실하지 않은 정보는 null
- 추가 설명 없이 JSON만 반환
필드:
- name: 식당 이름 (string, 필수)
- address: 주소 또는 위치 힌트 (string | null)
- region: 지역 (예: 서울 강남, 부산 해운대) (string | null)
- cuisine_type: 음식 종류 (예: 한식, 일식, 중식, 양식, 카페) (string | null)
- price_range: 가격대 (예: 1만원대, 2-3만원) (string | null)
- foods_mentioned: 언급된 메뉴들 (string[])
- evaluation: 평가 내용 (string | null)
- guests: 함께한 게스트 (string[])
영상 제목: {title}
자막:
{transcript}
JSON 배열:"""
def extract_restaurants(title: str, transcript: str) -> tuple[list[dict], str]:
    """Extract restaurant info from a video transcript using the LLM.

    Returns (restaurant dicts, raw LLM response text). Best-effort: any
    failure yields ([], "") so the caller can move on to other videos.
    """
    # Bound the prompt size: keep the head and tail of long transcripts.
    if len(transcript) > 8000:
        transcript = transcript[:7000] + "\n...(중략)...\n" + transcript[-1000:]
    prompt = _EXTRACT_PROMPT.format(title=title, transcript=transcript)
    try:
        raw = _llm(prompt, max_tokens=4096)
        parsed = _parse_json(raw)
    except Exception as e:
        logger.error("Restaurant extraction failed: %s", e)
        return [], ""
    if isinstance(parsed, list):
        return parsed, raw
    if isinstance(parsed, dict):
        # Model occasionally returns a single object instead of an array.
        return [parsed], raw
    return [], raw

97
backend/core/geocoding.py Normal file
View File

@@ -0,0 +1,97 @@
"""Google Maps Geocoding + Place Search for restaurant location lookup."""
from __future__ import annotations
import logging
import os
import httpx
logger = logging.getLogger(__name__)
def _api_key() -> str:
return os.environ["GOOGLE_MAPS_API_KEY"]
def geocode_restaurant(name: str, address: str | None = None, region: str | None = None) -> dict | None:
    """Look up restaurant coordinates using Google Maps.

    Prefers the Places Text Search API (better at matching business names)
    and falls back to the Geocoding API.

    Returns dict with: latitude, longitude, formatted_address,
    google_place_id — or None if nothing matched.
    """
    # Query is "name address" when an address hint exists, else "name region".
    hint = address or region
    query = f"{name} {hint}" if hint else name
    hit = _places_text_search(query)
    if hit is not None:
        return hit
    return _geocode(query)
def _places_text_search(query: str) -> dict | None:
    """Search Google Places Text Search for *query*.

    Returns a location dict on success; None on API error or no results
    (failures are logged, never raised).
    """
    endpoint = "https://maps.googleapis.com/maps/api/place/textsearch/json"
    request_params = {
        "query": query,
        "key": _api_key(),
        "language": "ko",
        "type": "restaurant",
    }
    try:
        resp = httpx.get(endpoint, params=request_params, timeout=10)
        resp.raise_for_status()
        payload = resp.json()
        if payload.get("status") == "OK" and payload.get("results"):
            top = payload["results"][0]
            location = top["geometry"]["location"]
            return {
                "latitude": location["lat"],
                "longitude": location["lng"],
                "formatted_address": top.get("formatted_address", ""),
                "google_place_id": top.get("place_id", ""),
            }
    except Exception as e:
        logger.warning("Places text search failed for '%s': %s", query, e)
    return None
def _geocode(query: str) -> dict | None:
    """Geocode an address string via the Geocoding API.

    Returns a location dict (google_place_id is always "" here) or None
    on error / no match; failures are logged, never raised.
    """
    endpoint = "https://maps.googleapis.com/maps/api/geocode/json"
    request_params = {
        "address": query,
        "key": _api_key(),
        "language": "ko",
    }
    try:
        resp = httpx.get(endpoint, params=request_params, timeout=10)
        resp.raise_for_status()
        payload = resp.json()
        if payload.get("status") == "OK" and payload.get("results"):
            top = payload["results"][0]
            location = top["geometry"]["location"]
            return {
                "latitude": location["lat"],
                "longitude": location["lng"],
                "formatted_address": top.get("formatted_address", ""),
                "google_place_id": "",
            }
    except Exception as e:
        logger.warning("Geocoding failed for '%s': %s", query, e)
    return None

134
backend/core/pipeline.py Normal file
View File

@@ -0,0 +1,134 @@
"""Data pipeline: process pending videos end-to-end.
For each pending video:
1. Fetch transcript
2. Extract restaurant info via LLM
3. Geocode each restaurant
4. Save to DB + generate vector embeddings
"""
from __future__ import annotations
import json
import logging
from core import youtube, extractor, geocoding, restaurant, vector
logger = logging.getLogger(__name__)
def process_video(video: dict) -> int:
    """Process a single pending video. Returns number of restaurants found.

    Pipeline per restaurant: transcript -> LLM extraction -> geocode ->
    upsert + video link + vector embedding. The video row's status field
    tracks progress: 'processing' while working, 'done' on success or
    when there is nothing to extract, 'error' on unexpected failure.
    """
    video_db_id = video["id"]      # DB primary key of the videos row
    video_id = video["video_id"]   # YouTube video id (transcript lookup)
    title = video["title"]
    logger.info("Processing video: %s (%s)", title, video_id)
    youtube.update_video_status(video_db_id, "processing")
    try:
        # 1. Transcript — without one there is nothing to extract, so the
        # video is marked done immediately.
        transcript = youtube.get_transcript(video_id)
        if not transcript:
            logger.warning("No transcript for %s, marking done", video_id)
            youtube.update_video_status(video_db_id, "done")
            return 0
        # Persist the transcript while the row is still 'processing'.
        youtube.update_video_status(video_db_id, "processing", transcript)
        # 2. LLM extraction
        restaurants, llm_raw = extractor.extract_restaurants(title, transcript)
        if not restaurants:
            logger.info("No restaurants found in %s", video_id)
            youtube.update_video_status(video_db_id, "done", llm_raw=llm_raw)
            return 0
        # 3-4. Geocode + save each restaurant
        count = 0
        for rest_data in restaurants:
            name = rest_data.get("name")
            if not name:
                # The LLM sometimes emits partial objects; skip nameless ones.
                continue
            # Geocode — failure is tolerated: the restaurant is still saved,
            # just without coordinates.
            geo = geocoding.geocode_restaurant(
                name,
                address=rest_data.get("address"),
                region=rest_data.get("region"),
            )
            lat = geo["latitude"] if geo else None
            lng = geo["longitude"] if geo else None
            addr = geo["formatted_address"] if geo else rest_data.get("address")
            place_id = geo["google_place_id"] if geo else None
            # Upsert restaurant
            rest_id = restaurant.upsert(
                name=name,
                address=addr,
                region=rest_data.get("region"),
                latitude=lat,
                longitude=lng,
                cuisine_type=rest_data.get("cuisine_type"),
                price_range=rest_data.get("price_range"),
                google_place_id=place_id,
            )
            # Link video <-> restaurant
            restaurant.link_video_restaurant(
                video_db_id=video_db_id,
                restaurant_id=rest_id,
                foods=rest_data.get("foods_mentioned"),
                evaluation=rest_data.get("evaluation"),
                guests=rest_data.get("guests"),
            )
            # Vector embeddings for semantic search
            chunks = _build_chunks(name, rest_data, title)
            if chunks:
                vector.save_restaurant_vectors(rest_id, chunks)
            count += 1
            logger.info("Saved restaurant: %s (geocoded=%s)", name, bool(geo))
        youtube.update_video_status(video_db_id, "done", llm_raw=llm_raw)
        logger.info("Video %s done: %d restaurants", video_id, count)
        return count
    except Exception as e:
        # Any unexpected failure marks the row 'error' for inspection/retry;
        # the caller moves on to the next pending video.
        logger.error("Pipeline error for %s: %s", video_id, e, exc_info=True)
        youtube.update_video_status(video_db_id, "error")
        return 0
def _build_chunks(name: str, data: dict, video_title: str) -> list[str]:
"""Build text chunks for vector embedding."""
parts = [f"식당: {name}"]
if data.get("region"):
parts.append(f"지역: {data['region']}")
if data.get("cuisine_type"):
parts.append(f"음식 종류: {data['cuisine_type']}")
if data.get("foods_mentioned"):
foods = data["foods_mentioned"]
if isinstance(foods, list):
parts.append(f"메뉴: {', '.join(foods)}")
if data.get("evaluation"):
parts.append(f"평가: {data['evaluation']}")
if data.get("price_range"):
parts.append(f"가격대: {data['price_range']}")
parts.append(f"영상: {video_title}")
return ["\n".join(parts)]
def process_pending(limit: int = 5) -> int:
    """Process up to *limit* pending videos; return total restaurants found."""
    pending = youtube.get_pending_videos(limit)
    if not pending:
        logger.info("No pending videos")
        return 0
    return sum(process_video(item) for item in pending)

205
backend/core/restaurant.py Normal file
View File

@@ -0,0 +1,205 @@
"""Restaurant DB operations — save extracted data, link to videos."""
from __future__ import annotations
import json
import oracledb
from core.db import conn
def find_by_name(name: str) -> dict | None:
    """Return the restaurant row exactly matching *name*, or None."""
    sql = "SELECT id, name, address, region, latitude, longitude FROM restaurants WHERE name = :n"
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"n": name})
        row = cur.fetchone()
    if row is None:
        return None
    keys = ("id", "name", "address", "region", "latitude", "longitude")
    return dict(zip(keys, row))
def upsert(
    name: str,
    address: str | None = None,
    region: str | None = None,
    latitude: float | None = None,
    longitude: float | None = None,
    cuisine_type: str | None = None,
    price_range: str | None = None,
    google_place_id: str | None = None,
) -> str:
    """Insert a restaurant, or merge fields into the existing same-name row.

    COALESCE semantics: a None argument keeps the stored column value.
    Returns the restaurant row id in either case.
    """
    existing = find_by_name(name)
    if existing:
        update_sql = """
        UPDATE restaurants
        SET address = COALESCE(:addr, address),
            region = COALESCE(:reg, region),
            latitude = COALESCE(:lat, latitude),
            longitude = COALESCE(:lng, longitude),
            cuisine_type = COALESCE(:cuisine, cuisine_type),
            price_range = COALESCE(:price, price_range),
            google_place_id = COALESCE(:gid, google_place_id),
            updated_at = SYSTIMESTAMP
        WHERE id = :id
        """
        binds = {
            "addr": address, "reg": region,
            "lat": latitude, "lng": longitude,
            "cuisine": cuisine_type, "price": price_range,
            "gid": google_place_id, "id": existing["id"],
        }
        with conn() as c:
            c.cursor().execute(update_sql, binds)
        return existing["id"]
    insert_sql = """
    INSERT INTO restaurants (name, address, region, latitude, longitude,
                             cuisine_type, price_range, google_place_id)
    VALUES (:name, :addr, :reg, :lat, :lng, :cuisine, :price, :gid)
    RETURNING id INTO :out_id
    """
    with conn() as c:
        cur = c.cursor()
        new_id = cur.var(oracledb.STRING)
        cur.execute(insert_sql, {
            "name": name, "addr": address, "reg": region,
            "lat": latitude, "lng": longitude,
            "cuisine": cuisine_type, "price": price_range,
            "gid": google_place_id, "out_id": new_id,
        })
        return new_id.getvalue()[0]
def link_video_restaurant(
    video_db_id: str,
    restaurant_id: str,
    foods: list[str] | None = None,
    evaluation: str | None = None,
    guests: list[str] | None = None,
    citation: str | None = None,
) -> str | None:
    """Create a video <-> restaurant mapping row.

    List/object payloads are stored as JSON (ensure_ascii=False keeps
    Korean text readable). Returns the new row id, or None when this
    (video, restaurant) pair already exists.
    """
    sql = """
    INSERT INTO video_restaurants
        (video_id, restaurant_id, foods_mentioned, evaluation, guests, citation_text)
    VALUES (:vid, :rid, :foods, :eval, :guests, :cite)
    RETURNING id INTO :out_id
    """
    with conn() as c:
        cur = c.cursor()
        new_id = cur.var(oracledb.STRING)
        binds = {
            "vid": video_db_id,
            "rid": restaurant_id,
            "foods": json.dumps(foods or [], ensure_ascii=False),
            "eval": json.dumps({"text": evaluation} if evaluation else {}, ensure_ascii=False),
            "guests": json.dumps(guests or [], ensure_ascii=False),
            "cite": citation,
            "out_id": new_id,
        }
        try:
            cur.execute(sql, binds)
        except Exception as e:
            # Unique constraint on (video, restaurant): treat as a no-op.
            if "UQ_VR_VIDEO_REST" in str(e).upper():
                return None
            raise
        return new_id.getvalue()[0]
def get_all(
    limit: int = 100,
    offset: int = 0,
    cuisine: str | None = None,
    region: str | None = None,
) -> list[dict]:
    """List geocoded restaurants, most recently updated first.

    Optional filters: exact cuisine_type match, substring region match.
    Only rows with coordinates are returned (map display requirement).
    """
    clauses = ["latitude IS NOT NULL"]
    binds: dict = {"lim": limit, "off": offset}
    if cuisine:
        clauses.append("cuisine_type = :cuisine")
        binds["cuisine"] = cuisine
    if region:
        clauses.append("region LIKE :region")
        binds["region"] = f"%{region}%"
    # Filter clauses are fixed strings; all user values go through binds.
    sql = f"""
    SELECT id, name, address, region, latitude, longitude,
           cuisine_type, price_range, google_place_id
    FROM restaurants
    WHERE {" AND ".join(clauses)}
    ORDER BY updated_at DESC
    OFFSET :off ROWS FETCH NEXT :lim ROWS ONLY
    """
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, binds)
        columns = [d[0].lower() for d in cur.description]
        return [dict(zip(columns, record)) for record in cur.fetchall()]
def get_by_id(restaurant_id: str) -> dict | None:
    """Fetch one restaurant as a dict keyed by lowercase column name."""
    sql = """
    SELECT r.id, r.name, r.address, r.region, r.latitude, r.longitude,
           r.cuisine_type, r.price_range, r.phone, r.website, r.google_place_id
    FROM restaurants r
    WHERE r.id = :id
    """
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"id": restaurant_id})
        record = cur.fetchone()
        if record is None:
            return None
        columns = [d[0].lower() for d in cur.description]
        return dict(zip(columns, record))
def get_video_links(restaurant_id: str) -> list[dict]:
    """Return every video appearance of a restaurant, newest first."""
    sql = """
    SELECT v.video_id, v.title, v.url, v.published_at,
           vr.foods_mentioned, vr.evaluation, vr.guests
    FROM video_restaurants vr
    JOIN videos v ON v.id = vr.video_id
    WHERE vr.restaurant_id = :rid
    ORDER BY v.published_at DESC
    """

    def _materialize(value):
        # CLOB columns may arrive as LOB handles; read them into str.
        return value.read() if hasattr(value, "read") else value

    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"rid": restaurant_id})
        links = []
        for vid, title, url, published, foods, evaluation, guests in cur.fetchall():
            links.append({
                "video_id": vid,
                "title": title,
                "url": url,
                "published_at": published.isoformat() if published else None,
                "foods_mentioned": _parse_json_field(_materialize(foods), []),
                "evaluation": _parse_json_field(_materialize(evaluation), {}),
                "guests": _parse_json_field(_materialize(guests), []),
            })
        return links
def _parse_json_field(val, default):
"""Parse a JSON field that may be a string, already-parsed object, or None."""
if val is None:
return default
if isinstance(val, (list, dict)):
return val
if isinstance(val, str):
try:
return json.loads(val)
except (json.JSONDecodeError, ValueError):
return default
return default

189
backend/core/review.py Normal file
View File

@@ -0,0 +1,189 @@
"""User review DB operations."""
from __future__ import annotations
from datetime import date
import oracledb
from core.db import conn
def create_review(
    user_id: str,
    restaurant_id: str,
    rating: float,
    review_text: str | None = None,
    visited_at: date | None = None,
) -> dict:
    """Create a new review. Returns the created review dict.

    The INSERT is committed (by exiting the conn() context) before the
    row is re-read: get_review_by_id() acquires its own pooled connection,
    which cannot see another session's uncommitted data.
    """
    sql = """
    INSERT INTO user_reviews (user_id, restaurant_id, rating, review_text, visited_at)
    VALUES (:user_id, :restaurant_id, :rating, :review_text, :visited_at)
    RETURNING id INTO :out_id
    """
    with conn() as c:
        cur = c.cursor()
        out_id = cur.var(oracledb.STRING)
        cur.execute(sql, {
            "user_id": user_id,
            "restaurant_id": restaurant_id,
            "rating": rating,
            "review_text": review_text,
            "visited_at": visited_at,
            "out_id": out_id,
        })
        new_id = out_id.getvalue()[0]
    # Now committed — safe to fetch on a (possibly different) connection.
    return get_review_by_id(new_id)
def update_review(
    review_id: str,
    user_id: str,
    rating: float | None = None,
    review_text: str | None = None,
    visited_at: date | None = None,
) -> dict | None:
    """Update an existing review. Only the owner can update.

    Fields left as None keep their stored values (COALESCE). Returns the
    updated review dict, or None if not found / not owned by *user_id*.
    (Return annotation fixed to `dict | None` — None was already a
    possible result.)
    """
    sql = """
    UPDATE user_reviews
    SET rating = COALESCE(:rating, rating),
        review_text = COALESCE(:review_text, review_text),
        visited_at = COALESCE(:visited_at, visited_at),
        updated_at = SYSTIMESTAMP
    WHERE id = :id AND user_id = :user_id
    """
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {
            "rating": rating,
            "review_text": review_text,
            "visited_at": visited_at,
            "id": review_id,
            "user_id": user_id,
        })
        # rowcount 0 means no such review or a different owner.
        if cur.rowcount == 0:
            return None
    # Re-read after the with-block so the UPDATE is committed first:
    # get_review_by_id() may use a different pooled connection, which
    # would not see this session's uncommitted change.
    return get_review_by_id(review_id)
def delete_review(review_id: str, user_id: str) -> bool:
    """Delete a review owned by *user_id*. True when a row was removed."""
    sql = "DELETE FROM user_reviews WHERE id = :id AND user_id = :user_id"
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"id": review_id, "user_id": user_id})
        deleted = cur.rowcount > 0
    return deleted
def get_review_by_id(review_id: str) -> dict | None:
    """Fetch one review (with author nickname/avatar), or None."""
    sql = """
    SELECT r.id, r.user_id, r.restaurant_id, r.rating, r.review_text,
           r.visited_at, r.created_at, r.updated_at,
           u.nickname, u.avatar_url
    FROM user_reviews r
    JOIN tasteby_users u ON u.id = r.user_id
    WHERE r.id = :id
    """
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"id": review_id})
        record = cur.fetchone()
    return _row_to_dict(record) if record else None
def get_reviews_for_restaurant(
    restaurant_id: str,
    limit: int = 20,
    offset: int = 0,
) -> list[dict]:
    """List a restaurant's reviews (newest first) with author info."""
    sql = """
    SELECT r.id, r.user_id, r.restaurant_id, r.rating, r.review_text,
           r.visited_at, r.created_at, r.updated_at,
           u.nickname, u.avatar_url
    FROM user_reviews r
    JOIN tasteby_users u ON u.id = r.user_id
    WHERE r.restaurant_id = :restaurant_id
    ORDER BY r.created_at DESC
    OFFSET :off ROWS FETCH NEXT :lim ROWS ONLY
    """
    binds = {"restaurant_id": restaurant_id, "off": offset, "lim": limit}
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, binds)
        return [_row_to_dict(record) for record in cur.fetchall()]
def get_user_reviews(
    user_id: str,
    limit: int = 20,
    offset: int = 0,
) -> list[dict]:
    """List one user's reviews, newest first."""
    sql = """
    SELECT r.id, r.user_id, r.restaurant_id, r.rating, r.review_text,
           r.visited_at, r.created_at, r.updated_at,
           u.nickname, u.avatar_url
    FROM user_reviews r
    JOIN tasteby_users u ON u.id = r.user_id
    WHERE r.user_id = :user_id
    ORDER BY r.created_at DESC
    OFFSET :off ROWS FETCH NEXT :lim ROWS ONLY
    """
    binds = {"user_id": user_id, "off": offset, "lim": limit}
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, binds)
        return [_row_to_dict(record) for record in cur.fetchall()]
def get_restaurant_avg_rating(restaurant_id: str) -> dict:
    """Get average rating and review count for a restaurant.

    Returns {"avg_rating": float | None, "review_count": int};
    avg_rating is None only when the restaurant has no reviews.
    """
    sql = """
    SELECT ROUND(AVG(rating), 1) AS avg_rating, COUNT(*) AS review_count
    FROM user_reviews
    WHERE restaurant_id = :restaurant_id
    """
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"restaurant_id": restaurant_id})
        row = cur.fetchone()
        return {
            # Must test `is not None`, not truthiness: an average of
            # exactly 0 would otherwise be misreported as "no rating".
            "avg_rating": float(row[0]) if row[0] is not None else None,
            "review_count": int(row[1]),
        }
def _row_to_dict(row) -> dict:
"""Convert a review query row to a dict."""
review_text = row[4]
if hasattr(review_text, "read"):
review_text = review_text.read()
return {
"id": row[0],
"user_id": row[1],
"restaurant_id": row[2],
"rating": float(row[3]),
"review_text": review_text,
"visited_at": row[5].isoformat() if row[5] else None,
"created_at": row[6].isoformat() if row[6] else None,
"updated_at": row[7].isoformat() if row[7] else None,
"user_nickname": row[8],
"user_avatar_url": row[9],
}

97
backend/core/vector.py Normal file
View File

@@ -0,0 +1,97 @@
"""Vector embedding generation and storage for restaurant semantic search."""
from __future__ import annotations
import array
import os
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
EmbedTextDetails,
OnDemandServingMode,
)
from core.db import conn
def _embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed *texts* with the configured OCI embedding model."""
    client = GenerativeAiInferenceClient(
        oci.config.from_file(),
        service_endpoint=os.environ["OCI_GENAI_ENDPOINT"],
    )
    details = EmbedTextDetails(
        inputs=texts,
        serving_mode=OnDemandServingMode(
            model_id=os.environ.get("OCI_EMBED_MODEL_ID", "cohere.embed-v4.0"),
        ),
        compartment_id=os.environ["OCI_COMPARTMENT_ID"],
        # These chunks are stored documents, not search queries.
        input_type="SEARCH_DOCUMENT",
    )
    return client.embed_text(details).data.embeddings
def _to_vec(embedding: list[float]) -> array.array:
return array.array("f", embedding)
def save_restaurant_vectors(restaurant_id: str, chunks: list[str]) -> list[str]:
    """Embed *chunks* and insert one restaurant_vectors row per chunk.

    Returns the list of inserted row IDs ([] for empty input).
    """
    if not chunks:
        return []
    embeddings = _embed_texts(chunks)
    sql = """
    INSERT INTO restaurant_vectors (restaurant_id, chunk_text, embedding)
    VALUES (:rid, :chunk, :emb)
    RETURNING id INTO :out_id
    """
    import oracledb
    row_ids: list[str] = []
    with conn() as c:
        cur = c.cursor()
        for text, embedding in zip(chunks, embeddings):
            out_id = cur.var(oracledb.STRING)
            cur.execute(sql, {
                "rid": restaurant_id,
                "chunk": text,
                "emb": _to_vec(embedding),
                "out_id": out_id,
            })
            row_ids.append(out_id.getvalue()[0])
    return row_ids
def search_similar(query: str, top_k: int = 10) -> list[dict]:
    """Semantic search: find restaurant chunks nearest to *query*.

    Returns up to top_k dicts: restaurant_id, chunk_text, distance
    (cosine distance — smaller is more similar).
    """
    query_vec = _to_vec(_embed_texts([query])[0])
    sql = """
    SELECT rv.restaurant_id, rv.chunk_text,
           VECTOR_DISTANCE(rv.embedding, :qvec, COSINE) AS dist
    FROM restaurant_vectors rv
    ORDER BY dist
    FETCH FIRST :k ROWS ONLY
    """
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"qvec": query_vec, "k": top_k})
        hits = []
        for rid, chunk, dist in cur.fetchall():
            hits.append({
                "restaurant_id": rid,
                # CLOB handles must be read into str.
                "chunk_text": chunk.read() if hasattr(chunk, "read") else chunk,
                "distance": dist,
            })
        return hits

221
backend/core/youtube.py Normal file
View File

@@ -0,0 +1,221 @@
"""YouTube channel scanner + transcript extraction.
Uses YouTube Data API v3 for channel video listing,
youtube-transcript-api for transcript extraction.
"""
from __future__ import annotations
import logging
import os
import re
from datetime import datetime
import httpx
from youtube_transcript_api import YouTubeTranscriptApi
from core.db import conn
logger = logging.getLogger(__name__)
def _api_key() -> str:
return os.environ["YOUTUBE_DATA_API_KEY"]
def extract_video_id(url: str) -> str:
    """Extract the video ID from a YouTube URL.

    Supports watch URLs (?v=...), youtu.be short links, and — added for
    robustness — /shorts/, /embed/ and /live/ paths.

    Raises ValueError when no video ID can be found.
    """
    match = re.search(
        r"(?:v=|youtu\.be/|/shorts/|/embed/|/live/)([^&?/\s]+)", url
    )
    if not match:
        raise ValueError(f"Cannot extract video ID from URL: {url}")
    return match.group(1)
# -- Channel operations -------------------------------------------------------
def add_channel(channel_id: str, channel_name: str) -> str:
    """Register a YouTube channel; return the new DB row id."""
    import oracledb
    sql = """
    INSERT INTO channels (channel_id, channel_name, channel_url)
    VALUES (:cid, :cname, :curl)
    RETURNING id INTO :out_id
    """
    with conn() as c:
        cur = c.cursor()
        new_id = cur.var(oracledb.STRING)
        cur.execute(sql, {
            "cid": channel_id,
            "cname": channel_name,
            "curl": f"https://www.youtube.com/channel/{channel_id}",
            "out_id": new_id,
        })
        return new_id.getvalue()[0]
def get_active_channels() -> list[dict]:
    """Return all channels flagged active (is_active = 1)."""
    sql = "SELECT id, channel_id, channel_name FROM channels WHERE is_active = 1"
    fields = ("id", "channel_id", "channel_name")
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql)
        return [dict(zip(fields, record)) for record in cur.fetchall()]
# -- Video listing via YouTube Data API v3 ------------------------------------
def fetch_channel_videos(
    channel_id: str,
    max_results: int = 50,
    published_after: str | None = None,
) -> list[dict]:
    """Fetch video list from a YouTube channel via Data API v3.

    Pages through the search endpoint (max 50 items per page) until
    max_results videos are collected or no more pages remain.

    Args:
        channel_id: YouTube channel id.
        max_results: cap on the number of videos returned.
        published_after: optional timestamp filter passed straight to the
            API's publishedAfter parameter.

    Returns list of dicts: video_id, title, published_at, url.
    """
    params: dict = {
        "key": _api_key(),
        "channelId": channel_id,
        "part": "snippet",
        "order": "date",
        "maxResults": min(max_results, 50),  # API page-size ceiling is 50
        "type": "video",
    }
    if published_after:
        params["publishedAfter"] = published_after
    videos: list[dict] = []
    next_page = None
    while True:
        if next_page:
            # Same params dict reused across pages with the continuation token.
            params["pageToken"] = next_page
        r = httpx.get(
            "https://www.googleapis.com/youtube/v3/search",
            params=params,
            timeout=15,
        )
        r.raise_for_status()
        data = r.json()
        for item in data.get("items", []):
            snippet = item["snippet"]
            vid = item["id"]["videoId"]
            videos.append({
                "video_id": vid,
                "title": snippet["title"],
                "published_at": snippet["publishedAt"],
                "url": f"https://www.youtube.com/watch?v={vid}",
            })
        next_page = data.get("nextPageToken")
        if not next_page or len(videos) >= max_results:
            break
    # The final page may overshoot max_results; trim the excess.
    return videos[:max_results]
# -- Transcript extraction ----------------------------------------------------
def get_transcript(video_id: str) -> str | None:
    """Return the video's transcript as one space-joined string.

    Prefers Korean, falls back to English; returns None (after logging)
    when no transcript is available or fetching fails.
    """
    try:
        segments = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
        return " ".join(segment.text for segment in segments)
    except Exception as e:
        logger.warning("Transcript unavailable for %s: %s", video_id, e)
        return None
# -- DB operations for videos -------------------------------------------------
def save_video(channel_db_id: str, video: dict) -> str | None:
    """Insert a video row with status 'pending'.

    Returns the new row id, or None when the video already exists
    (unique constraint on video_id).
    """
    import oracledb
    sql = """
    INSERT INTO videos (channel_id, video_id, title, url, published_at, status)
    VALUES (:ch_id, :vid, :title, :url, :pub_at, 'pending')
    RETURNING id INTO :out_id
    """
    with conn() as c:
        cur = c.cursor()
        new_id = cur.var(oracledb.STRING)
        try:
            published = video.get("published_at")
            pub_at = None
            if published:
                # API returns e.g. 2024-01-01T00:00:00Z; make it ISO-parseable.
                pub_at = datetime.fromisoformat(published.replace("Z", "+00:00"))
            cur.execute(sql, {
                "ch_id": channel_db_id,
                "vid": video["video_id"],
                "title": video["title"],
                "url": video["url"],
                "pub_at": pub_at,
                "out_id": new_id,
            })
            return new_id.getvalue()[0]
        except Exception as e:
            if "UQ_VIDEOS_VID" in str(e).upper():
                return None  # duplicate
            raise
def get_pending_videos(limit: int = 10) -> list[dict]:
    """Return up to *limit* oldest videos still awaiting processing."""
    sql = """
    SELECT id, video_id, title, url
    FROM videos
    WHERE status = 'pending'
    ORDER BY created_at
    FETCH FIRST :n ROWS ONLY
    """
    fields = ("id", "video_id", "title", "url")
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"n": limit})
        return [dict(zip(fields, record)) for record in cur.fetchall()]
def update_video_status(
    video_db_id: str,
    status: str,
    transcript: str | None = None,
    llm_raw: str | None = None,
) -> None:
    """Set a video's status (and optionally its transcript / raw LLM output).

    processed_at is refreshed on every call. Column names are fixed
    strings; all values are bound parameters.
    """
    assignments = ["status = :st", "processed_at = SYSTIMESTAMP"]
    binds: dict = {"st": status, "vid": video_db_id}
    if transcript:
        assignments.append("transcript_text = :txt")
        binds["txt"] = transcript
    if llm_raw:
        assignments.append("llm_raw_response = :llm_resp")
        binds["llm_resp"] = llm_raw
    sql = f"UPDATE videos SET {', '.join(assignments)} WHERE id = :vid"
    with conn() as c:
        c.cursor().execute(sql, binds)
# -- Scan: fetch new videos for all active channels ---------------------------
def scan_all_channels(max_per_channel: int = 50) -> int:
    """Scan all active channels for new videos.

    Returns the total count of newly inserted videos. A failure on one
    channel is logged and does not stop the remaining channels.
    """
    total_new = 0
    for ch in get_active_channels():
        try:
            videos = fetch_channel_videos(ch["channel_id"], max_per_channel)
            # Count inserts for THIS channel; save_video returns None for
            # duplicates. (Previously the cumulative total was logged as
            # the per-channel count.)
            new_here = sum(1 for v in videos if save_video(ch["id"], v))
            total_new += new_here
            logger.info(
                "Channel %s: fetched %d videos, %d new",
                ch["channel_name"], len(videos), new_here,
            )
        except Exception as e:
            logger.error("Failed to scan channel %s: %s", ch["channel_name"], e)
    return total_new