Files
tasteby/backend/core/extractor.py
joungmin 3694730501 Add admin features, responsive UI, user reviews, visit stats, and channel-colored markers
- Admin: video management with Google Maps match status, manual restaurant mapping, restaurant remap on name change
- Admin: user management tab with favorites/reviews detail
- Admin: channel deletion fix for IDs with slashes
- Frontend: responsive mobile layout (map top, list bottom, 2-row header)
- Frontend: channel-colored map markers with legend
- Frontend: my reviews list, favorites toggle, visit counter overlay
- Frontend: force light mode for dark theme devices
- Backend: visit tracking (site_visits table), user reviews endpoint
- Backend: bulk transcript/extract streaming, geocode key fixes
- Nginx config for production deployment

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 14:52:20 +09:00

141 lines
4.6 KiB
Python

"""LLM-based restaurant info extraction from video transcripts.
Uses OCI GenAI (Gemini Flash) to extract structured restaurant data.
"""
from __future__ import annotations
import json
import logging
import os
import re
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
ChatDetails,
GenericChatRequest,
OnDemandServingMode,
TextContent,
UserMessage,
)
logger = logging.getLogger(__name__)
def _get_client() -> GenerativeAiInferenceClient:
    """Build an OCI GenAI inference client from the default OCI config file.

    The service endpoint is taken from ``OCI_CHAT_ENDPOINT`` when set,
    otherwise from ``OCI_GENAI_ENDPOINT`` (required).
    """
    oci_cfg = oci.config.from_file()
    chat_endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(oci_cfg, service_endpoint=chat_endpoint)
def _llm(prompt: str, max_tokens: int = 4096) -> str:
    """Run a single-turn chat completion against OCI GenAI.

    Sends *prompt* as one user message at temperature 0 and returns the
    stripped text of the first choice's first content part.
    """
    client = _get_client()
    chat_request = GenericChatRequest(
        messages=[UserMessage(content=[TextContent(text=prompt)])],
        max_tokens=max_tokens,
        temperature=0,
    )
    chat_details = ChatDetails(
        compartment_id=os.environ["OCI_COMPARTMENT_ID"],
        serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
        chat_request=chat_request,
    )
    response = client.chat(chat_details)
    first_message = response.data.chat_response.choices[0].message
    return first_message.content[0].text.strip()
def _parse_json(raw: str) -> dict | list:
raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE).strip()
raw = re.sub(r",\s*([}\]])", r"\1", raw)
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
try:
return json.JSONDecoder(strict=False).decode(raw)
except json.JSONDecodeError:
pass
# recover truncated array — extract complete objects one by one
if raw.lstrip().startswith("["):
decoder = json.JSONDecoder(strict=False)
items: list = []
idx = raw.index("[") + 1
while idx < len(raw):
while idx < len(raw) and raw[idx] in " \t\n\r,":
idx += 1
if idx >= len(raw) or raw[idx] == "]":
break
try:
obj, end = decoder.raw_decode(raw, idx)
items.append(obj)
idx = end
except json.JSONDecodeError:
# Try to recover truncated last object by closing braces
remainder = raw[idx:]
for fix in ["}", "}]", '"}', '"}' , '"}]', "null}", "null}]"]:
try:
patched = remainder.rstrip().rstrip(",") + fix
obj = json.loads(patched)
if isinstance(obj, dict) and obj.get("name"):
items.append(obj)
except (json.JSONDecodeError, ValueError):
continue
break
if items:
logger.info("Recovered %d restaurants from truncated JSON", len(items))
return items
raise ValueError(f"JSON parse failed: {raw[:80]!r}")
# Default extraction prompt. Written in Korean to match the Korean mukbang
# transcripts being processed; instructs the model to return ONLY a JSON array
# of restaurant records (name required, other fields nullable).
# Placeholders filled via str.format: {title}, {transcript}.
_EXTRACT_PROMPT = """\
다음은 유튜브 먹방/맛집 영상의 자막입니다.
이 영상에서 언급된 모든 식당 정보를 추출하세요.
규칙:
- 식당이 없으면 빈 배열 [] 반환
- 각 식당에 대해 아래 필드를 JSON 배열로 반환
- 확실하지 않은 정보는 null
- 추가 설명 없이 JSON만 반환
필드:
- name: 식당 이름 (string, 필수)
- address: 주소 또는 위치 힌트 (string | null)
- region: 지역 (예: 서울 강남, 부산 해운대) (string | null)
- cuisine_type: 음식 종류 (예: 한식, 일식, 중식, 양식, 카페) (string | null)
- price_range: 가격대 (예: 1만원대, 2-3만원) (string | null)
- foods_mentioned: 언급된 메뉴들 (string[])
- evaluation: 평가 내용 (string | null)
- guests: 함께한 게스트 (string[])
영상 제목: {title}
자막:
{transcript}
JSON 배열:"""
def extract_restaurants(title: str, transcript: str, custom_prompt: str | None = None) -> tuple[list[dict], str]:
    """Extract restaurant info from a video transcript using the LLM.

    Returns a ``(restaurants, raw_response)`` tuple; on any failure the
    result is ``([], "")`` and the error is logged (best-effort boundary).
    """
    # Keep the prompt within budget: head + tail of very long transcripts.
    if len(transcript) > 8000:
        transcript = transcript[:7000] + "\n...(중략)...\n" + transcript[-1000:]
    prompt = (custom_prompt or _EXTRACT_PROMPT).format(title=title, transcript=transcript)
    try:
        raw = _llm(prompt, max_tokens=8192)
        parsed = _parse_json(raw)
    except Exception as exc:
        logger.error("Restaurant extraction failed: %s", exc)
        return [], ""
    # Normalize: the model may answer with a single object instead of an array.
    if isinstance(parsed, list):
        return parsed, raw
    if isinstance(parsed, dict):
        return [parsed], raw
    return [], raw