Files
tasteby/backend/core/extractor.py
joungmin 54d21afd52 Add food tag remap feature and show menu tags in restaurant cards
- LLM extraction prompt: foods_mentioned max 10, Korean only, prioritized
- New /remap-foods API endpoint for bulk LLM re-extraction
- Admin UI: "메뉴태그 재생성" button with SSE progress bar
- Backend: attach foods_mentioned to restaurant list API response
- Restaurant cards: display food tags (orange, max 5 visible)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 11:21:05 +09:00

147 lines
5.2 KiB
Python

"""LLM-based restaurant info extraction from video transcripts.
Uses OCI GenAI (Gemini Flash) to extract structured restaurant data.
"""
from __future__ import annotations
import json
import logging
import os
import re
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
ChatDetails,
GenericChatRequest,
OnDemandServingMode,
TextContent,
UserMessage,
)
from core.cuisine import CUISINE_LIST_TEXT
logger = logging.getLogger(__name__)
def _get_client() -> GenerativeAiInferenceClient:
    """Construct an OCI GenAI inference client.

    Credentials are read from the default OCI config file. The service
    endpoint comes from OCI_CHAT_ENDPOINT, falling back to
    OCI_GENAI_ENDPOINT (which must then be set, or KeyError is raised).
    """
    cfg = oci.config.from_file()
    chat_endpoint = os.environ.get("OCI_CHAT_ENDPOINT")
    if not chat_endpoint:
        chat_endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(cfg, service_endpoint=chat_endpoint)
def _llm(prompt: str, max_tokens: int = 4096) -> str:
    """Send a single-turn user prompt to the configured chat model.

    Uses temperature 0 for deterministic extraction and returns the
    stripped text of the first choice's first content part.
    """
    chat_request = GenericChatRequest(
        messages=[UserMessage(content=[TextContent(text=prompt)])],
        max_tokens=max_tokens,
        temperature=0,
    )
    details = ChatDetails(
        compartment_id=os.environ["OCI_COMPARTMENT_ID"],
        serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
        chat_request=chat_request,
    )
    response = _get_client().chat(details)
    first_message = response.data.chat_response.choices[0].message
    return first_message.content[0].text.strip()
def _parse_json(raw: str) -> dict | list:
raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE).strip()
raw = re.sub(r",\s*([}\]])", r"\1", raw)
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
try:
return json.JSONDecoder(strict=False).decode(raw)
except json.JSONDecodeError:
pass
# recover truncated array — extract complete objects one by one
if raw.lstrip().startswith("["):
decoder = json.JSONDecoder(strict=False)
items: list = []
idx = raw.index("[") + 1
while idx < len(raw):
while idx < len(raw) and raw[idx] in " \t\n\r,":
idx += 1
if idx >= len(raw) or raw[idx] == "]":
break
try:
obj, end = decoder.raw_decode(raw, idx)
items.append(obj)
idx = end
except json.JSONDecodeError:
# Try to recover truncated last object by closing braces
remainder = raw[idx:]
for fix in ["}", "}]", '"}', '"}' , '"}]', "null}", "null}]"]:
try:
patched = remainder.rstrip().rstrip(",") + fix
obj = json.loads(patched)
if isinstance(obj, dict) and obj.get("name"):
items.append(obj)
except (json.JSONDecodeError, ValueError):
continue
break
if items:
logger.info("Recovered %d restaurants from truncated JSON", len(items))
return items
raise ValueError(f"JSON parse failed: {raw[:80]!r}")
_EXTRACT_PROMPT = """\
다음은 유튜브 먹방/맛집 영상의 자막입니다.
이 영상에서 언급된 모든 식당 정보를 추출하세요.
규칙:
- 식당이 없으면 빈 배열 [] 반환
- 각 식당에 대해 아래 필드를 JSON 배열로 반환
- 확실하지 않은 정보는 null
- 추가 설명 없이 JSON만 반환
필드:
- name: 식당 이름 (string, 필수)
- address: 주소 또는 위치 힌트 (string | null)
- region: 지역을 "나라|시/도|구/군/시" 파이프(|) 구분 형식으로 작성 (string | null)
- 한국 예시: "한국|서울|강남구", "한국|부산|해운대구", "한국|제주", "한국|강원|강릉시"
- 해외 예시: "일본|도쿄", "일본|오사카", "싱가포르", "미국|뉴욕", "태국|방콕"
- 나라는 한글로, 해외 도시도 한글로 표기
- cuisine_type: 아래 목록에서 가장 적합한 것을 선택 (string, 필수). 반드시 아래 목록 중 하나를 사용:
{cuisine_types}
- price_range: 가격대 (예: 1만원대, 2-3만원) (string | null)
- foods_mentioned: 언급된 대표 메뉴 (string[], 최대 10개, 우선순위 높은 순, 반드시 한글로 작성)
- evaluation: 평가 내용 (string | null)
- guests: 함께한 게스트 (string[])
영상 제목: {{title}}
자막:
{{transcript}}
JSON 배열:""".format(cuisine_types=CUISINE_LIST_TEXT)
def extract_restaurants(title: str, transcript: str, custom_prompt: str | None = None) -> tuple[list[dict], str]:
    """Extract restaurant info from a video transcript using the LLM.

    Args:
        title: Video title, interpolated into the prompt.
        transcript: Transcript text. Very long transcripts are truncated to
            the first 7000 + last 1000 characters to fit the context window.
        custom_prompt: Optional template overriding _EXTRACT_PROMPT; must
            contain {title} and {transcript} placeholders. A malformed
            template raises from .format() (not swallowed), so config errors
            surface to the caller.

    Returns:
        (restaurants, raw): a possibly-empty list of restaurant dicts and the
        raw LLM response text. LLM/parse failures are logged and yield
        ([], "") rather than raising.
    """
    # Truncate very long transcripts, keeping the head and the tail.
    if len(transcript) > 8000:
        transcript = transcript[:7000] + "\n...(중략)...\n" + transcript[-1000:]
    template = custom_prompt or _EXTRACT_PROMPT
    prompt = template.format(title=title, transcript=transcript)
    try:
        raw = _llm(prompt, max_tokens=8192)
        result = _parse_json(raw)
    except Exception:
        # Best-effort boundary: log with full traceback, return empty result.
        logger.exception("Restaurant extraction failed")
        return [], ""
    # The prompt asks for an array, but the model may emit a bare object.
    if isinstance(result, list):
        return result, raw
    if isinstance(result, dict):
        return [result], raw
    return [], raw