"""LLM-based content enrichment via OCI GenAI Gemini Flash."""

import json
import os
import re

import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
    ChatDetails,
    GenericChatRequest,
    OnDemandServingMode,
    TextContent,
    UserMessage,
)

# Prompt template; ``{content_type}``, ``{url}`` and ``{text}`` are filled in by
# ``enrich`` via ``str.format``.
# NOTE(review): line breaks inside this template were reconstructed from a
# whitespace-collapsed copy of the file -- confirm against the original layout.
_PROMPT = """\
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:

- "title": concise descriptive title for this content (string)
- "summary": 3-5 sentence summary capturing key insights, written in English (string)
- "summary_ko": the same summary translated into Korean (string)
- "tags": list of 3-7 relevant keywords or topics (string[])
- "author": author or creator name, or null if not found (string | null)
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
- "language": primary language of the content, ISO 639-1 code, e.g. "en", "ko", "ja" (string)

Content type: {content_type}
Source URL: {url}

Content:
{text}

Return only the JSON object, no markdown, no explanation."""


def _get_client() -> GenerativeAiInferenceClient:
    """Build an OCI GenAI inference client from the default OCI config file.

    Returns:
        A client bound to the endpoint taken from the environment.

    Raises:
        KeyError: If ``OCI_CHAT_ENDPOINT`` is unset or empty and
            ``OCI_GENAI_ENDPOINT`` is not set either.
    """
    config = oci.config.from_file()
    # Gemini models live in us-ashburn-1; use OCI_CHAT_ENDPOINT if set,
    # otherwise fall back to OCI_GENAI_ENDPOINT.
    endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(config, service_endpoint=endpoint)


def enrich(content_type: str, title: str, url: str, text: str) -> dict:
    """Extract structured metadata from content using Gemini Flash.

    Any failure while calling the model or decoding its reply (missing
    environment variables, SDK errors, invalid JSON, a JSON value that is not
    an object) is swallowed on purpose: the function then returns minimal
    defaults and records the error text under ``_error``.

    Args:
        content_type: One of 'youtube', 'url', 'text'.
        title: Initial title hint (may be empty).
        url: Source URL (empty for plain text).
        text: The full content text to analyze. Only the first 6000
            characters are sent to the model.

    Returns:
        Dict that always contains the keys ``title``, ``summary``, ``tags``,
        ``author``, ``date``, ``content_type`` and ``language``. Other keys
        the model returns (e.g. ``summary_ko``) are passed through unchanged
        and are absent on fallback. ``_error`` is present only when the
        fallback path was taken.
    """
    prompt = _PROMPT.format(
        content_type=content_type,
        url=url or "(none)",
        text=text[:6000],
    )
    try:
        client = _get_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=1024,
            temperature=0,
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
        # The model sometimes wraps its answer in a Markdown code fence despite
        # the instructions; drop the fence markers before decoding.
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
        metadata = json.loads(raw)
        # json.loads accepts any JSON value. A list/string/number/null has no
        # ``setdefault``, so reject it here, inside the try, to route it to
        # the same fallback as every other failure instead of crashing below.
        if not isinstance(metadata, dict):
            raise ValueError(
                f"expected a JSON object from the model, got {type(metadata).__name__}"
            )
    except Exception as exc:
        # Deliberate best-effort boundary: enrichment must never break the
        # caller, so degrade to defaults derived from the inputs.
        metadata = {
            "title": title or url or text[:80],
            "summary": text[:300],
            "tags": [],
            "author": None,
            "date": None,
            "content_type": content_type,
            "_error": str(exc),
        }

    # Ensure required keys exist
    metadata.setdefault("title", title or url or text[:80])
    metadata.setdefault("summary", "")
    metadata.setdefault("tags", [])
    metadata.setdefault("author", None)
    metadata.setdefault("date", None)
    metadata.setdefault("content_type", content_type)
    metadata.setdefault("language", "en")
    return metadata