"""Extract vocabulary from English content using Gemini Flash.""" import json import os import re import oci from oci.generative_ai_inference import GenerativeAiInferenceClient from oci.generative_ai_inference.models import ( ChatDetails, GenericChatRequest, OnDemandServingMode, TextContent, UserMessage, ) _PROMPT = """\ You are an English vocabulary instructor. Analyze the English text below and extract words or phrases that an intermediate English learner (B1-B2 CEFR level) might not know. Rules: - Extract 5 to 10 items maximum - Skip very basic words (go, see, love, etc.) and overly academic/rare words - Focus on useful vocabulary: idioms, collocations, phrasal verbs, and mid-level words - Each item must appear in the source text Return ONLY a valid JSON array. Each element must have: - "word": the word or phrase as it appears (string) - "pos": part of speech, e.g. "verb", "noun", "adjective", "phrase", "phrasal verb" (string) - "definition_en": concise English definition (string) - "definition_ko": Korean translation of the definition (string) - "example": the sentence from the source text that contains this word/phrase (string) Source title: {title} Text: {text} Return only the JSON array, no markdown, no explanation.""" def _get_client() -> GenerativeAiInferenceClient: config = oci.config.from_file() endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"] return GenerativeAiInferenceClient(config, service_endpoint=endpoint) def extract_vocab(text: str, title: str = "") -> list[dict]: """Extract intermediate-level English vocabulary from text using Gemini Flash. Args: text: English source text to analyze. title: Content title for context. Returns: List of vocab dicts with keys: word, pos, definition_en, definition_ko, example. Returns empty list on failure or if no suitable vocab found. """ prompt = _PROMPT.format(title=title, text=text[:5000]) try: client = _get_client() req = GenericChatRequest( messages=[UserMessage(content=[TextContent(text=prompt)])], max_tokens=2048, temperature=0, ) det = ChatDetails( compartment_id=os.environ["OCI_COMPARTMENT_ID"], serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]), chat_request=req, ) response = client.chat(det) raw = response.data.chat_response.choices[0].message.content[0].text.strip() raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE) items = json.loads(raw) if not isinstance(items, list): return [] return items except Exception: return []