feat: add English vocab extraction and Anki card registration

- core/vocab.py: extract B1-B2 level vocabulary from English content via Gemini Flash
- core/anki.py: register vocab cards to AnkiConnect (English::Vocabulary deck)
- core/enricher.py: add language detection field + summary_ko (Korean summary)
- core/obsidian.py: render Korean + English summary in note
- daemon/worker.py: call vocab extraction and Anki registration for English content
2026-02-28 08:39:58 +09:00
parent 86a4104ae3
commit a9db6a8771
5 changed files with 208 additions and 8 deletions
--- a/core/vocab.py
+++ b/core/vocab.py
@@ -0,0 +1,79 @@
+"""Extract vocabulary from English content using Gemini Flash."""
+
+import json
+import logging
+import os
+import re
+
+import oci
+from oci.generative_ai_inference import GenerativeAiInferenceClient
+from oci.generative_ai_inference.models import (
+    ChatDetails,
+    GenericChatRequest,
+    OnDemandServingMode,
+    TextContent,
+    UserMessage,
+)
+
+# Prompt template for the vocabulary-extraction request. It is filled in with
+# str.format(), which substitutes the {title} and {text} placeholders; any
+# literal brace added to this template later must therefore be doubled
+# ("{{" / "}}"). The prompt asks the model for a bare JSON array so the reply
+# can be handed to json.loads().
+_PROMPT = """\
+You are an English vocabulary instructor. Analyze the English text below and extract words or phrases that an intermediate English learner (B1-B2 CEFR level) might not know.
+
+Rules:
+- Extract 5 to 10 items maximum
+- Skip very basic words (go, see, love, etc.) and overly academic/rare words
+- Focus on useful vocabulary: idioms, collocations, phrasal verbs, and mid-level words
+- Each item must appear in the source text
+
+Return ONLY a valid JSON array. Each element must have:
+- "word": the word or phrase as it appears (string)
+- "pos": part of speech, e.g. "verb", "noun", "adjective", "phrase", "phrasal verb" (string)
+- "definition_en": concise English definition (string)
+- "definition_ko": Korean translation of the definition (string)
+- "example": the sentence from the source text that contains this word/phrase (string)
+
+Source title: {title}
+Text:
+{text}
+
+Return only the JSON array, no markdown, no explanation."""
+
+
+def _get_client() -> GenerativeAiInferenceClient:
+    config = oci.config.from_file()
+    endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
+    return GenerativeAiInferenceClient(config, service_endpoint=endpoint)
+
+
+def extract_vocab(text: str, title: str = "") -> list[dict]:
+    """Extract intermediate-level English vocabulary from text using Gemini Flash.
+
+    Args:
+        text: English source text to analyze.
+        title: Content title for context.
+
+    Returns:
+        List of vocab dicts with keys: word, pos, definition_en, definition_ko, example.
+        Returns empty list on failure or if no suitable vocab found.
+    """
+    prompt = _PROMPT.format(title=title, text=text[:5000])
+
+    try:
+        client = _get_client()
+        req = GenericChatRequest(
+            messages=[UserMessage(content=[TextContent(text=prompt)])],
+            max_tokens=2048,
+            temperature=0,
+        )
+        det = ChatDetails(
+            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
+            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
+            chat_request=req,
+        )
+        response = client.chat(det)
+        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
+        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
+        items = json.loads(raw)
+        if not isinstance(items, list):
+            return []
+        return items
+    except Exception:
+        return []