feat: add English vocab extraction and Anki card registration
- core/vocab.py: extract B1-B2 level vocabulary from English content via Gemini Flash
- core/anki.py: register vocab cards to AnkiConnect (English::Vocabulary deck)
- core/enricher.py: add language detection field + summary_ko (Korean summary)
- core/obsidian.py: render Korean + English summary in the note
- daemon/worker.py: call vocab extraction and Anki registration for English content
This commit is contained in:
106
core/anki.py
Normal file
106
core/anki.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""AnkiConnect HTTP client for adding vocabulary cards."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from urllib import request, error
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default AnkiConnect local HTTP endpoint.
ANKI_URL = "http://localhost:8765"
# "::" nests the deck (Vocabulary under English) in Anki's deck tree.
DECK_NAME = "English::Vocabulary"
# Note type used for the cards; add_vocab_cards() fills its Front/Back fields.
MODEL_NAME = "Basic"
|
||||
|
||||
|
||||
def _invoke(action: str, **params) -> object:
    """Send one request to the AnkiConnect HTTP API.

    Args:
        action: AnkiConnect action name.
        **params: Action parameters.

    Returns:
        The ``result`` field of the AnkiConnect response.

    Raises:
        RuntimeError: If AnkiConnect returns an error.
    """
    body = {"action": action, "version": 6, "params": params}
    req = request.Request(
        ANKI_URL,
        json.dumps(body).encode(),
        {"Content-Type": "application/json"},
    )
    with request.urlopen(req, timeout=5) as resp:
        reply = json.loads(resp.read())
    if reply.get("error"):
        raise RuntimeError(f"AnkiConnect error: {reply['error']}")
    return reply["result"]
|
||||
|
||||
|
||||
def _ensure_deck() -> None:
    """Create the target deck if it doesn't exist.

    createDeck is safe to call repeatedly; an existing deck is left as-is.
    """
    _invoke("createDeck", deck=DECK_NAME)
|
||||
|
||||
|
||||
def _build_front(word: str, pos: str, example: str) -> str:
|
||||
return f"<b>{word}</b> <i>({pos})</i><br><br>{example}"
|
||||
|
||||
|
||||
def _build_back(definition_en: str, definition_ko: str) -> str:
|
||||
return f"{definition_en}<br><br><b>한국어:</b> {definition_ko}"
|
||||
|
||||
|
||||
def add_vocab_cards(vocab_list: list[dict], source_title: str = "") -> list[int]:
    """Add vocabulary cards to Anki via AnkiConnect.

    Skips duplicates silently (AnkiConnect returns null for existing notes)
    and degrades gracefully when AnkiConnect is not running.

    Args:
        vocab_list: List of vocab dicts from extract_vocab().
        source_title: Content title, added as a tag on each card.

    Returns:
        List of created note IDs (excludes skipped duplicates).
    """
    if not vocab_list:
        return []

    # Derive a stable tag from the title; Anki tags cannot contain spaces.
    tag = source_title[:50].replace(" ", "_") if source_title else "knowledge-inbox"

    notes = [
        {
            "deckName": DECK_NAME,
            "modelName": MODEL_NAME,
            "fields": {
                "Front": _build_front(
                    item.get("word", ""),
                    item.get("pos", ""),
                    item.get("example", ""),
                ),
                "Back": _build_back(
                    item.get("definition_en", ""),
                    item.get("definition_ko", ""),
                ),
            },
            "options": {"allowDuplicate": False},
            "tags": ["knowledge-inbox", tag],
        }
        for item in vocab_list
        if item.get("word")
    ]

    # Every item lacked a "word": nothing to add, so skip the deck-creation
    # and addNotes round-trips entirely.
    if not notes:
        return []

    try:
        _ensure_deck()
    except error.URLError:
        logger.warning("AnkiConnect unreachable — skipping vocab card creation")
        return []

    try:
        results = _invoke("addNotes", notes=notes)
    except Exception as exc:
        logger.error("Failed to add Anki cards: %s", exc)
        return []

    # addNotes returns null in place of each note rejected as a duplicate.
    created = [note_id for note_id in results if note_id is not None]
    skipped = len(results) - len(created)
    logger.info(
        "Anki: added %d card(s), skipped %d duplicate(s) for '%s'",
        len(created), skipped, source_title[:40],
    )
    return created
|
||||
@@ -17,11 +17,13 @@ from oci.generative_ai_inference.models import (
|
||||
_PROMPT = """\
|
||||
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
|
||||
- "title": concise descriptive title for this content (string)
|
||||
- "summary": 3-5 sentence summary capturing key insights (string)
|
||||
- "summary": 3-5 sentence summary capturing key insights, written in English (string)
|
||||
- "summary_ko": the same summary translated into Korean (string)
|
||||
- "tags": list of 3-7 relevant keywords or topics (string[])
|
||||
- "author": author or creator name, or null if not found (string | null)
|
||||
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
|
||||
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
|
||||
- "language": primary language of the content, ISO 639-1 code, e.g. "en", "ko", "ja" (string)
|
||||
|
||||
Content type: {content_type}
|
||||
Source URL: {url}
|
||||
@@ -33,10 +35,10 @@ Return only the JSON object, no markdown, no explanation."""
|
||||
|
||||
def _get_client() -> GenerativeAiInferenceClient:
|
||||
config = oci.config.from_file()
|
||||
return GenerativeAiInferenceClient(
|
||||
config,
|
||||
service_endpoint=os.environ["OCI_GENAI_ENDPOINT"],
|
||||
)
|
||||
# Gemini models live in us-ashburn-1; use OCI_CHAT_ENDPOINT if set,
|
||||
# otherwise fall back to OCI_GENAI_ENDPOINT.
|
||||
endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
|
||||
return GenerativeAiInferenceClient(config, service_endpoint=endpoint)
|
||||
|
||||
|
||||
def enrich(content_type: str, title: str, url: str, text: str) -> dict:
|
||||
@@ -92,5 +94,6 @@ def enrich(content_type: str, title: str, url: str, text: str) -> dict:
|
||||
metadata.setdefault("author", None)
|
||||
metadata.setdefault("date", None)
|
||||
metadata.setdefault("content_type", content_type)
|
||||
metadata.setdefault("language", "en")
|
||||
|
||||
return metadata
|
||||
|
||||
@@ -22,18 +22,20 @@ def save_note(
|
||||
source_url: str = "",
|
||||
author: str = "",
|
||||
date: str = "",
|
||||
summary_ko: str = "",
|
||||
) -> Path:
|
||||
"""Save a processed knowledge item as an Obsidian markdown file.
|
||||
|
||||
Args:
|
||||
content_type: One of 'youtube', 'url', 'text'.
|
||||
title: The note title.
|
||||
summary: LLM-generated summary.
|
||||
summary: LLM-generated summary in English.
|
||||
body: Full content text.
|
||||
tags: List of topic tags.
|
||||
source_url: Original URL (empty for plain text).
|
||||
author: Author name (may be empty).
|
||||
date: Publication date in ISO 8601 format (may be empty).
|
||||
summary_ko: Korean translation of the summary (may be empty).
|
||||
|
||||
Returns:
|
||||
Path of the created markdown file.
|
||||
@@ -59,6 +61,8 @@ def save_note(
|
||||
# Build YAML frontmatter tags
|
||||
tags_yaml = ", ".join(tags) if tags else ""
|
||||
|
||||
summary_section = f"## 요약\n{summary_ko}\n\n*(English)*\n{summary}" if summary_ko else f"## 요약\n{summary}"
|
||||
|
||||
content = f"""---
|
||||
title: {title}
|
||||
source_type: {content_type}
|
||||
@@ -71,8 +75,7 @@ created: {today}
|
||||
|
||||
# {title}
|
||||
|
||||
## 요약
|
||||
{summary}
|
||||
{summary_section}
|
||||
|
||||
## 원문
|
||||
{body}
|
||||
|
||||
79
core/vocab.py
Normal file
79
core/vocab.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""Extract vocabulary from English content using Gemini Flash."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import oci
|
||||
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
||||
from oci.generative_ai_inference.models import (
|
||||
ChatDetails,
|
||||
GenericChatRequest,
|
||||
OnDemandServingMode,
|
||||
TextContent,
|
||||
UserMessage,
|
||||
)
|
||||
|
||||
_PROMPT = """\
|
||||
You are an English vocabulary instructor. Analyze the English text below and extract words or phrases that an intermediate English learner (B1-B2 CEFR level) might not know.
|
||||
|
||||
Rules:
|
||||
- Extract 5 to 10 items maximum
|
||||
- Skip very basic words (go, see, love, etc.) and overly academic/rare words
|
||||
- Focus on useful vocabulary: idioms, collocations, phrasal verbs, and mid-level words
|
||||
- Each item must appear in the source text
|
||||
|
||||
Return ONLY a valid JSON array. Each element must have:
|
||||
- "word": the word or phrase as it appears (string)
|
||||
- "pos": part of speech, e.g. "verb", "noun", "adjective", "phrase", "phrasal verb" (string)
|
||||
- "definition_en": concise English definition (string)
|
||||
- "definition_ko": Korean translation of the definition (string)
|
||||
- "example": the sentence from the source text that contains this word/phrase (string)
|
||||
|
||||
Source title: {title}
|
||||
Text:
|
||||
{text}
|
||||
|
||||
Return only the JSON array, no markdown, no explanation."""
|
||||
|
||||
|
||||
def _get_client() -> GenerativeAiInferenceClient:
    """Build an OCI GenAI inference client from the default OCI config file."""
    cfg = oci.config.from_file()
    # Prefer the chat-specific endpoint when set; otherwise fall back to
    # the general GenAI endpoint.
    url = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(cfg, service_endpoint=url)
|
||||
|
||||
|
||||
def extract_vocab(text: str, title: str = "") -> list[dict]:
|
||||
"""Extract intermediate-level English vocabulary from text using Gemini Flash.
|
||||
|
||||
Args:
|
||||
text: English source text to analyze.
|
||||
title: Content title for context.
|
||||
|
||||
Returns:
|
||||
List of vocab dicts with keys: word, pos, definition_en, definition_ko, example.
|
||||
Returns empty list on failure or if no suitable vocab found.
|
||||
"""
|
||||
prompt = _PROMPT.format(title=title, text=text[:5000])
|
||||
|
||||
try:
|
||||
client = _get_client()
|
||||
req = GenericChatRequest(
|
||||
messages=[UserMessage(content=[TextContent(text=prompt)])],
|
||||
max_tokens=2048,
|
||||
temperature=0,
|
||||
)
|
||||
det = ChatDetails(
|
||||
compartment_id=os.environ["OCI_COMPARTMENT_ID"],
|
||||
serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
|
||||
chat_request=req,
|
||||
)
|
||||
response = client.chat(det)
|
||||
raw = response.data.chat_response.choices[0].message.content[0].text.strip()
|
||||
raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
|
||||
items = json.loads(raw)
|
||||
if not isinstance(items, list):
|
||||
return []
|
||||
return items
|
||||
except Exception:
|
||||
return []
|
||||
@@ -4,11 +4,13 @@ import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
from core.anki import add_vocab_cards
|
||||
from core.chunker import chunk_text
|
||||
from core.enricher import enrich
|
||||
from core.obsidian import save_note
|
||||
from core.queue_db import fetch_pending, set_done, set_error, set_processing
|
||||
from core.vector import save_to_vector
|
||||
from core.vocab import extract_vocab
|
||||
from core.web import fetch_page_text
|
||||
from core.youtube import get_transcript
|
||||
|
||||
@@ -62,6 +64,7 @@ def process_item(item: dict) -> None:
|
||||
source_url=url,
|
||||
author=meta.get("author") or "",
|
||||
date=meta.get("date") or "",
|
||||
summary_ko=meta.get("summary_ko", ""),
|
||||
)
|
||||
logger.info("Obsidian note saved: %s", note_path)
|
||||
|
||||
@@ -70,6 +73,12 @@ def process_item(item: dict) -> None:
|
||||
inserted = save_to_vector(doc_id, chunks)
|
||||
logger.info("Vector store: inserted %d chunks for doc_id=%s", len(inserted), doc_id)
|
||||
|
||||
# Add Anki vocabulary cards for English content
|
||||
if meta.get("language", "").startswith("en"):
|
||||
vocab = extract_vocab(text, title)
|
||||
if vocab:
|
||||
add_vocab_cards(vocab, source_title=title)
|
||||
|
||||
set_done(row_id, title, meta)
|
||||
logger.info("Done: %s → %s", row_id[:8], title[:60])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user