feat: add English vocab extraction and Anki card registration

- core/vocab.py: extract B1-B2 level vocabulary from English content via Gemini Flash
- core/anki.py: register vocab cards to AnkiConnect (English::Vocabulary deck)
- core/enricher.py: add language detection field + summary_ko (Korean summary)
- core/obsidian.py: render Korean + English summary in note
- daemon/worker.py: call vocab extraction and Anki registration for English content
This commit is contained in:
joungmin
2026-02-28 08:39:58 +09:00
parent 86a4104ae3
commit a9db6a8771
5 changed files with 208 additions and 8 deletions

106
core/anki.py Normal file
View File

@@ -0,0 +1,106 @@
"""AnkiConnect HTTP client for adding vocabulary cards."""
import html
import json
import logging
from urllib import request, error
logger = logging.getLogger(__name__)
# AnkiConnect's default local endpoint (port 8765).
ANKI_URL = "http://localhost:8765"
# Target deck; "::" is Anki's deck-hierarchy separator (subdeck "Vocabulary" of "English").
DECK_NAME = "English::Vocabulary"
# Built-in Anki note type with "Front" and "Back" fields.
MODEL_NAME = "Basic"
def _invoke(action: str, **params) -> object:
    """Call AnkiConnect API.

    Args:
        action: AnkiConnect action name.
        **params: Action parameters.

    Returns:
        The result field from AnkiConnect response.

    Raises:
        RuntimeError: If AnkiConnect returns an error.
    """
    body = {"action": action, "version": 6, "params": params}
    req = request.Request(
        ANKI_URL,
        json.dumps(body).encode(),
        {"Content-Type": "application/json"},
    )
    with request.urlopen(req, timeout=5) as resp:
        reply = json.loads(resp.read())
    err = reply.get("error")
    if err:
        raise RuntimeError(f"AnkiConnect error: {err}")
    return reply["result"]
def _ensure_deck() -> None:
    """Create the target deck; AnkiConnect treats this as a no-op if it exists."""
    _invoke("createDeck", deck=DECK_NAME)
def _build_front(word: str, pos: str, example: str) -> str:
return f"<b>{word}</b> <i>({pos})</i><br><br>{example}"
def _build_back(definition_en: str, definition_ko: str) -> str:
return f"{definition_en}<br><br><b>한국어:</b> {definition_ko}"
def add_vocab_cards(vocab_list: list[dict], source_title: str = "") -> list[int]:
    """Add vocabulary cards to Anki.

    Skips duplicates silently (AnkiConnect returns null for existing notes).
    Best-effort: if AnkiConnect is unreachable or the call fails, the function
    logs and returns [] rather than raising, so the main pipeline continues.

    Args:
        vocab_list: List of vocab dicts from extract_vocab().
        source_title: Content title, added as a tag on each card.

    Returns:
        List of created note IDs (excludes skipped duplicates).
    """
    if not vocab_list:
        return []
    try:
        _ensure_deck()
    except error.URLError:
        logger.warning("AnkiConnect unreachable — skipping vocab card creation")
        return []
    # Anki tags cannot contain spaces; also cap length to keep tags readable.
    tag = source_title[:50].replace(" ", "_") if source_title else "knowledge-inbox"
    notes = [
        {
            "deckName": DECK_NAME,
            "modelName": MODEL_NAME,
            "fields": {
                "Front": _build_front(
                    item.get("word", ""),
                    item.get("pos", ""),
                    item.get("example", ""),
                ),
                "Back": _build_back(
                    item.get("definition_en", ""),
                    item.get("definition_ko", ""),
                ),
            },
            # Let AnkiConnect report duplicates (returns null note id) instead
            # of creating repeated cards.
            "options": {"allowDuplicate": False},
            "tags": ["knowledge-inbox", tag],
        }
        for item in vocab_list
        if item.get("word")
    ]
    # All items may have been filtered out (missing "word"); avoid a pointless
    # addNotes round-trip and a misleading "added 0" log line.
    if not notes:
        return []
    try:
        results = _invoke("addNotes", notes=notes)
    except Exception as exc:
        logger.error("Failed to add Anki cards: %s", exc)
        return []
    # AnkiConnect returns None in place of an id for duplicate notes.
    created = [note_id for note_id in results if note_id is not None]
    skipped = len(results) - len(created)
    logger.info(
        "Anki: added %d card(s), skipped %d duplicate(s) for '%s'",
        len(created), skipped, source_title[:40],
    )
    return created

View File

@@ -17,11 +17,13 @@ from oci.generative_ai_inference.models import (
_PROMPT = """\
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
- "title": concise descriptive title for this content (string)
- "summary": 3-5 sentence summary capturing key insights (string)
- "summary": 3-5 sentence summary capturing key insights, written in English (string)
- "summary_ko": the same summary translated into Korean (string)
- "tags": list of 3-7 relevant keywords or topics (string[])
- "author": author or creator name, or null if not found (string | null)
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
- "language": primary language of the content, ISO 639-1 code, e.g. "en", "ko", "ja" (string)
Content type: {content_type}
Source URL: {url}
@@ -33,10 +35,10 @@ Return only the JSON object, no markdown, no explanation."""
def _get_client() -> GenerativeAiInferenceClient:
config = oci.config.from_file()
return GenerativeAiInferenceClient(
config,
service_endpoint=os.environ["OCI_GENAI_ENDPOINT"],
)
# Gemini models live in us-ashburn-1; use OCI_CHAT_ENDPOINT if set,
# otherwise fall back to OCI_GENAI_ENDPOINT.
endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
return GenerativeAiInferenceClient(config, service_endpoint=endpoint)
def enrich(content_type: str, title: str, url: str, text: str) -> dict:
@@ -92,5 +94,6 @@ def enrich(content_type: str, title: str, url: str, text: str) -> dict:
metadata.setdefault("author", None)
metadata.setdefault("date", None)
metadata.setdefault("content_type", content_type)
metadata.setdefault("language", "en")
return metadata

View File

@@ -22,18 +22,20 @@ def save_note(
source_url: str = "",
author: str = "",
date: str = "",
summary_ko: str = "",
) -> Path:
"""Save a processed knowledge item as an Obsidian markdown file.
Args:
content_type: One of 'youtube', 'url', 'text'.
title: The note title.
summary: LLM-generated summary.
summary: LLM-generated summary in English.
body: Full content text.
tags: List of topic tags.
source_url: Original URL (empty for plain text).
author: Author name (may be empty).
date: Publication date in ISO 8601 format (may be empty).
summary_ko: Korean translation of the summary (may be empty).
Returns:
Path of the created markdown file.
@@ -59,6 +61,8 @@ def save_note(
# Build YAML frontmatter tags
tags_yaml = ", ".join(tags) if tags else ""
summary_section = f"## 요약\n{summary_ko}\n\n*(English)*\n{summary}" if summary_ko else f"## 요약\n{summary}"
content = f"""---
title: {title}
source_type: {content_type}
@@ -71,8 +75,7 @@ created: {today}
# {title}
## 요약
{summary}
{summary_section}
## 원문
{body}

79
core/vocab.py Normal file
View File

@@ -0,0 +1,79 @@
"""Extract vocabulary from English content using Gemini Flash."""
import json
import os
import re
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
ChatDetails,
GenericChatRequest,
OnDemandServingMode,
TextContent,
UserMessage,
)
# Prompt for Gemini Flash: requests a strict JSON array of B1-B2 CEFR vocab
# items. {title} and {text} placeholders are filled by extract_vocab();
# keep them intact when editing the wording.
_PROMPT = """\
You are an English vocabulary instructor. Analyze the English text below and extract words or phrases that an intermediate English learner (B1-B2 CEFR level) might not know.
Rules:
- Extract 5 to 10 items maximum
- Skip very basic words (go, see, love, etc.) and overly academic/rare words
- Focus on useful vocabulary: idioms, collocations, phrasal verbs, and mid-level words
- Each item must appear in the source text
Return ONLY a valid JSON array. Each element must have:
- "word": the word or phrase as it appears (string)
- "pos": part of speech, e.g. "verb", "noun", "adjective", "phrase", "phrasal verb" (string)
- "definition_en": concise English definition (string)
- "definition_ko": Korean translation of the definition (string)
- "example": the sentence from the source text that contains this word/phrase (string)
Source title: {title}
Text:
{text}
Return only the JSON array, no markdown, no explanation."""
def _get_client() -> GenerativeAiInferenceClient:
    """Build an OCI GenAI inference client from the default config file.

    Prefers OCI_CHAT_ENDPOINT when set and non-empty; otherwise requires
    OCI_GENAI_ENDPOINT.
    """
    cfg = oci.config.from_file()
    chat_endpoint = os.environ.get("OCI_CHAT_ENDPOINT")
    if not chat_endpoint:
        chat_endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(cfg, service_endpoint=chat_endpoint)
def extract_vocab(text: str, title: str = "") -> list[dict]:
    """Extract intermediate-level English vocabulary from text using Gemini Flash.

    Args:
        text: English source text to analyze (truncated to 5000 chars for the prompt).
        title: Content title for context.

    Returns:
        List of vocab dicts with keys: word, pos, definition_en, definition_ko, example.
        Returns empty list on failure or if no suitable vocab found.
    """
    prompt = _PROMPT.format(title=title, text=text[:5000])
    try:
        client = _get_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=2048,
            # Deterministic output keeps card content stable across runs.
            temperature=0,
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
        # Strip markdown code fences the model sometimes adds despite instructions.
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
        items = json.loads(raw)
        if not isinstance(items, list):
            return []
        # Guard against malformed elements (e.g. bare strings or numbers) so
        # downstream item.get(...) calls never blow up.
        return [item for item in items if isinstance(item, dict)]
    except Exception:
        # Vocab extraction is best-effort: any failure (network, auth,
        # malformed JSON) must not break the main processing pipeline.
        return []