Files
knowledge-inbox/core/enricher.py
joungmin a9db6a8771 feat: add English vocab extraction and Anki card registration
- core/vocab.py: extract B1-B2 level vocabulary from English content via Gemini Flash
- core/anki.py: register vocab cards to AnkiConnect (English::Vocabulary deck)
- core/enricher.py: add language detection field + summary_ko (Korean summary)
- core/obsidian.py: render Korean + English summary in note
- daemon/worker.py: call vocab extraction and Anki registration for English content
2026-02-28 08:39:58 +09:00

100 lines
3.5 KiB
Python

"""LLM-based content enrichment via OCI GenAI Gemini Flash."""
import json
import os
import re
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
ChatDetails,
GenericChatRequest,
OnDemandServingMode,
TextContent,
UserMessage,
)
_PROMPT = """\
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
- "title": concise descriptive title for this content (string)
- "summary": 3-5 sentence summary capturing key insights, written in English (string)
- "summary_ko": the same summary translated into Korean (string)
- "tags": list of 3-7 relevant keywords or topics (string[])
- "author": author or creator name, or null if not found (string | null)
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
- "language": primary language of the content, ISO 639-1 code, e.g. "en", "ko", "ja" (string)
Content type: {content_type}
Source URL: {url}
Content:
{text}
Return only the JSON object, no markdown, no explanation."""


def _get_client() -> "GenerativeAiInferenceClient":
    """Build an OCI GenAI inference client from the default OCI config file."""
    config = oci.config.from_file()
    # Gemini models live in us-ashburn-1; use OCI_CHAT_ENDPOINT if set,
    # otherwise fall back to OCI_GENAI_ENDPOINT.
    endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(config, service_endpoint=endpoint)


def enrich(content_type: str, title: str, url: str, text: str) -> dict:
    """Extract structured metadata from content using Gemini Flash.

    Args:
        content_type: One of 'youtube', 'url', 'text'.
        title: Initial title hint (may be empty).
        url: Source URL (empty for plain text).
        text: The full content text to analyze.

    Returns:
        Dict with keys: title, summary, summary_ko, tags, author, date,
        content_type, language. On any LLM/parse failure, falls back to
        minimal defaults and adds an "_error" key with the failure text;
        this function never raises.
    """
    prompt = _PROMPT.format(
        content_type=content_type,
        url=url or "(none)",
        # Truncate to keep the prompt within the model's context window.
        text=text[:6000],
    )
    try:
        client = _get_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=1024,
            temperature=0,  # deterministic extraction
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
        # Strip markdown code fences the model sometimes emits despite instructions.
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
        parsed = json.loads(raw)
        # The model may return a JSON array or scalar; only an object is usable.
        # Raising here routes such responses through the fallback below instead
        # of letting .setdefault() blow up with AttributeError at the caller.
        if not isinstance(parsed, dict):
            raise ValueError(f"expected JSON object, got {type(parsed).__name__}")
        metadata = parsed
    except Exception as exc:  # config, network, auth, or parse failure — degrade gracefully
        metadata = {
            "title": title or url or text[:80],
            "summary": text[:300],
            "summary_ko": "",
            "tags": [],
            "author": None,
            "date": None,
            "content_type": content_type,
            "language": "en",
            "_error": str(exc),
        }
    # Ensure required keys exist even when the model omitted some fields.
    metadata.setdefault("title", title or url or text[:80])
    metadata.setdefault("summary", "")
    metadata.setdefault("summary_ko", "")
    metadata.setdefault("tags", [])
    metadata.setdefault("author", None)
    metadata.setdefault("date", None)
    metadata.setdefault("content_type", content_type)
    metadata.setdefault("language", "en")
    return metadata