feat: initial knowledge-inbox pipeline implementation

- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
This commit is contained in:
joungmin
2026-02-28 08:16:11 +09:00
commit 86a4104ae3
18 changed files with 926 additions and 0 deletions

0
core/__init__.py Normal file
View File

28
core/chunker.py Normal file
View File

@@ -0,0 +1,28 @@
"""Simple sliding-window text chunking."""
def chunk_text(text: str, size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into overlapping sliding-window chunks.

    Args:
        text: The full text to split.
        size: Maximum characters per chunk. Must be positive.
        overlap: Characters of overlap between consecutive chunks.
            Must satisfy ``0 <= overlap < size``.

    Returns:
        List of text chunks. Returns a single-item list for short text.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is not in
            ``[0, size)`` — such values would make the window step
            non-positive and loop forever.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(f"overlap must be in [0, {size}), got {overlap}")
    if len(text) <= size:
        return [text]
    chunks: list[str] = []
    step = size - overlap
    start = 0
    while start < len(text):
        end = start + size
        chunks.append(text[start:end])
        if end >= len(text):
            break
        start += step
    return chunks

96
core/enricher.py Normal file
View File

@@ -0,0 +1,96 @@
"""LLM-based content enrichment via OCI GenAI Gemini Flash."""
import json
import os
import re
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
ChatDetails,
GenericChatRequest,
OnDemandServingMode,
TextContent,
UserMessage,
)
# Prompt template sent to the chat model; the {content_type}, {url} and
# {text} placeholders are filled via str.format() inside enrich().
_PROMPT = """\
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
- "title": concise descriptive title for this content (string)
- "summary": 3-5 sentence summary capturing key insights (string)
- "tags": list of 3-7 relevant keywords or topics (string[])
- "author": author or creator name, or null if not found (string | null)
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
Content type: {content_type}
Source URL: {url}
Content:
{text}
Return only the JSON object, no markdown, no explanation."""
def _get_client() -> GenerativeAiInferenceClient:
    """Build an OCI GenAI inference client from the default OCI config file.

    The service endpoint comes from the OCI_GENAI_ENDPOINT environment
    variable; a missing variable raises KeyError.
    """
    oci_config = oci.config.from_file()
    endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(oci_config, service_endpoint=endpoint)
def enrich(content_type: str, title: str, url: str, text: str) -> dict:
    """Extract structured metadata from content using Gemini Flash.

    Args:
        content_type: One of 'youtube', 'url', 'text'.
        title: Initial title hint (may be empty).
        url: Source URL (empty for plain text).
        text: The full content text to analyze.

    Returns:
        Dict with keys: title, summary, tags, author, date, content_type.
        Falls back to minimal defaults on any LLM/parse failure; the fallback
        dict carries an extra "_error" key describing what went wrong.
    """
    prompt = _PROMPT.format(
        content_type=content_type,
        url=url or "(none)",
        text=text[:6000],  # cap prompt size to stay within a safe token budget
    )
    try:
        client = _get_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=1024,
            temperature=0,  # deterministic extraction
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
        # Strip a ```json ... ``` fence in case the model ignored the
        # "no markdown" instruction.
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
        metadata = json.loads(raw)
        # json.loads can legally yield a list/str/number; only an object is
        # usable here. Raising inside the try routes such replies to the
        # fallback instead of crashing on metadata.setdefault below.
        if not isinstance(metadata, dict):
            raise ValueError(
                f"expected JSON object, got {type(metadata).__name__}"
            )
    except Exception as exc:
        metadata = {
            "title": title or url or text[:80],
            "summary": text[:300],
            "tags": [],
            "author": None,
            "date": None,
            "content_type": content_type,
            "_error": str(exc),
        }
    # Ensure required keys exist even if the model omitted some fields.
    metadata.setdefault("title", title or url or text[:80])
    metadata.setdefault("summary", "")
    metadata.setdefault("tags", [])
    metadata.setdefault("author", None)
    metadata.setdefault("date", None)
    metadata.setdefault("content_type", content_type)
    return metadata

86
core/obsidian.py Normal file
View File

@@ -0,0 +1,86 @@
"""Save processed knowledge items as Obsidian markdown notes."""
import os
import re
from datetime import datetime
from pathlib import Path
def _slugify(text: str, max_len: int = 50) -> str:
"""Convert text to a filesystem-safe slug."""
text = re.sub(r"[^\w\s-]", "", text, flags=re.UNICODE)
text = re.sub(r"[\s_]+", "-", text).strip("-")
return text[:max_len].lower()
def save_note(
    content_type: str,
    title: str,
    summary: str,
    body: str,
    tags: list[str],
    source_url: str = "",
    author: str = "",
    date: str = "",
) -> Path:
    """Save a processed knowledge item as an Obsidian markdown file.

    Args:
        content_type: One of 'youtube', 'url', 'text'.
        title: The note title.
        summary: LLM-generated summary.
        body: Full content text.
        tags: List of topic tags.
        source_url: Original URL (empty for plain text).
        author: Author name (may be empty).
        date: Publication date in ISO 8601 format (may be empty).

    Returns:
        Path of the created markdown file. When a note with the same date
        and slug already exists, a numeric suffix (-2, -3, ...) is appended
        instead of silently overwriting the existing note.
    """
    vault = os.environ.get("OBSIDIAN_VAULT", "/Users/joungmin/Documents/Obsidian Vault")
    # Single timestamp so the date in the filename and the "Saved" footer
    # can never disagree (e.g. across a midnight boundary).
    now = datetime.now()
    today = now.strftime("%Y-%m-%d")
    now_str = now.strftime("%Y-%m-%d %H:%M:%S")
    slug = _slugify(title) or "untitled"
    # Route each content type into its own vault subfolder.
    subfolder_map = {
        "youtube": "20 Sources/YouTube",
        "url": "20 Sources/Web",
        "text": "20 Sources/Notes",
    }
    subfolder = subfolder_map.get(content_type, "20 Sources/Notes")
    note_dir = Path(vault) / subfolder
    note_dir.mkdir(parents=True, exist_ok=True)
    # Never clobber an existing note: probe -2, -3, ... until a free name.
    note_path = note_dir / f"{today}-{slug}.md"
    counter = 2
    while note_path.exists():
        note_path = note_dir / f"{today}-{slug}-{counter}.md"
        counter += 1
    # Build YAML frontmatter tags
    tags_yaml = ", ".join(tags) if tags else ""
    content = f"""---
title: {title}
source_type: {content_type}
url: {source_url}
author: {author}
date: {date}
tags: [{tags_yaml}]
created: {today}
---
# {title}
## 요약
{summary}
## 원문
{body}
---
*Source: {source_url}*
*Saved: {now_str}*
"""
    note_path.write_text(content, encoding="utf-8")
    return note_path

185
core/queue_db.py Normal file
View File

@@ -0,0 +1,185 @@
"""Oracle ADB connection pool and CRUD operations for knowledge_queue."""
import json
import os
from contextlib import contextmanager
from typing import Generator
import oracledb
# Lazily-created module-level pool shared by all functions in this module.
_pool: oracledb.ConnectionPool | None = None


def _get_pool() -> oracledb.ConnectionPool:
    """Return the module-level connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    pool_args: dict = {
        "user": os.environ["ORACLE_USER"],
        "password": os.environ["ORACLE_PASSWORD"],
        "dsn": os.environ["ORACLE_DSN"],
        "min": 1,
        "max": 5,
        "increment": 1,
    }
    wallet_dir = os.environ.get("ORACLE_WALLET")
    if wallet_dir:
        # tnsnames.ora + cwallet.sso live in the wallet directory.
        pool_args["config_dir"] = wallet_dir
    _pool = oracledb.create_pool(**pool_args)
    return _pool
@contextmanager
def _conn() -> Generator[oracledb.Connection, None, None]:
    """Yield a pooled connection; commit on success, roll back on error.

    The connection is always returned to the pool, even when the body or
    the commit itself raises.
    """
    pool = _get_pool()
    connection = pool.acquire()
    try:
        yield connection
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        pool.release(connection)
def insert_item(input_type: str, content: str, chat_id: str = "") -> str:
    """Insert a new queue item and return its generated UUID.

    Args:
        input_type: One of 'youtube', 'url', 'text'.
        content: The URL or raw text to process.
        chat_id: Telegram chat ID for future notification support.

    Returns:
        The UUID of the newly inserted row (via DML RETURNING).
    """
    sql = """
        INSERT INTO knowledge_queue (input_type, content, telegram_chat_id)
        VALUES (:input_type, :content, :chat_id)
        RETURNING id INTO :out_id
    """
    with _conn() as connection:
        cur = connection.cursor()
        new_id = cur.var(oracledb.STRING)
        binds = {
            "input_type": input_type,
            "content": content,
            "chat_id": chat_id,
            "out_id": new_id,
        }
        cur.execute(sql, binds)
        # RETURNING binds yield a list of values; single-row insert -> [0].
        return new_id.getvalue()[0]
def fetch_pending(limit: int = 5) -> list[dict]:
    """Fetch the oldest pending items, up to ``limit`` rows.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        List of dicts with keys: id, input_type, content, telegram_chat_id.
    """
    sql = """
        SELECT id, input_type, content, telegram_chat_id
        FROM knowledge_queue
        WHERE status = 'pending'
        ORDER BY created_at
        FETCH FIRST :n ROWS ONLY
    """
    items: list[dict] = []
    with _conn() as connection:
        cur = connection.cursor()
        cur.execute(sql, {"n": limit})
        for row_id, input_type, content, chat_id in cur.fetchall():
            # Long content comes back as a LOB; materialize it to str.
            if hasattr(content, "read"):
                content = content.read()
            items.append(
                {
                    "id": row_id,
                    "input_type": input_type,
                    "content": content,
                    "telegram_chat_id": chat_id,
                }
            )
    return items
def set_processing(row_id: str) -> None:
    """Mark a queue item as processing.

    Args:
        row_id: The UUID of the row to update.
    """
    sql = """
        UPDATE knowledge_queue
        SET status = 'processing', updated_at = SYSTIMESTAMP
        WHERE id = :id
    """
    with _conn() as connection:
        cur = connection.cursor()
        cur.execute(sql, {"id": row_id})
def set_done(row_id: str, title: str, metadata: dict) -> None:
    """Mark a queue item as done and store its extracted metadata.

    Args:
        row_id: The UUID of the row to update.
        title: LLM-extracted title; truncated to 500 chars to fit the column.
        metadata: Dict of enrichment results, stored as a JSON string.
    """
    sql = """
        UPDATE knowledge_queue
        SET status = 'done',
            title = :title,
            metadata_json = :meta_json,
            updated_at = SYSTIMESTAMP
        WHERE id = :id
    """
    safe_title = title[:500] if title else ""
    binds = {
        "id": row_id,
        "title": safe_title,
        "meta_json": json.dumps(metadata, ensure_ascii=False),
    }
    with _conn() as connection:
        connection.cursor().execute(sql, binds)
def set_error(row_id: str, error_msg: str) -> None:
    """Mark a queue item as error with a message.

    Args:
        row_id: The UUID of the row to update.
        error_msg: Description of the error.
    """
    sql = """
        UPDATE knowledge_queue
        SET status = 'error', error_msg = :error_msg, updated_at = SYSTIMESTAMP
        WHERE id = :id
    """
    binds = {"id": row_id, "error_msg": error_msg}
    with _conn() as connection:
        connection.cursor().execute(sql, binds)
def get_status_counts() -> dict:
    """Return the number of queue rows per status.

    Returns:
        Dict like {'pending': 3, 'processing': 1, 'done': 42, 'error': 0}.
        The four known statuses are always present, defaulting to 0; any
        unknown status value in the table is included as an extra key.
    """
    sql = """
        SELECT status, COUNT(*) FROM knowledge_queue GROUP BY status
    """
    counts = {name: 0 for name in ("pending", "processing", "done", "error")}
    with _conn() as connection:
        cur = connection.cursor()
        cur.execute(sql)
        for status, n in cur.fetchall():
            counts[status] = n
    return counts

114
core/vector.py Normal file
View File

@@ -0,0 +1,114 @@
"""Embedding generation and Oracle vector store insertion."""
import array
import os
from contextlib import contextmanager
from typing import Generator
import oci
import oracledb
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
EmbedTextDetails,
OnDemandServingMode,
)
# NOTE(review): this duplicates queue_db's pool setup. It is a *separate*
# pool object connecting to the same ADB instance — consider extracting a
# shared helper module.
_pool: oracledb.ConnectionPool | None = None


def _get_pool() -> oracledb.ConnectionPool:
    """Return the module-level connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    pool_args: dict = {
        "user": os.environ["ORACLE_USER"],
        "password": os.environ["ORACLE_PASSWORD"],
        "dsn": os.environ["ORACLE_DSN"],
        "min": 1,
        "max": 5,
        "increment": 1,
    }
    wallet_dir = os.environ.get("ORACLE_WALLET")
    if wallet_dir:
        pool_args["config_dir"] = wallet_dir  # wallet + tnsnames.ora location
    _pool = oracledb.create_pool(**pool_args)
    return _pool
@contextmanager
def _conn() -> Generator[oracledb.Connection, None, None]:
    """Acquire a pooled connection; commit on success, roll back on failure.

    The connection is returned to the pool in all cases.
    """
    pool = _get_pool()
    connection = pool.acquire()
    try:
        yield connection
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        pool.release(connection)
def _to_vector_param(embedding: list[float]) -> array.array:
return array.array("f", embedding)
def _embed_texts(texts: list[str]) -> list[list[float]]:
    """Generate embeddings via OCI GenAI (Cohere Embed v4 by default).

    The OCI embed-text API limits the number of inputs per request (96),
    so long input lists are sent in batches and the results concatenated.

    Args:
        texts: Texts to embed.

    Returns:
        One embedding vector per input text, in input order. Empty input
        returns an empty list without calling the service.
    """
    if not texts:
        return []
    config = oci.config.from_file()
    client = GenerativeAiInferenceClient(
        config,
        service_endpoint=os.environ["OCI_GENAI_ENDPOINT"],
    )
    model_id = os.environ.get("OCI_EMBED_MODEL_ID", "cohere.embed-v4.0")
    compartment_id = os.environ["OCI_COMPARTMENT_ID"]
    embeddings: list[list[float]] = []
    batch_size = 96  # service-side maximum inputs per EmbedTextDetails call
    for i in range(0, len(texts), batch_size):
        details = EmbedTextDetails(
            inputs=texts[i : i + batch_size],
            serving_mode=OnDemandServingMode(model_id=model_id),
            compartment_id=compartment_id,
            input_type="SEARCH_DOCUMENT",
        )
        response = client.embed_text(details)
        embeddings.extend(response.data.embeddings)
    return embeddings
def save_to_vector(doc_id: str, chunks: list[str]) -> list[str]:
    """Embed chunks and insert them into the Oracle vector store.

    Args:
        doc_id: Document identifier (e.g. 'youtube:abc12345').
        chunks: List of text chunks to embed and store.

    Returns:
        List of inserted row UUIDs (empty when ``chunks`` is empty).
    """
    if not chunks:
        return []
    sql = """
        INSERT INTO vector_store (doc_id, chunk_text, embedding)
        VALUES (:doc_id, :chunk_text, :embedding)
        RETURNING id INTO :out_id
    """
    vectors = _embed_texts(chunks)
    new_ids: list[str] = []
    with _conn() as connection:
        cur = connection.cursor()
        for chunk, vector in zip(chunks, vectors):
            id_var = cur.var(oracledb.STRING)
            cur.execute(
                sql,
                {
                    "doc_id": doc_id,
                    "chunk_text": chunk,
                    "embedding": _to_vector_param(vector),
                    "out_id": id_var,
                },
            )
            new_ids.append(id_var.getvalue()[0])
    return new_ids

59
core/web.py Normal file
View File

@@ -0,0 +1,59 @@
"""URL fetching and HTML-to-text extraction."""
import re
from html.parser import HTMLParser
import httpx
class _TextExtractor(HTMLParser):
_SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
def __init__(self) -> None:
super().__init__()
self._buf: list[str] = []
self._skip = 0
def handle_starttag(self, tag: str, attrs: list) -> None:
if tag in self._SKIP_TAGS:
self._skip += 1
def handle_endtag(self, tag: str) -> None:
if tag in self._SKIP_TAGS and self._skip:
self._skip -= 1
def handle_data(self, data: str) -> None:
if not self._skip:
text = data.strip()
if text:
self._buf.append(text)
def get_text(self) -> str:
return " ".join(self._buf)
def _html_to_text(html: str) -> str:
    """Extract visible text from HTML, collapsing runs of 3+ whitespace chars."""
    extractor = _TextExtractor()
    extractor.feed(html)
    text = extractor.get_text()
    return re.sub(r"\s{3,}", " ", text)
# User-Agent sent with every fetch.
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}


def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

    Args:
        url: The URL to fetch.
        max_chars: Maximum characters to return.

    Returns:
        Extracted plain text, or empty string on any failure (best-effort:
        network errors, HTTP error statuses and parse errors are swallowed).
    """
    try:
        response = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
        response.raise_for_status()
        text = _html_to_text(response.text)
        return text[:max_chars]
    except Exception:
        return ""

50
core/youtube.py Normal file
View File

@@ -0,0 +1,50 @@
"""YouTube transcript extraction via youtube-transcript-api."""
import re
from youtube_transcript_api import YouTubeTranscriptApi
def _extract_video_id(url: str) -> str:
"""Extract YouTube video ID from a URL.
Args:
url: YouTube URL (watch?v= or youtu.be/ formats).
Returns:
The video ID string.
Raises:
ValueError: If no video ID can be found in the URL.
"""
match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
if not match:
raise ValueError(f"Cannot extract video ID from URL: {url}")
return match.group(1)
def get_transcript(url: str) -> dict:
    """Fetch transcript text for a YouTube video.

    Args:
        url: YouTube video URL.

    Returns:
        Dict with keys: video_id, title, text, url.
        title falls back to video_id when the transcript object exposes
        no title attribute (it typically does not).

    Raises:
        ValueError: If the video ID cannot be parsed from the URL.
    """
    video_id = _extract_video_id(url)
    # youtube-transcript-api >= 1.0 exposes fetch() as an *instance* method.
    # The previous class-level call (YouTubeTranscriptApi.fetch(video_id))
    # passed video_id as `self` and failed; instantiate the API first.
    fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
    segments = list(fetched)
    text = " ".join(seg.text for seg in segments)
    # Try to get title from fetched transcript metadata; hedge with getattr
    # since FetchedTranscript is not documented to carry a title.
    title = getattr(fetched, "title", None) or video_id
    return {
        "video_id": video_id,
        "title": title,
        "text": text,
        "url": url,
    }