feat: initial knowledge-inbox pipeline implementation
- Oracle ADB queue table (sql/schema.sql) - Queue CRUD: core/queue_db.py - YouTube transcript: core/youtube.py - Web page fetch: core/web.py - LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py - Text chunker: core/chunker.py - Obsidian note writer: core/obsidian.py - Oracle vector store insertion: core/vector.py - Polling daemon: daemon/worker.py - Telegram bot: bot/telegram_bot.py - Main runner: main.py
This commit is contained in:
20
.env.example
Normal file
20
.env.example
Normal file
@@ -0,0 +1,20 @@
|
||||
# Telegram
|
||||
TELEGRAM_BOT_TOKEN=
|
||||
|
||||
# Oracle ADB (queue + vector store shared)
|
||||
ORACLE_USER=admin
|
||||
ORACLE_PASSWORD=
|
||||
ORACLE_DSN=h8i4i0g8cxtd2lpf_high
|
||||
ORACLE_WALLET=/Users/joungmin/devkit/db_conn/Wallet_H8I4I0G8CXTD2LPF
|
||||
|
||||
# OCI GenAI
|
||||
OCI_COMPARTMENT_ID=
|
||||
OCI_GENAI_ENDPOINT=https://inference.generativeai.us-ashburn-1.oci.oraclecloud.com
|
||||
OCI_EMBED_MODEL_ID=cohere.embed-v4.0
|
||||
OCI_CHAT_MODEL_ID=ocid1.generativeaimodel.oc1.iad.amaaaaaask7dceyaeo4ehrn25guuats5s45hnvswlhxo6riop275l2bkr2vq
|
||||
|
||||
# Obsidian
|
||||
OBSIDIAN_VAULT=/Users/joungmin/Documents/Obsidian Vault
|
||||
|
||||
# Daemon
|
||||
DAEMON_INTERVAL=30
|
||||
8
.gitignore
vendored
Normal file
8
.gitignore
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
.env
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
.venv/
|
||||
dist/
|
||||
*.egg-info/
|
||||
.DS_Store
|
||||
0
bot/__init__.py
Normal file
0
bot/__init__.py
Normal file
108
bot/telegram_bot.py
Normal file
108
bot/telegram_bot.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""Telegram bot for receiving knowledge inbox items."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
from telegram import Update
|
||||
from telegram.ext import Application, CommandHandler, ContextTypes, MessageHandler, filters
|
||||
|
||||
from core.queue_db import get_status_counts, insert_item
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def detect_type(text: str) -> str:
    """Classify a user message as a YouTube link, generic URL, or plain text.

    Args:
        text: Raw message text from user.

    Returns:
        One of 'youtube', 'url', 'text'.
    """
    stripped = text.strip()
    # YouTube is checked first: a YouTube link is also a URL, and the
    # pattern may appear anywhere in the message, not only at the start.
    if re.search(r"youtube\.com/watch|youtu\.be/", stripped):
        return "youtube"
    looks_like_link = stripped.startswith(("http://", "https://"))
    return "url" if looks_like_link else "text"
|
||||
|
||||
|
||||
async def cmd_start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle /start command by sending the usage instructions."""
    welcome = (
        "📚 *Knowledge Inbox Bot*\n\n"
        "다음을 전송하면 자동으로 처리하여 Obsidian에 저장합니다:\n\n"
        "• *YouTube URL* — 트랜스크립트 추출 후 요약\n"
        "• *웹 URL* — 페이지 내용 추출 후 요약\n"
        "• *자유 텍스트* — 그대로 저장 후 태그 추출\n\n"
        "/status — 처리 현황 조회"
    )
    await update.message.reply_text(welcome, parse_mode="Markdown")
|
||||
|
||||
|
||||
async def cmd_status(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle /status command: report per-status queue counts to the user."""
    try:
        counts = get_status_counts()
    except Exception as exc:
        # Status reporting is best-effort; never let a DB error crash the bot.
        logger.error("Status query failed: %s", exc)
        msg = "❌ 상태 조회에 실패했습니다."
    else:
        msg = (
            "📊 *처리 현황*\n\n"
            f"⏳ 대기중: {counts.get('pending', 0)}\n"
            f"🔄 처리중: {counts.get('processing', 0)}\n"
            f"✅ 완료: {counts.get('done', 0)}\n"
            f"❌ 오류: {counts.get('error', 0)}"
        )
    await update.message.reply_text(msg, parse_mode="Markdown")
|
||||
|
||||
|
||||
async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle all non-command messages: classify and enqueue the content."""
    raw = update.message.text or ""
    chat_id = str(update.effective_chat.id)

    # Ignore empty / whitespace-only messages entirely.
    if not raw.strip():
        return

    input_type = detect_type(raw)
    type_labels = {"youtube": "YouTube", "url": "웹페이지", "text": "텍스트"}

    try:
        row_id = insert_item(input_type, raw.strip(), chat_id)
        label = type_labels[input_type]
        confirmation = (
            f"📥 *{label}*이 큐에 추가됐습니다.\n"
            f"ID: `{row_id[:8]}`\n\n"
            "처리 완료 후 Obsidian에 저장됩니다."
        )
        await update.message.reply_text(confirmation, parse_mode="Markdown")
    except Exception as exc:
        logger.error("insert_item failed: %s", exc)
        await update.message.reply_text("❌ 저장에 실패했습니다. 잠시 후 다시 시도해주세요.")
|
||||
|
||||
|
||||
def build_app() -> Application:
    """Build and configure the Telegram Application.

    Returns:
        Configured Application instance ready to run.
    """
    application = Application.builder().token(os.environ["TELEGRAM_BOT_TOKEN"]).build()
    # Command handlers first, then the catch-all text handler.
    handlers = [
        CommandHandler("start", cmd_start),
        CommandHandler("status", cmd_status),
        MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message),
    ]
    for handler in handlers:
        application.add_handler(handler)
    return application


if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv()
    build_app().run_polling()
|
||||
0
core/__init__.py
Normal file
0
core/__init__.py
Normal file
28
core/chunker.py
Normal file
28
core/chunker.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""Simple sliding-window text chunking."""
|
||||
|
||||
|
||||
def chunk_text(text: str, size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into overlapping chunks.

    Args:
        text: The full text to split.
        size: Maximum characters per chunk. Must be positive.
        overlap: Characters of overlap between consecutive chunks.
            Must be strictly smaller than size.

    Returns:
        List of text chunks. Returns single-item list for short text.

    Raises:
        ValueError: If size is not positive or overlap >= size. Without
            this guard the window step would be <= 0 and the sliding
            window below would loop forever.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if overlap >= size:
        raise ValueError(f"overlap ({overlap}) must be smaller than size ({size})")

    if len(text) <= size:
        return [text]

    step = size - overlap  # guaranteed > 0 by the checks above
    chunks: list[str] = []
    for start in range(0, len(text), step):
        chunks.append(text[start : start + size])
        # Stop once a chunk reaches the end; avoids a trailing duplicate tail.
        if start + size >= len(text):
            break
    return chunks
|
||||
96
core/enricher.py
Normal file
96
core/enricher.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""LLM-based content enrichment via OCI GenAI Gemini Flash."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import oci
|
||||
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
||||
from oci.generative_ai_inference.models import (
|
||||
ChatDetails,
|
||||
GenericChatRequest,
|
||||
OnDemandServingMode,
|
||||
TextContent,
|
||||
UserMessage,
|
||||
)
|
||||
|
||||
_PROMPT = """\
|
||||
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
|
||||
- "title": concise descriptive title for this content (string)
|
||||
- "summary": 3-5 sentence summary capturing key insights (string)
|
||||
- "tags": list of 3-7 relevant keywords or topics (string[])
|
||||
- "author": author or creator name, or null if not found (string | null)
|
||||
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
|
||||
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
|
||||
|
||||
Content type: {content_type}
|
||||
Source URL: {url}
|
||||
Content:
|
||||
{text}
|
||||
|
||||
Return only the JSON object, no markdown, no explanation."""
|
||||
|
||||
|
||||
def _get_client() -> GenerativeAiInferenceClient:
    """Create a GenAI inference client from the default OCI config file."""
    cfg = oci.config.from_file()
    endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(cfg, service_endpoint=endpoint)
|
||||
|
||||
|
||||
def enrich(content_type: str, title: str, url: str, text: str) -> dict:
    """Extract structured metadata from content using Gemini Flash.

    Args:
        content_type: One of 'youtube', 'url', 'text'.
        title: Initial title hint (may be empty).
        url: Source URL (empty for plain text).
        text: The full content text to analyze.

    Returns:
        Dict with keys: title, summary, tags, author, date, content_type.
        Falls back to minimal defaults on LLM failure (with an '_error' key).
    """
    prompt = _PROMPT.format(
        content_type=content_type,
        url=url or "(none)",
        text=text[:6000],  # cap input so the request fits the model context
    )

    fallback_title = title or url or text[:80]

    try:
        chat_request = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=1024,
            temperature=0,
        )
        details = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=chat_request,
        )
        response = _get_client().chat(details)
        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
        # The model sometimes wraps the JSON in a ``` / ```json fence.
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
        metadata = json.loads(raw)
    except Exception as exc:
        # Any failure (client, API, JSON parse) degrades to a minimal record.
        metadata = {
            "title": fallback_title,
            "summary": text[:300],
            "tags": [],
            "author": None,
            "date": None,
            "content_type": content_type,
            "_error": str(exc),
        }

    # Guarantee every documented key even if the model omitted some.
    defaults = {
        "title": fallback_title,
        "summary": "",
        "tags": [],
        "author": None,
        "date": None,
        "content_type": content_type,
    }
    for key, value in defaults.items():
        metadata.setdefault(key, value)

    return metadata
|
||||
86
core/obsidian.py
Normal file
86
core/obsidian.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Save processed knowledge items as Obsidian markdown notes."""
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _slugify(text: str, max_len: int = 50) -> str:
|
||||
"""Convert text to a filesystem-safe slug."""
|
||||
text = re.sub(r"[^\w\s-]", "", text, flags=re.UNICODE)
|
||||
text = re.sub(r"[\s_]+", "-", text).strip("-")
|
||||
return text[:max_len].lower()
|
||||
|
||||
|
||||
def save_note(
    content_type: str,
    title: str,
    summary: str,
    body: str,
    tags: list[str],
    source_url: str = "",
    author: str = "",
    date: str = "",
) -> Path:
    """Save a processed knowledge item as an Obsidian markdown file.

    Args:
        content_type: One of 'youtube', 'url', 'text'.
        title: The note title.
        summary: LLM-generated summary.
        body: Full content text.
        tags: List of topic tags.
        source_url: Original URL (empty for plain text).
        author: Author name (may be empty).
        date: Publication date in ISO 8601 format (may be empty).

    Returns:
        Path of the created markdown file.
    """
    vault = os.environ.get("OBSIDIAN_VAULT", "/Users/joungmin/Documents/Obsidian Vault")
    today = datetime.now().strftime("%Y-%m-%d")
    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    slug = _slugify(title) or "untitled"

    # Determine subfolder by content type
    subfolder_map = {
        "youtube": "20 Sources/YouTube",
        "url": "20 Sources/Web",
        "text": "20 Sources/Notes",
    }
    subfolder = subfolder_map.get(content_type, "20 Sources/Notes")
    note_dir = Path(vault) / subfolder
    note_dir.mkdir(parents=True, exist_ok=True)

    note_path = note_dir / f"{today}-{slug}.md"
    # Fix: two items with the same date and slug previously overwrote each
    # other silently. Append a numeric suffix until the name is unique.
    counter = 1
    while note_path.exists():
        note_path = note_dir / f"{today}-{slug}-{counter}.md"
        counter += 1

    # Build YAML frontmatter tags
    tags_yaml = ", ".join(tags) if tags else ""

    content = f"""---
title: {title}
source_type: {content_type}
url: {source_url}
author: {author}
date: {date}
tags: [{tags_yaml}]
created: {today}
---

# {title}

## 요약
{summary}

## 원문
{body}

---
*Source: {source_url}*
*Saved: {now_str}*
"""

    note_path.write_text(content, encoding="utf-8")
    return note_path
|
||||
185
core/queue_db.py
Normal file
185
core/queue_db.py
Normal file
@@ -0,0 +1,185 @@
|
||||
"""Oracle ADB connection pool and CRUD operations for knowledge_queue."""
|
||||
|
||||
import json
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from typing import Generator
|
||||
|
||||
import oracledb
|
||||
|
||||
_pool: oracledb.ConnectionPool | None = None
|
||||
|
||||
|
||||
def _get_pool() -> oracledb.ConnectionPool:
    """Return (or lazily create) the module-level connection pool."""
    global _pool
    if _pool is not None:
        return _pool

    params: dict = {
        "user": os.environ["ORACLE_USER"],
        "password": os.environ["ORACLE_PASSWORD"],
        "dsn": os.environ["ORACLE_DSN"],
        "min": 1,
        "max": 5,
        "increment": 1,
    }
    wallet_dir = os.environ.get("ORACLE_WALLET")
    if wallet_dir:
        # tnsnames.ora + cwallet.sso live here
        params["config_dir"] = wallet_dir

    _pool = oracledb.create_pool(**params)
    return _pool
|
||||
|
||||
|
||||
@contextmanager
def _conn() -> Generator[oracledb.Connection, None, None]:
    """Yield a pooled connection; commit on success, roll back on error.

    The connection is always returned to the pool, even when the caller
    raises.
    """
    pool = _get_pool()
    connection = pool.acquire()
    try:
        yield connection
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        pool.release(connection)
|
||||
|
||||
|
||||
def insert_item(input_type: str, content: str, chat_id: str = "") -> str:
    """Insert a new queue item and return its generated UUID.

    Args:
        input_type: One of 'youtube', 'url', 'text'.
        content: The URL or raw text to process.
        chat_id: Telegram chat ID for future notification support.

    Returns:
        The UUID of the newly inserted row.
    """
    sql = """
        INSERT INTO knowledge_queue (input_type, content, telegram_chat_id)
        VALUES (:input_type, :content, :chat_id)
        RETURNING id INTO :out_id
    """
    with _conn() as conn:
        cur = conn.cursor()
        # Out-bind variable receives the DB-generated id from RETURNING.
        id_var = cur.var(oracledb.STRING)
        binds = {
            "input_type": input_type,
            "content": content,
            "chat_id": chat_id,
            "out_id": id_var,
        }
        cur.execute(sql, binds)
        return id_var.getvalue()[0]
|
||||
|
||||
|
||||
def fetch_pending(limit: int = 5) -> list[dict]:
    """Fetch oldest pending items up to limit.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        List of dicts with keys: id, input_type, content, telegram_chat_id.
    """
    sql = """
        SELECT id, input_type, content, telegram_chat_id
        FROM knowledge_queue
        WHERE status = 'pending'
        ORDER BY created_at
        FETCH FIRST :n ROWS ONLY
    """
    items: list[dict] = []
    with _conn() as conn:
        cursor = conn.cursor()
        cursor.execute(sql, {"n": limit})
        for row_id, input_type, content, chat_id in cursor.fetchall():
            # Fix: CLOB locators are tied to their connection. Previously the
            # LOB.read() happened after the `with` block released the
            # connection back to the pool, which can fail. Materialize the
            # CLOB to str while the connection is still held.
            if hasattr(content, "read"):
                content = content.read()
            items.append(
                {
                    "id": row_id,
                    "input_type": input_type,
                    "content": content,
                    "telegram_chat_id": chat_id,
                }
            )
    return items
|
||||
|
||||
|
||||
def set_processing(row_id: str) -> None:
    """Mark a queue item as processing.

    Args:
        row_id: The UUID of the row to update.
    """
    sql = """
        UPDATE knowledge_queue
        SET status = 'processing', updated_at = SYSTIMESTAMP
        WHERE id = :id
    """
    with _conn() as conn:
        cursor = conn.cursor()
        cursor.execute(sql, {"id": row_id})
|
||||
|
||||
|
||||
def set_done(row_id: str, title: str, metadata: dict) -> None:
    """Mark a queue item as done with extracted metadata.

    Args:
        row_id: The UUID of the row to update.
        title: LLM-extracted title.
        metadata: Dict of enrichment results to store as JSON.
    """
    sql = """
        UPDATE knowledge_queue
        SET status = 'done',
            title = :title,
            metadata_json = :meta_json,
            updated_at = SYSTIMESTAMP
        WHERE id = :id
    """
    # Column is VARCHAR2(500); truncate defensively before binding.
    safe_title = title[:500] if title else ""
    payload = json.dumps(metadata, ensure_ascii=False)
    with _conn() as conn:
        cursor = conn.cursor()
        cursor.execute(sql, {"id": row_id, "title": safe_title, "meta_json": payload})
|
||||
|
||||
|
||||
def set_error(row_id: str, error_msg: str) -> None:
    """Mark a queue item as error with a message.

    Args:
        row_id: The UUID of the row to update.
        error_msg: Description of the error.
    """
    sql = """
        UPDATE knowledge_queue
        SET status = 'error', error_msg = :error_msg, updated_at = SYSTIMESTAMP
        WHERE id = :id
    """
    binds = {"id": row_id, "error_msg": error_msg}
    with _conn() as conn:
        conn.cursor().execute(sql, binds)
|
||||
|
||||
|
||||
def get_status_counts() -> dict:
    """Return count of rows per status.

    Returns:
        Dict like {'pending': 3, 'processing': 1, 'done': 42, 'error': 0}.
    """
    sql = """
        SELECT status, COUNT(*) FROM knowledge_queue GROUP BY status
    """
    # Pre-seed the known statuses so absent ones report 0.
    counts = {"pending": 0, "processing": 0, "done": 0, "error": 0}
    with _conn() as conn:
        cursor = conn.cursor()
        cursor.execute(sql)
        for status, how_many in cursor.fetchall():
            counts[status] = how_many
    return counts
|
||||
114
core/vector.py
Normal file
114
core/vector.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""Embedding generation and Oracle vector store insertion."""
|
||||
|
||||
import array
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from typing import Generator
|
||||
|
||||
import oci
|
||||
import oracledb
|
||||
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
||||
from oci.generative_ai_inference.models import (
|
||||
EmbedTextDetails,
|
||||
OnDemandServingMode,
|
||||
)
|
||||
|
||||
# Reuse same pool as queue_db but connect to same ADB instance
|
||||
_pool: oracledb.ConnectionPool | None = None
|
||||
|
||||
|
||||
def _get_pool() -> oracledb.ConnectionPool:
    """Return (or lazily create) the module-level connection pool."""
    global _pool
    if _pool is not None:
        return _pool

    pool_kwargs: dict = {
        "user": os.environ["ORACLE_USER"],
        "password": os.environ["ORACLE_PASSWORD"],
        "dsn": os.environ["ORACLE_DSN"],
        "min": 1,
        "max": 5,
        "increment": 1,
    }
    wallet = os.environ.get("ORACLE_WALLET")
    if wallet:
        # Wallet directory holds tnsnames.ora and cwallet.sso.
        pool_kwargs["config_dir"] = wallet

    _pool = oracledb.create_pool(**pool_kwargs)
    return _pool
|
||||
|
||||
|
||||
@contextmanager
def _conn() -> Generator[oracledb.Connection, None, None]:
    """Yield a pooled connection; commit on success, roll back on failure."""
    pool = _get_pool()
    connection = pool.acquire()
    try:
        yield connection
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        # Always hand the connection back to the pool.
        pool.release(connection)
|
||||
|
||||
|
||||
def _to_vector_param(embedding: list[float]) -> array.array:
|
||||
return array.array("f", embedding)
|
||||
|
||||
|
||||
def _embed_texts(texts: list[str]) -> list[list[float]]:
    """Generate embeddings using Cohere Embed v4 via OCI GenAI."""
    client = GenerativeAiInferenceClient(
        oci.config.from_file(),
        service_endpoint=os.environ["OCI_GENAI_ENDPOINT"],
    )
    details = EmbedTextDetails(
        inputs=texts,
        serving_mode=OnDemandServingMode(
            model_id=os.environ.get("OCI_EMBED_MODEL_ID", "cohere.embed-v4.0")
        ),
        compartment_id=os.environ["OCI_COMPARTMENT_ID"],
        # Document-side embeddings (queries would use SEARCH_QUERY).
        input_type="SEARCH_DOCUMENT",
    )
    return client.embed_text(details).data.embeddings
|
||||
|
||||
|
||||
def save_to_vector(doc_id: str, chunks: list[str]) -> list[str]:
    """Embed chunks and insert them into the Oracle vector store.

    Args:
        doc_id: Document identifier (e.g. 'youtube:abc12345').
        chunks: List of text chunks to embed and store.

    Returns:
        List of inserted row UUIDs.
    """
    if not chunks:
        return []

    vectors = _embed_texts(chunks)

    sql = """
        INSERT INTO vector_store (doc_id, chunk_text, embedding)
        VALUES (:doc_id, :chunk_text, :embedding)
        RETURNING id INTO :out_id
    """
    new_ids: list[str] = []
    with _conn() as conn:
        cursor = conn.cursor()
        for piece, vec in zip(chunks, vectors):
            returned = cursor.var(oracledb.STRING)
            cursor.execute(
                sql,
                {
                    "doc_id": doc_id,
                    "chunk_text": piece,
                    "embedding": _to_vector_param(vec),
                    "out_id": returned,
                },
            )
            new_ids.append(returned.getvalue()[0])

    return new_ids
|
||||
59
core/web.py
Normal file
59
core/web.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""URL fetching and HTML-to-text extraction."""
|
||||
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
class _TextExtractor(HTMLParser):
|
||||
_SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._buf: list[str] = []
|
||||
self._skip = 0
|
||||
|
||||
def handle_starttag(self, tag: str, attrs: list) -> None:
|
||||
if tag in self._SKIP_TAGS:
|
||||
self._skip += 1
|
||||
|
||||
def handle_endtag(self, tag: str) -> None:
|
||||
if tag in self._SKIP_TAGS and self._skip:
|
||||
self._skip -= 1
|
||||
|
||||
def handle_data(self, data: str) -> None:
|
||||
if not self._skip:
|
||||
text = data.strip()
|
||||
if text:
|
||||
self._buf.append(text)
|
||||
|
||||
def get_text(self) -> str:
|
||||
return " ".join(self._buf)
|
||||
|
||||
|
||||
def _html_to_text(html: str) -> str:
    """Strip tags from HTML and collapse long whitespace runs to one space."""
    extractor = _TextExtractor()
    extractor.feed(html)
    joined = extractor.get_text()
    return re.sub(r"\s{3,}", " ", joined)
|
||||
|
||||
|
||||
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
|
||||
|
||||
|
||||
def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

    Args:
        url: The URL to fetch.
        max_chars: Maximum characters to return.

    Returns:
        Extracted plain text, or empty string on failure.
    """
    try:
        response = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
        response.raise_for_status()
        extracted = _html_to_text(response.text)
        return extracted[:max_chars]
    except Exception:
        # Deliberate best-effort contract: any fetch/parse failure yields "".
        return ""
|
||||
50
core/youtube.py
Normal file
50
core/youtube.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""YouTube transcript extraction via youtube-transcript-api."""
|
||||
|
||||
import re
|
||||
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
|
||||
def _extract_video_id(url: str) -> str:
|
||||
"""Extract YouTube video ID from a URL.
|
||||
|
||||
Args:
|
||||
url: YouTube URL (watch?v= or youtu.be/ formats).
|
||||
|
||||
Returns:
|
||||
The video ID string.
|
||||
|
||||
Raises:
|
||||
ValueError: If no video ID can be found in the URL.
|
||||
"""
|
||||
match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
|
||||
if not match:
|
||||
raise ValueError(f"Cannot extract video ID from URL: {url}")
|
||||
return match.group(1)
|
||||
|
||||
|
||||
def get_transcript(url: str) -> dict:
    """Fetch transcript text for a YouTube video.

    Args:
        url: YouTube video URL.

    Returns:
        Dict with keys: video_id, title, text, url.
        title falls back to video_id if unavailable.
    """
    video_id = _extract_video_id(url)

    # Fix: in youtube-transcript-api >= 1.0, fetch() is an *instance* method.
    # The previous class-level call passed video_id as `self` and failed.
    api = YouTubeTranscriptApi()
    fetched = api.fetch(video_id, languages=["ko", "en"])
    segments = list(fetched)
    text = " ".join(seg.text for seg in segments)

    # FetchedTranscript may not expose a title; fall back to the video id.
    title = getattr(fetched, "title", None) or video_id

    return {
        "video_id": video_id,
        "title": title,
        "text": text,
        "url": url,
    }
|
||||
0
daemon/__init__.py
Normal file
0
daemon/__init__.py
Normal file
108
daemon/worker.py
Normal file
108
daemon/worker.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""Polling daemon that processes knowledge_queue items."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
from core.chunker import chunk_text
|
||||
from core.enricher import enrich
|
||||
from core.obsidian import save_note
|
||||
from core.queue_db import fetch_pending, set_done, set_error, set_processing
|
||||
from core.vector import save_to_vector
|
||||
from core.web import fetch_page_text
|
||||
from core.youtube import get_transcript
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def process_item(item: dict) -> None:
    """Process a single queue item end-to-end.

    Pipeline: extract text → LLM enrichment → Obsidian note → vector store
    → mark done. Any failure marks the item as error instead.

    Args:
        item: Dict from fetch_pending() with keys: id, input_type, content.
    """
    row_id = item["id"]
    input_type = item["input_type"]
    content = item["content"]

    set_processing(row_id)
    logger.info("Processing %s [%s]", row_id[:8], input_type)

    try:
        source_url = ""
        hint_title = ""

        if input_type == "youtube":
            transcript = get_transcript(content)
            text = transcript["text"]
            source_url = content
            hint_title = transcript["title"]
        elif input_type == "url":
            text = fetch_page_text(content)
            source_url = content
        else:  # plain text input
            text = content

        if not text:
            raise ValueError("No text content extracted")

        meta = enrich(input_type, hint_title, source_url, text)
        # Title preference: LLM result, then YouTube title, URL, row id.
        title = meta.get("title") or hint_title or source_url or row_id[:8]

        note_path = save_note(
            content_type=input_type,
            title=title,
            summary=meta.get("summary", ""),
            body=text,
            tags=meta.get("tags", []),
            source_url=source_url,
            author=meta.get("author") or "",
            date=meta.get("date") or "",
        )
        logger.info("Obsidian note saved: %s", note_path)

        doc_id = f"{input_type}:{row_id[:8]}"
        inserted = save_to_vector(doc_id, chunk_text(text))
        logger.info("Vector store: inserted %d chunks for doc_id=%s", len(inserted), doc_id)

        set_done(row_id, title, meta)
        logger.info("Done: %s → %s", row_id[:8], title[:60])

    except Exception as exc:
        logger.error("Error processing %s: %s", row_id[:8], exc, exc_info=True)
        set_error(row_id, str(exc))
|
||||
|
||||
|
||||
def run_loop(interval: int = 30) -> None:
    """Poll for pending items indefinitely.

    Args:
        interval: Seconds to sleep between polling cycles. The
            DAEMON_INTERVAL environment variable overrides this when set.
    """
    interval = int(os.environ.get("DAEMON_INTERVAL", interval))
    logger.info("Daemon started (interval=%ds)", interval)

    while True:
        try:
            batch = fetch_pending(limit=5)
            if batch:
                logger.info("Found %d pending item(s)", len(batch))
                for queued in batch:
                    process_item(queued)
            else:
                logger.debug("No pending items")
        except Exception as exc:
            # Never let a transient failure (DB outage etc.) kill the daemon.
            logger.error("Polling error: %s", exc, exc_info=True)

        time.sleep(interval)


if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv()
    run_loop()
|
||||
23
main.py
Normal file
23
main.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""Main entry point: starts daemon thread + Telegram bot."""
|
||||
|
||||
import threading
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
from bot.telegram_bot import build_app
|
||||
from daemon.worker import run_loop
|
||||
|
||||
|
||||
def main() -> None:
    """Start the daemon in a background thread, then run the bot (blocking)."""
    # daemon=True so the worker thread dies with the main (bot) thread.
    worker = threading.Thread(target=run_loop, args=(30,), daemon=True)
    worker.start()

    build_app().run_polling()


if __name__ == "__main__":
    main()
|
||||
19
pyproject.toml
Normal file
19
pyproject.toml
Normal file
@@ -0,0 +1,19 @@
|
||||
[project]
|
||||
name = "knowledge-inbox"
|
||||
version = "0.1.0"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"python-telegram-bot>=21.0",
|
||||
"youtube-transcript-api>=0.6",
|
||||
"httpx>=0.27",
|
||||
"oracledb>=2.0",
|
||||
"oci>=2.100",
|
||||
"python-dotenv>=1.0",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["bot", "core", "daemon"]
|
||||
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
python-telegram-bot>=21.0
|
||||
youtube-transcript-api>=0.6
|
||||
httpx>=0.27
|
||||
oracledb>=2.0
|
||||
oci>=2.100
|
||||
python-dotenv>=1.0
|
||||
16
sql/schema.sql
Normal file
16
sql/schema.sql
Normal file
@@ -0,0 +1,16 @@
|
||||
-- knowledge_queue table for processing pipeline
|
||||
CREATE TABLE knowledge_queue (
|
||||
id VARCHAR2(36) DEFAULT SYS_GUID() PRIMARY KEY,
|
||||
input_type VARCHAR2(20) NOT NULL, -- 'youtube' | 'url' | 'text'
|
||||
content CLOB NOT NULL, -- URL or raw text
|
||||
status VARCHAR2(20) DEFAULT 'pending' NOT NULL,
|
||||
-- pending | processing | done | error
|
||||
title VARCHAR2(500), -- LLM-extracted title (after processing)
|
||||
error_msg CLOB,
|
||||
metadata_json CLOB, -- JSON: summary, tags, author, etc.
|
||||
telegram_chat_id VARCHAR2(50), -- for future notification support
|
||||
created_at TIMESTAMP DEFAULT SYSTIMESTAMP NOT NULL,
|
||||
updated_at TIMESTAMP DEFAULT SYSTIMESTAMP NOT NULL
|
||||
);
|
||||
|
||||
CREATE INDEX idx_kq_status ON knowledge_queue (status, created_at);
|
||||
Reference in New Issue
Block a user