feat: initial knowledge-inbox pipeline implementation
- Oracle ADB queue table: sql/schema.sql
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
This commit is contained in:
108
daemon/worker.py
Normal file
108
daemon/worker.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""Polling daemon that processes knowledge_queue items."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
from core.chunker import chunk_text
|
||||
from core.enricher import enrich
|
||||
from core.obsidian import save_note
|
||||
from core.queue_db import fetch_pending, set_done, set_error, set_processing
|
||||
from core.vector import save_to_vector
|
||||
from core.web import fetch_page_text
|
||||
from core.youtube import get_transcript
|
||||
|
||||
# Configure root logging once at import time so daemon output carries a
# timestamp, severity, and logger name on every line.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_text(input_type: str, content: str) -> tuple[str, str, str]:
    """Resolve the raw text, source URL, and YouTube title for a queue item.

    Args:
        input_type: One of "youtube", "url", or "text" (anything else is
            treated as raw text).
        content: The URL or raw text stored in the queue row.

    Returns:
        Tuple of (text, url, yt_title); url and yt_title are "" when not
        applicable to the input type.
    """
    if input_type == "youtube":
        result = get_transcript(content)
        return result["text"], content, result["title"]
    if input_type == "url":
        return fetch_page_text(content), content, ""
    # Fallback: the content itself is the text (pasted by the user).
    return content, "", ""


def process_item(item: dict) -> None:
    """Process a single queue item end-to-end.

    Extracts the text for the item, enriches it via the LLM, writes an
    Obsidian note, inserts chunk vectors, and marks the row done. Any
    failure is recorded on the row via set_error so the daemon keeps
    running.

    Args:
        item: Dict from fetch_pending() with keys: id, input_type, content.
    """
    row_id = item["id"]
    input_type = item["input_type"]
    content = item["content"]

    set_processing(row_id)
    logger.info("Processing %s [%s]", row_id[:8], input_type)

    try:
        text, url, yt_title = _extract_text(input_type, content)

        if not text:
            raise ValueError("No text content extracted")

        meta = enrich(input_type, yt_title, url, text)
        # Best available title, in order of preference; row id as last resort.
        title = meta.get("title") or yt_title or url or row_id[:8]

        note_path = save_note(
            content_type=input_type,
            title=title,
            summary=meta.get("summary", ""),
            body=text,
            tags=meta.get("tags", []),
            source_url=url,
            author=meta.get("author") or "",
            date=meta.get("date") or "",
        )
        logger.info("Obsidian note saved: %s", note_path)

        chunks = chunk_text(text)
        doc_id = f"{input_type}:{row_id[:8]}"
        inserted = save_to_vector(doc_id, chunks)
        logger.info("Vector store: inserted %d chunks for doc_id=%s", len(inserted), doc_id)

        set_done(row_id, title, meta)
        logger.info("Done: %s → %s", row_id[:8], title[:60])

    except Exception as exc:
        # Boundary handler: logger.exception captures the traceback, and the
        # error is persisted on the row so a human can inspect it later.
        logger.exception("Error processing %s: %s", row_id[:8], exc)
        set_error(row_id, str(exc))
|
||||
|
||||
|
||||
def run_loop(interval: int = 30) -> None:
    """Poll for pending items indefinitely.

    Args:
        interval: Seconds to sleep between polling cycles. May be overridden
            by the DAEMON_INTERVAL environment variable; a malformed value is
            logged and ignored rather than crashing the daemon.
    """
    raw = os.environ.get("DAEMON_INTERVAL")
    if raw is not None:
        try:
            interval = int(raw)
        except ValueError:
            logger.warning("Invalid DAEMON_INTERVAL=%r; using %ds", raw, interval)
    logger.info("Daemon started (interval=%ds)", interval)

    while True:
        try:
            items = fetch_pending(limit=5)
            if items:
                logger.info("Found %d pending item(s)", len(items))
                for item in items:
                    process_item(item)
            else:
                logger.debug("No pending items")
        except Exception as exc:
            # Keep the daemon alive on transient failures (DB hiccups, etc.);
            # logger.exception records the full traceback.
            logger.exception("Polling error: %s", exc)

        time.sleep(interval)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported lazily so the module itself does not require python-dotenv.
    from dotenv import load_dotenv

    # Load .env before starting so credentials and DAEMON_INTERVAL are
    # visible via os.environ when run_loop() reads them.
    load_dotenv()
    run_loop()  # blocks forever (polling loop)
|
||||
Reference in New Issue
Block a user