feat: initial knowledge-inbox pipeline implementation
- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
This commit is contained in:
28
core/chunker.py
Normal file
28
core/chunker.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""Simple sliding-window text chunking."""
|
||||
|
||||
|
||||
def chunk_text(text: str, size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into overlapping fixed-size chunks.

    Consecutive chunks start ``size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one. The
    final chunk may be shorter than ``size``.

    Args:
        text: The full text to split.
        size: Maximum characters per chunk.
        overlap: Characters of overlap between consecutive chunks.

    Returns:
        List of text chunks. Text no longer than ``size`` (including the
        empty string) is returned unchanged as a single-item list.

    Raises:
        ValueError: If ``text`` is longer than ``size`` and either ``size``
            is not positive or ``overlap`` is not in ``[0, size)``.
    """
    # Short text needs no windowing. This check deliberately comes before
    # argument validation so that calls which never needed a step keep
    # returning exactly what they always did.
    if len(text) <= size:
        return [text]

    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size}")
    # overlap >= size would make the window step zero or negative, so the
    # window would never advance; a negative overlap would make the step
    # larger than the window and silently skip text between chunks.
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap} "
            f"with size={size}"
        )

    step = size - overlap
    # A window starting at ``s`` is only needed while the previous window
    # (which ended at ``s + overlap``) did not already reach the end of the
    # text, i.e. while ``s < len(text) - overlap``.
    return [
        text[start:start + size]
        for start in range(0, len(text) - overlap, step)
    ]
|
||||
Reference in New Issue
Block a user