- Oracle ADB queue table (sql/schema.sql) - Queue CRUD: core/queue_db.py - YouTube transcript: core/youtube.py - Web page fetch: core/web.py - LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py - Text chunker: core/chunker.py - Obsidian note writer: core/obsidian.py - Oracle vector store insertion: core/vector.py - Polling daemon: daemon/worker.py - Telegram bot: bot/telegram_bot.py - Main runner: main.py
29 lines
717 B
Python
29 lines
717 B
Python
"""Simple sliding-window text chunking."""
|
|
|
|
|
|
def chunk_text(text: str, size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into overlapping, fixed-size character windows.

    Consecutive chunks start ``size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The full text to split.
        size: Maximum characters per chunk.
        overlap: Characters of overlap between consecutive chunks.

    Returns:
        List of text chunks. Text no longer than ``size`` is returned
        unchanged as a single-item list. The final chunk may be shorter
        than ``size``.

    Raises:
        ValueError: If the text needs splitting and ``size`` is not
            positive, or ``overlap`` is negative or not smaller than
            ``size``.
    """
    text_len = len(text)
    if text_len <= size:
        return [text]

    # Validate only once chunking is actually required, so short inputs
    # keep returning ``[text]`` exactly as before.
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if overlap < 0:
        # A negative overlap would leave gaps and silently drop characters.
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    if overlap >= size:
        # The window would never advance, so the loop could not terminate.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    step = size - overlap
    chunks: list[str] = []
    for start in range(0, text_len, step):
        end = start + size
        chunks.append(text[start:end])
        if end >= text_len:
            # This window already reached the end of the text; a further
            # chunk would contain only characters that were emitted above.
            break

    return chunks
|