Files
knowledge-inbox/core/chunker.py
joungmin 86a4104ae3 feat: initial knowledge-inbox pipeline implementation
- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
2026-02-28 08:16:11 +09:00

29 lines
717 B
Python

"""Simple sliding-window text chunking."""
def chunk_text(text: str, size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into overlapping, fixed-size chunks using a sliding window.

    Each window starts ``size - overlap`` characters after the previous one,
    so consecutive chunks share ``overlap`` characters. The last chunk may be
    shorter than ``size``.

    Args:
        text: The full text to split.
        size: Maximum characters per chunk.
        overlap: Characters of overlap between consecutive chunks. Must be
            smaller than ``size`` whenever the text needs to be split.

    Returns:
        List of text chunks. Text no longer than ``size`` is returned
        unchanged as a single-item list.

    Raises:
        ValueError: If the text needs splitting but ``overlap >= size``,
            because the window could never advance.
    """
    if len(text) <= size:
        return [text]

    step = size - overlap
    # A non-positive step would keep the window at (or before) the start of
    # the text forever. Checked after the short-text fast path so inputs that
    # never reach the loop keep returning exactly what they did before.
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    chunks: list[str] = []
    for start in range(0, len(text), step):
        end = start + size
        chunks.append(text[start:end])
        # Stop once a window reaches the end of the text; any further window
        # would only repeat characters already present in this chunk.
        if end >= len(text):
            break
    return chunks