89 lines
2.7 KiB
Python
89 lines
2.7 KiB
Python
"""Heading-aware Markdown chunking for RAG pipelines (zero-dependency).
|
|
|
|
``chunk_markdown`` splits a Markdown document (for example, a parser's Markdown
output) into retrieval-sized chunks that preserve heading context. It aims for
the RAG-friendliness of LlamaParse / Unstructured while using only the standard
library.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# ATX-style heading line: one to six "#" characters, whitespace, then the text.
# Group 1 is the "#" run (its length is used as the heading level);
# group 2 is the remainder of the line (the heading text).
_HEADING = re.compile(r"^(#{1,6})\s+(.*)$")
|
|
|
|
|
|
def _slug(text: str) -> str:
|
|
text = (text or "doc").strip().lower()
|
|
text = re.sub(r"[^a-z0-9]+", "-", text).strip("-")
|
|
return text or "doc"
|
|
|
|
|
|
def _split_by_size(text: str, max_chars: int) -> list:
|
|
"""Split text into <= max_chars pieces on paragraph boundaries (hard-split if needed)."""
|
|
if len(text) <= max_chars:
|
|
return [text]
|
|
pieces: list = []
|
|
current = ""
|
|
for para in text.split("\n\n"):
|
|
if len(para) > max_chars:
|
|
if current:
|
|
pieces.append(current)
|
|
current = ""
|
|
for i in range(0, len(para), max_chars):
|
|
pieces.append(para[i:i + max_chars])
|
|
elif not current:
|
|
current = para
|
|
elif len(current) + len(para) + 2 <= max_chars:
|
|
current = f"{current}\n\n{para}"
|
|
else:
|
|
pieces.append(current)
|
|
current = para
|
|
if current:
|
|
pieces.append(current)
|
|
return pieces
|
|
|
|
|
|
def _fence_run(stripped: str) -> str:
    """Return the leading run of 3+ backticks or tildes in *stripped*, else ``""``."""
    for ch in "`~":
        if stripped.startswith(ch * 3):
            return stripped[: len(stripped) - len(stripped.lstrip(ch))]
    return ""


def chunk_markdown(markdown: str, *, max_chars: int = 2000, source: str = "") -> list:
    """Chunk Markdown by heading, size-splitting long sections.

    The document is cut at every ATX heading line (``#`` to ``######``). Each
    section's text starts with its own heading line; text before the first
    heading forms a section with an empty breadcrumb. Sections longer than
    *max_chars* are further split by ``_split_by_size``.

    Lines inside fenced code blocks (three or more backticks or tildes) are
    never treated as headings, so a ``# comment`` in a code sample does not
    start a new section. An unclosed fence extends to the end of the document.

    Args:
        markdown: The Markdown text. ``\\r\\n`` and bare ``\\r`` line endings
            are normalised to ``\\n``.
        max_chars: Maximum characters per chunk; must be >= 1.
        source: Optional source label, copied into every chunk and slugified
            to form the chunk id prefix.

    Returns:
        ``[{id, index, heading, text, chars, source}, ...]`` where ``heading``
        is the ``H1 > H2 > H3`` breadcrumb for the chunk, ``index`` is the
        chunk's position in the list and ``id`` is ``"<slug>-<index>"``.

    Raises:
        ValueError: If *max_chars* is less than 1.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")

    lines = markdown.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    chunks: list[dict] = []
    stack: list[tuple[int, str]] = []  # (level, text) heading breadcrumb
    buf: list[str] = []
    base = _slug(source)
    fence = ""  # the opening fence run while inside a fenced code block

    def flush() -> None:
        """Emit the buffered section (if any) under the current breadcrumb."""
        text = "\n".join(buf).strip()
        buf.clear()
        if not text:
            return
        head = " > ".join(title for _, title in stack)
        for piece in _split_by_size(text, max_chars):
            idx = len(chunks)
            chunks.append({
                "id": f"{base}-{idx}",
                "index": idx,
                "heading": head,
                "text": piece,
                "chars": len(piece),
                "source": source,
            })

    for line in lines:
        stripped = line.strip()
        run = _fence_run(stripped)

        if fence:
            # Inside a code block: only a bare fence of the same character and
            # at least the opening length closes it; everything is body text.
            if run and run[0] == fence[0] and len(run) >= len(fence) and stripped == run:
                fence = ""
            buf.append(line)
            continue

        # A backtick fence whose info string contains a backtick is inline
        # code (e.g. ```x```), not a fence opener.
        if run and not (run[0] == "`" and "`" in stripped[len(run):]):
            fence = run
            buf.append(line)
            continue

        match = _HEADING.match(stripped)
        if match:
            flush()  # close the previous section under its own breadcrumb
            level = len(match.group(1))
            # Drop the optional closing "#" sequence ("## Title ##").
            title = re.sub(r"(?:^|\s+)#+$", "", match.group(2))
            while stack and stack[-1][0] >= level:
                stack.pop()
            stack.append((level, title))
        buf.append(line)

    flush()
    return chunks
|