# qwen_agent/skills/developing/mineru/scripts/chunking.py

"""Heading-aware Markdown chunking for RAG pipelines (zero-dependency).

``chunk_markdown`` splits a parsed Markdown document into retrieval-sized chunks
that preserve heading context — matching the RAG-friendliness of LlamaParse /
Unstructured without any dependency.
"""

from __future__ import annotations

import re

# ATX heading pattern: group 1 is the run of 1-6 "#" marks (its length is the
# heading level); group 2 is the heading text after the mandatory whitespace.
_HEADING = re.compile(r"^(#{1,6})\s+(.*)$")


def _slug(text: str) -> str:
    text = (text or "doc").strip().lower()
    text = re.sub(r"[^a-z0-9]+", "-", text).strip("-")
    return text or "doc"


def _split_by_size(text: str, max_chars: int) -> list:
    """Split text into <= max_chars pieces on paragraph boundaries (hard-split if needed)."""
    if len(text) <= max_chars:
        return [text]
    pieces: list = []
    current = ""
    for para in text.split("\n\n"):
        if len(para) > max_chars:
            if current:
                pieces.append(current)
                current = ""
            for i in range(0, len(para), max_chars):
                pieces.append(para[i:i + max_chars])
        elif not current:
            current = para
        elif len(current) + len(para) + 2 <= max_chars:
            current = f"{current}\n\n{para}"
        else:
            pieces.append(current)
            current = para
    if current:
        pieces.append(current)
    return pieces


def chunk_markdown(markdown: str, *, max_chars: int = 2000, source: str = "") -> list[dict]:
    """Chunk Markdown by heading, size-splitting long sections.

    A new section starts at every ATX heading line (``#`` to ``######``); the
    heading line itself is kept as the first line of its section's text. Lines
    inside fenced code blocks (delimited by three or more backticks or tildes)
    are never treated as headings, so ``# comment`` lines in code samples stay
    with the surrounding section. An unclosed fence extends to the end of the
    document. Sections longer than *max_chars* are further split by
    ``_split_by_size``.

    Args:
        markdown: The Markdown document text.
        max_chars: Maximum number of characters per chunk; must be >= 1.
        source: Label stored on every chunk and slugified into the chunk ids.

    Returns:
        ``[{id, index, heading, text, chars, source}, ...]`` where ``heading``
        is the ``H1 > H2 > H3`` breadcrumb for the chunk (empty for content
        that precedes the first heading) and ``id`` is ``<source-slug>-<index>``.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    # Fail fast: a non-positive limit would otherwise crash deep inside the
    # splitter or silently drop oversized paragraphs.
    if max_chars < 1:
        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")

    lines = markdown.replace("\r\n", "\n").split("\n")
    chunks: list[dict] = []
    stack: list[tuple[int, str]] = []  # (level, text) heading breadcrumb
    buf: list[str] = []
    base = _slug(source)

    def breadcrumb() -> str:
        return " > ".join(title for _, title in stack)

    def flush() -> None:
        """Emit the buffered section as one or more chunks and reset the buffer."""
        text = "\n".join(buf).strip()
        buf.clear()
        if not text:
            return
        head = breadcrumb()
        for piece in _split_by_size(text, max_chars):
            idx = len(chunks)
            chunks.append({
                "id": f"{base}-{idx}",
                "index": idx,
                "heading": head,
                "text": piece,
                "chars": len(piece),
                "source": source,
            })

    def opening_fence(stripped: str) -> tuple[str, int]:
        """Return ``(char, run_length)`` if *stripped* opens a code fence, else ``("", 0)``."""
        for char in ("`", "~"):
            if stripped.startswith(char * 3):
                run = len(stripped) - len(stripped.lstrip(char))
                # A backtick fence's info string may not contain backticks;
                # such a line is inline code, not a fence.
                if char == "`" and "`" in stripped[run:]:
                    return "", 0
                return char, run
        return "", 0

    fence_char, fence_len = "", 0  # non-empty fence_char => inside a code block
    for line in lines:
        stripped = line.strip()
        if fence_char:
            # Inside a code block only a closing fence matters: same character,
            # at least as long as the opener, and nothing else on the line.
            if len(stripped) >= fence_len and stripped == fence_char * len(stripped):
                fence_char, fence_len = "", 0
            buf.append(line)
            continue
        fence_char, fence_len = opening_fence(stripped)
        if not fence_char:
            match = _HEADING.match(stripped)
            if match:
                flush()  # close the previous section under its own breadcrumb
                level = len(match.group(1))
                # Drop siblings and deeper headings before pushing the new one.
                while stack and stack[-1][0] >= level:
                    stack.pop()
                stack.append((level, match.group(2)))
        buf.append(line)
    flush()
    return chunks