qwen_agent/skills/developing/mineru/scripts/splitter.py
2026-06-05 14:35:17 +08:00

65 lines
2.1 KiB
Python

"""Split oversized PDFs into cap-sized parts so they clear the MinerU API limits.
The MinerU cloud caps at 20 pages (free Agent API) / 200 pages (Standard API).
``--split`` slices a larger PDF into parts locally, each is parsed, and the
Markdown is merged back — so we are no longer bound by those page caps (the same
trick mineru-converter uses). Uses the optional ``pypdf`` library, lazily
imported, so the core stays zero-dependency.
pip install "mineru-skill[split]" # i.e. pip install pypdf
"""
from __future__ import annotations
from pathlib import Path
class SplitError(Exception):
    """Raised when splitting is requested but cannot be performed.

    Within this module it signals either that the optional ``pypdf``
    dependency is not installed or that ``max_pages`` is smaller than 1.
    """
def _load_pypdf():
    """Import and return the optional ``pypdf`` module.

    The import happens at call time rather than at module import time, so
    importing this module does not itself require ``pypdf``.

    Returns:
        The imported ``pypdf`` module object.

    Raises:
        SplitError: If ``pypdf`` cannot be imported. The underlying
            ``ImportError`` is chained as the cause.
    """
    # Keep the try body to the one statement that can raise ImportError.
    try:
        import pypdf
    except ImportError as exc:
        raise SplitError(
            "--split needs the pypdf library — pip install 'mineru-skill[split]' "
            "(i.e. pip install pypdf)"
        ) from exc
    return pypdf
def pdf_page_count(path: str | Path) -> int:
    """Return the page count of a local PDF (requires pypdf).

    Args:
        path: Location of the PDF file. It is converted with ``str()``
            before being handed to ``pypdf.PdfReader``.

    Returns:
        The number of pages reported by ``pypdf`` for the file.

    Raises:
        SplitError: If ``pypdf`` is not installed.
    """
    pypdf = _load_pypdf()
    reader = pypdf.PdfReader(str(path))
    return len(reader.pages)
def split_pdf(
    path: str | Path,
    max_pages: int,
    out_dir: str | Path,
) -> list[Path]:
    """Slice ``path`` into ``max_pages``-page parts under ``out_dir``.

    Parts are written as ``<stem>__partNNN.pdf`` (1-based, zero-padded to
    three digits), where ``<stem>`` is the source file name without its
    suffix. The last part holds whatever pages remain.

    Args:
        path: The PDF to split.
        max_pages: Maximum number of pages per part; must be at least 1.
        out_dir: Directory that receives the parts. It is created, including
            missing parents, only when a split is actually needed.

    Returns:
        The part paths in page order. If the PDF already has ``max_pages``
        pages or fewer, nothing is written and the result is a
        single-element list pointing at the original file.

    Raises:
        SplitError: If ``max_pages`` is below 1 or ``pypdf`` is not installed.
    """
    # Validate before loading pypdf so a bad argument is reported first.
    if max_pages < 1:
        raise SplitError("max_pages must be >= 1")

    pypdf = _load_pypdf()
    reader = pypdf.PdfReader(str(path))
    total = len(reader.pages)
    if total <= max_pages:
        # Already within the cap: return the original file, write nothing.
        return [Path(path)]

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    stem = Path(path).stem

    parts: list[Path] = []
    for part_index, start in enumerate(range(0, total, max_pages), start=1):
        writer = pypdf.PdfWriter()
        # Clamp the end so the final part takes only the remaining pages.
        stop = min(start + max_pages, total)
        for page_number in range(start, stop):
            writer.add_page(reader.pages[page_number])

        part_path = out_dir / f"{stem}__part{part_index:03d}.pdf"
        with open(part_path, "wb") as handle:
            writer.write(handle)
        parts.append(part_path)
    return parts