"""Split oversized PDFs into cap-sized parts so they clear the MinerU API limits.

The MinerU cloud caps at 20 pages (free Agent API) / 200 pages (Standard API).
``--split`` slices a larger PDF into parts locally, each is parsed, and the
Markdown is merged back — so we are no longer bound by those page caps (the
same trick mineru-converter uses).

Uses the optional ``pypdf`` library, lazily imported, so the core stays
zero-dependency.

    pip install "mineru-skill[split]"   # i.e. pip install pypdf
"""

from __future__ import annotations

from pathlib import Path


class SplitError(Exception):
    """Raised when splitting is requested but cannot be performed."""


def _load_pypdf():
    """Import ``pypdf`` on demand and return the module object.

    The import lives inside this function so that importing this module never
    requires ``pypdf``; only callers that actually split or count pages do.

    Raises:
        SplitError: if ``pypdf`` is not installed. The original
            ``ImportError`` is chained as the cause.
    """
    try:
        import pypdf
    except ImportError as exc:
        raise SplitError(
            "--split needs the pypdf library — pip install 'mineru-skill[split]' "
            "(i.e. pip install pypdf)"
        ) from exc
    return pypdf


def pdf_page_count(path: str | Path) -> int:
    """Return the page count of a local PDF (requires pypdf).

    Args:
        path: Location of the PDF; it is converted with ``str()`` before being
            handed to ``pypdf.PdfReader``.

    Raises:
        SplitError: if ``pypdf`` is not installed.

    Errors raised by pypdf while opening or parsing the file are not caught
    here and propagate to the caller.
    """
    pypdf = _load_pypdf()
    reader = pypdf.PdfReader(str(path))
    return len(reader.pages)


def split_pdf(path: str | Path, max_pages: int, out_dir: str | Path) -> list[Path]:
    """Slice ``path`` into ``max_pages``-page parts under ``out_dir``.

    Parts are written as ``<stem>__partNNN.pdf`` (``NNN`` is 1-based and
    zero-padded to three digits), in page order. The last part holds whatever
    pages remain and may be shorter than ``max_pages``.

    Args:
        path: The PDF to split.
        max_pages: Maximum number of pages per part; must be at least 1.
        out_dir: Directory that receives the parts. It is created (including
            parents) only when a split is actually needed.

    Returns:
        The list of part paths, or a single-element list pointing at the
        original file if it already fits within ``max_pages``. In that case
        nothing is written and ``out_dir`` is not created.

    Raises:
        SplitError: if ``max_pages`` is below 1, or if ``pypdf`` is not
            installed.

    Errors raised by pypdf or by the filesystem while reading or writing are
    not caught here and propagate to the caller.
    """
    # Validate first: a bad argument is reported even when pypdf is missing.
    if max_pages < 1:
        raise SplitError("max_pages must be >= 1")

    pypdf = _load_pypdf()
    source = Path(path)
    reader = pypdf.PdfReader(str(path))
    total = len(reader.pages)

    # Already within the cap: hand back the original file untouched.
    if total <= max_pages:
        return [source]

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    stem = source.stem

    parts: list[Path] = []
    for part_index, start in enumerate(range(0, total, max_pages), start=1):
        # Clamp the final window so it never runs past the last page.
        stop = min(start + max_pages, total)
        writer = pypdf.PdfWriter()
        for page_number in range(start, stop):
            writer.add_page(reader.pages[page_number])

        part_path = out_dir / f"{stem}__part{part_index:03d}.pdf"
        with open(part_path, "wb") as handle:
            writer.write(handle)
        parts.append(part_path)
    return parts