65 lines
2.1 KiB
Python
65 lines
2.1 KiB
Python
"""Split oversized PDFs into cap-sized parts so they clear the MinerU API limits.
|
|
|
|
The MinerU cloud caps uploads at 20 pages (free Agent API) / 200 pages
(Standard API). ``--split`` slices a larger PDF into parts locally, each part
is parsed, and the resulting Markdown is merged back — so we are no longer
bound by those page caps (the same trick mineru-converter uses). Uses the
optional ``pypdf`` library, lazily imported, so the core stays zero-dependency.
|
|
|
|
pip install "mineru-skill[split]" # i.e. pip install pypdf
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
class SplitError(Exception):
    """Signal that a requested PDF split could not be carried out.

    Raised by this module when ``pypdf`` is unavailable or when the
    requested part size is invalid.
    """
|
|
|
|
|
|
def _load_pypdf():
    """Import and return the optional ``pypdf`` module.

    The import happens at call time so that merely importing this module
    never requires ``pypdf`` to be installed.

    Raises:
        SplitError: if ``pypdf`` cannot be imported; the original
            ``ImportError`` is attached as the cause.
    """
    try:
        import pypdf
    except ImportError as exc:
        message = (
            "--split needs the pypdf library — pip install 'mineru-skill[split]' "
            "(i.e. pip install pypdf)"
        )
        raise SplitError(message) from exc
    return pypdf
|
|
|
|
|
|
def pdf_page_count(path) -> int:
    """Count the pages of the local PDF at ``path``.

    ``path`` is converted with ``str()`` before being handed to
    ``pypdf.PdfReader``. Requires ``pypdf``; ``_load_pypdf`` raises
    ``SplitError`` when it is not installed.
    """
    pdf_lib = _load_pypdf()
    reader = pdf_lib.PdfReader(str(path))
    return len(reader.pages)
|
|
|
|
|
|
def split_pdf(path, max_pages: int, out_dir) -> list:
    """Cut the PDF at ``path`` into consecutive parts of at most ``max_pages`` pages.

    Parts are written to ``out_dir`` (created, including parents, when
    missing) as ``<stem>__partNNN.pdf``, numbered from 001 in page order.

    Returns:
        A list of ``Path`` objects, one per part, in page order. When the PDF
        already has ``max_pages`` pages or fewer, nothing is written and the
        list holds only ``Path(path)``.

    Raises:
        SplitError: if ``max_pages`` is below 1, or (via ``_load_pypdf``)
            if ``pypdf`` is not installed.
    """
    if max_pages < 1:
        raise SplitError("max_pages must be >= 1")

    pdf_lib = _load_pypdf()
    reader = pdf_lib.PdfReader(str(path))
    total = len(reader.pages)
    source = Path(path)

    # Already within the cap: hand back the original file untouched.
    if total <= max_pages:
        return [source]

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    stem = source.stem

    parts = []
    for start in range(0, total, max_pages):
        # The last window may be shorter than max_pages.
        stop = min(start + max_pages, total)

        writer = pdf_lib.PdfWriter()
        for page_number in range(start, stop):
            writer.add_page(reader.pages[page_number])

        part_index = len(parts) + 1  # 1-based sequence number
        part_path = out_dir / f"{stem}__part{part_index:03d}.pdf"
        with part_path.open("wb") as handle:
            writer.write(handle)
        parts.append(part_path)

    return parts
|