60 lines
1.9 KiB
Python
60 lines
1.9 KiB
Python
"""Optional fully-offline parsing backend for born-digital PDFs.

Our single biggest honest gap is being cloud-only. ``--engine local`` parses a
PDF **entirely offline** with the optional, lightweight ``pymupdf4llm`` library
(no GPU, no cloud, no upload caps) — ideal for confidential or born-digital PDFs
where MinerU's cloud VLM is overkill. Scanned/complex docs still want the cloud
engine, so ``--engine auto`` only uses local when the PDF has real text.

    pip install "mineru-skill[local]"  # i.e. pip install pymupdf4llm
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
# Install hint used as the LocalEngineError message when pymupdf4llm is missing.
_HINT = " ".join(
    (
        "--engine local needs pymupdf4llm — pip install 'mineru-skill[local]'",
        "(i.e. pip install pymupdf4llm)",
    )
)
|
|
|
|
|
|
class LocalEngineError(Exception):
    """Signal that a requested local (offline) parse could not be carried out."""
|
|
|
|
|
|
def available() -> bool:
    """Tell whether the optional ``pymupdf4llm`` package can be imported.

    Returns:
        ``True`` if ``import pymupdf4llm`` succeeds, ``False`` if it raises
        ``ImportError``.
    """
    try:
        import pymupdf4llm  # noqa: F401
    except ImportError:
        return False
    else:
        return True
|
|
|
|
|
|
def is_born_digital(path, min_chars: int = 200) -> bool:
    """Report whether a PDF has enough extractable text for local parsing.

    The text of each page is stripped of surrounding whitespace and its length
    added to a running total; scanning stops as soon as the total reaches
    ``min_chars``.

    Args:
        path: Location of the PDF; converted with ``str()`` before opening.
        min_chars: Minimum number of stripped text characters required.

    Returns:
        ``True`` once the accumulated text length reaches ``min_chars``;
        ``False`` if it never does, or if ``pymupdf`` cannot be imported.
    """
    try:
        import pymupdf
    except ImportError:
        return False

    total = 0
    # Open via the context manager so the document handle is released on every
    # exit path, including the early return below and any exception raised
    # while extracting text.
    with pymupdf.open(str(path)) as doc:
        for page in doc:
            total += len(page.get_text().strip())
            if total >= min_chars:
                return True
    # Also covers documents with no pages (and non-positive thresholds).
    return total >= min_chars
|
|
|
|
|
|
def parse_local(path, output_dir=None) -> str:
    """Convert a PDF to Markdown with ``pymupdf4llm`` and return the Markdown text.

    Args:
        path: Location of the PDF; converted with ``str()`` before parsing.
        output_dir: Optional directory. When given, ``<output_dir>/images`` is
            created if needed and passed to ``pymupdf4llm.to_markdown`` as
            ``image_path`` together with ``write_images=True``.

    Returns:
        The value returned by ``pymupdf4llm.to_markdown``.

    Raises:
        LocalEngineError: If ``pymupdf4llm`` cannot be imported.
    """
    try:
        import pymupdf4llm
    except ImportError as exc:
        raise LocalEngineError(_HINT) from exc

    # Image options are only supplied when the caller provides an output directory.
    image_options = {}
    if output_dir is not None:
        image_dir = Path(output_dir) / "images"
        image_dir.mkdir(parents=True, exist_ok=True)
        image_options = {"write_images": True, "image_path": str(image_dir)}

    return pymupdf4llm.to_markdown(str(path), **image_options)
|