qwen_agent/skills/developing/mineru/scripts/local_engine.py
2026-06-05 14:35:17 +08:00

60 lines
1.9 KiB
Python

"""Optional fully-offline parsing backend for born-digital PDFs.

Our single biggest honest gap is being cloud-only. ``--engine local`` parses a
PDF **entirely offline** with the optional, lightweight ``pymupdf4llm`` library
(no GPU, no cloud, no upload caps) — ideal for confidential or born-digital PDFs
where MinerU's cloud VLM is overkill. Scanned/complex docs still want the cloud
engine, so ``--engine auto`` only uses local when the PDF has real text.

Install the optional dependency with::

    pip install "mineru-skill[local]"  # i.e. pip install pymupdf4llm
"""
from __future__ import annotations
from pathlib import Path
_HINT = (
"--engine local needs pymupdf4llm — pip install 'mineru-skill[local]' "
"(i.e. pip install pymupdf4llm)"
)
class LocalEngineError(Exception):
    """Signal that a requested local (offline) parse could not be carried out."""
def available() -> bool:
    """Report whether the optional ``pymupdf4llm`` package can be imported.

    Returns:
        ``True`` when ``import pymupdf4llm`` succeeds, ``False`` when it
        raises ``ImportError``.
    """
    try:
        import pymupdf4llm  # noqa: F401
    except ImportError:
        return False
    else:
        return True
def is_born_digital(path, min_chars: int = 200) -> bool:
    """Return True if the PDF has extractable text (so local parsing is appropriate).

    Pages are scanned in order and the whitespace-stripped length of each
    page's text is accumulated; scanning stops as soon as the running total
    reaches ``min_chars``.

    Args:
        path: Location of the PDF; converted with ``str()`` before opening.
        min_chars: Minimum total number of extracted characters required to
            treat the document as born-digital.

    Returns:
        ``True`` once at least ``min_chars`` characters have been found,
        otherwise ``False``. Also ``False`` when ``pymupdf`` cannot be
        imported.

    Note:
        Errors raised while opening or reading the document are not caught
        here and propagate to the caller.
    """
    try:
        import pymupdf
    except ImportError:
        return False

    total = 0
    # Use the document as a context manager so it is closed on every exit
    # path, including the early return and any exception raised while reading
    # a page (previously the handle was never closed).
    with pymupdf.open(str(path)) as doc:
        for page in doc:
            total += len(page.get_text().strip())
            if total >= min_chars:
                return True
    # Reached when the pages ran out first; still compares so that a
    # non-positive ``min_chars`` behaves as before.
    return total >= min_chars
def parse_local(path, output_dir=None) -> str:
    """Convert a PDF to Markdown with ``pymupdf4llm``, without any cloud call.

    Args:
        path: Location of the PDF; passed to ``pymupdf4llm`` as ``str(path)``.
        output_dir: Optional directory. When given, an ``images``
            subdirectory is created beneath it (parents included) and
            ``pymupdf4llm`` is asked to write images there.

    Returns:
        The value returned by ``pymupdf4llm.to_markdown``.

    Raises:
        LocalEngineError: If ``pymupdf4llm`` cannot be imported.
    """
    try:
        import pymupdf4llm
    except ImportError as exc:
        raise LocalEngineError(_HINT) from exc

    # Text-only conversion when the caller gave no place to write images.
    if output_dir is None:
        return pymupdf4llm.to_markdown(str(path))

    image_dir = Path(output_dir) / "images"
    image_dir.mkdir(parents=True, exist_ok=True)
    return pymupdf4llm.to_markdown(
        str(path),
        write_images=True,
        image_path=str(image_dir),
    )